In [1]:
import datetime, measures, mmi_txt_to_cui, umls_tables_processing
import os

from gensim.models import KeyedVectors, Word2Vec
from gensim.test.utils import datapath

### Constants

In [2]:
PATH_EMBEDDINGS = './Embeddings'

### Extracting all the labels for each CUI of the UMLS vocabulary

In [3]:
dict_conso = umls_tables_processing.cui_strings()

0:00:53


In [4]:
len(dict_conso)

3772450

In [5]:
dict_conso['C1963065']

['Apnea, CTCAE', 'Apnea']

### Extracting all the CUIs related to COPD (only one hop) 

In [6]:
copd_cuis = umls_tables_processing.related_cuis_concept()

256
0:03:32


In [7]:
len(copd_cuis)

256

## Building  ***seed_rel***

In [8]:
seed_rel, _ = umls_tables_processing.extracting_strings(copd_cuis, dict_conso)

0:00:00


## Building ***seed_paper***

In [9]:
paper_cuis = mmi_txt_to_cui.mmi_to_cui()

0:00:00


In [10]:
seed_paper, _ = umls_tables_processing.extracting_strings(paper_cuis, dict_conso)

0:00:00


## Building ***seed_paper_lite***

In [11]:
paper_lite_cuis, oov_lite = mmi_txt_to_cui.mmi_lite_freetext()

0:02:37


In [12]:
paper_lite_cuis = list(set(paper_lite_cuis))

In [13]:
oov_lite

['[84–88].\n', '[92, 93]. \n']

In [14]:
seed_paper_lite, _ = umls_tables_processing.extracting_strings(paper_lite_cuis, dict_conso)

0:00:00


## Loading Models

In [15]:
cuis = ('/cuis/', [f.name for f in os.scandir(PATH_EMBEDDINGS+'/cuis') if f.is_file()])
words = ('/words/', [f.name for f in os.scandir(PATH_EMBEDDINGS+'/words') if f.is_file()])
embeddings = [cuis, words]
embeddings

[('/cuis/',
  ['claims_cuis_hs_300.txt.gz',
   'stanford_cuis_svd_300.txt.gz',
   'cui2vec_pretrained.txt',
   'DeVine_etal_200.txt']),
 ('/words/',
  ['PMC-w2v.bin',
   'Health_2.5mreviews.s200.w10.n5.v15.cbow.bin',
   'PubMed-and-PMC-w2v.bin',
   'GoogleNews-vectors-negative300.bin',
   'PubMed-w2v.bin',
   'tweetsvec.txt',
   'wikipedia-pubmed-and-PMC-w2v.bin'])]

In [16]:
ks = [5, 10, 20, 30, 40]

In [17]:
seeds = [('seed_rel', seed_rel), ('seed_paper', seed_paper), ('seed_paper_lite', seed_paper_lite)]

In [19]:
a = datetime.datetime.now().replace(microsecond=0)
big_g = {}

for k in ks:
    big_g[k] = {}
    for type_emb in embeddings:
        for emb in type_emb[1]:
            model = KeyedVectors.load_word2vec_format(PATH_EMBEDDINGS+type_emb[0]+emb, binary=emb.endswith('.bin'))
            name = os.path.splitext(emb)[0]
            big_g[k][name] = {}
            for seed in seeds:
                if type_emb[0]=='/cuis/':
                    d = measures.occurred_concept(model, seed[1].keys(), k_most_similar=k)
                elif type_emb[0]=='/words/':
                    d = measures.occurred_labels(model, seed[1], k_most_similar=k)
                big_g[k][name][seed[0]] = [measures.pos_dcg(d), measures.neg_dcg(d), measures.percentage_dcg(d), measures.oov(d)]
print(datetime.datetime.now().replace(microsecond=0)-a)        

256
399
157
256
399
157


  return (m / dist).astype(REAL)


256
399
157
256
399
157
0:00:51
0:01:57
0:00:36
0:00:01
0:00:02
0:00:00
0:01:30
0:03:18
0:00:58
0:01:10
0:03:14
0:01:03
0:00:48
0:01:46
0:00:32
0:00:01
0:00:00
0:00:00
0:02:06
0:04:50
0:01:28
256
399
157
256
399
157
256
399
157
256
399
157
0:00:50
0:02:00
0:00:38
0:00:01
0:00:02
0:00:00
0:01:33
0:03:24
0:01:03
0:01:11
0:03:21
0:01:03
0:00:50
0:01:54
0:00:35
0:00:01
0:00:00
0:00:00
0:02:13
0:04:52
0:01:31
256
399
157
256
399
157
256
399
157
256
399
157
0:00:54
0:02:05
0:00:39
0:00:01
0:00:02
0:00:01
0:01:33
0:03:19
0:01:01
0:01:09
0:03:15
0:01:02
0:00:52
0:02:00
0:00:39
0:00:01
0:00:01
0:00:00
0:02:13
0:04:55
0:01:30
256
399
157
256
399
157
256
399
157
256
399
157
0:00:56
0:02:04
0:00:39
0:00:02
0:00:03
0:00:00
0:01:34
0:03:31
0:01:04
0:01:18
0:03:29
0:01:06
0:00:50
0:01:56
0:00:35
0:00:00
0:00:01
0:00:00
0:02:09
0:04:57
0:01:32
256
399
157
256
399
157
256
399
157
256
399
157
0:00:51
0:02:05
0:00:39
0:00:01
0:00:03
0:00:00
0:01:32
0:03:28
0:01:02
0:01:17
0:03:27
0:01:05
0:00:49
0:01:56


In [21]:
big_g

{5: {'claims_cuis_hs_300.txt': {'seed_rel': [14.501806801297391,
    740.3037276318257,
    0.015625,
    209],
   'seed_paper': [0, 1176.4351884328785, 0.0, 389],
   'seed_paper_lite': [0.5, 462.4080816640675, 0.0012738853503184713, 154]},
  'stanford_cuis_svd_300.txt': {'seed_rel': [0, 754.8055344331227, 0.0, 256],
   'seed_paper': [0, 1176.4351884328785, 0.0, 399],
   'seed_paper_lite': [0, 462.9080816640675, 0.0, 157]},
  'cui2vec_pretrained': {'seed_rel': [33.8234912681452,
    720.9820431649783,
    0.040625,
    120],
   'seed_paper': [65.16045246531382,
    1111.2747359675607,
    0.05463659147869674,
    214],
   'seed_paper_lite': [7.214447603066719,
    455.69363406100064,
    0.016560509554140127,
    102]},
  'DeVine_etal_200': {'seed_rel': [32.88509757979005,
    721.9204368533334,
    0.040625,
    136],
   'seed_paper': [62.47924848325011,
    1113.9559399496247,
    0.0481203007518797,
    226],
   'seed_paper_lite': [9.279642067948915,
    453.6284395961184,
    0.014

In [26]:
import utils
utils.save_txt_dicts(big_g, 'big_g.txt')