In [1]:
import datetime, measures, mmi_txt_to_cui, umls_tables_processing
import os

from gensim.models import KeyedVectors, Word2Vec
from gensim.test.utils import datapath

### Constants

In [2]:
# Root directory of the pretrained embedding files; the file-listing cell further down
# scans its 'cuis' and 'words' subfolders.
PATH_EMBEDDINGS = './Embeddings'

### Extracting all the labels for each CUI of the UMLS vocabulary

In [3]:
# CUI -> list-of-labels lookup, passed to every extracting_strings call below.
# The recorded output of this cell shows 0:07:44 (presumably the elapsed time), so expect it to be slow.
dict_conso = umls_tables_processing.cui_strings()

0:07:44


In [4]:
len(dict_conso)

3772450

In [5]:
dict_conso['C1963065']

['Apnea, CTCAE', 'Apnea']

## Building ***seed_rel***

### Extracting all the CUIs related to COPD (one hop away only)

In [6]:
# Called without arguments; per the section heading the target concept is COPD.
copd_dict = umls_tables_processing.concepts_related_to_concept()
# Unpacking a mapping yields its keys, i.e. the related CUIs.
copd_cuis = [*copd_dict]

256
0:04:07


### Check the semantic types contained in *seed_rel*

In [7]:
# Disabled semantic-type check for seed_rel, kept for reference together with its recorded output.
# NOTE(review): the 0:38:45 in that output is presumably the elapsed time, which would explain
# why the cell is commented out — confirm before re-enabling.
#eval_rel = umls_tables_processing.extracting_stys(copd_cuis)
#t = mmi_txt_to_cui.check_sty_mmi(mmi_txt_to_cui.convert_sty_stymmi(eval_rel))
#print({k: j for k,j in list(t.items())[:5]})

0:38:45
{'dsyn': (105, 'Disease or Syndrome'), 'ftcn': (25, 'Functional Concept'), 'patf': (12, 'Pathologic Function'), 'hlca': (12, 'Health Care Activity'), 'topp': (11, 'Therapeutic or Preventive Procedure')}


In [8]:
# extracting_strings returns two values; only the first is kept as the seed.
# NOTE(review): later cells call .keys() and len() on the seed, so it is presumably a
# CUI-keyed mapping — confirm against umls_tables_processing.
seed_rel, _ = umls_tables_processing.extracting_strings(copd_cuis, dict_conso)

0:00:00


## Building ***seed_paper***

In [9]:
# Each entry of paper_cuis is indexed with [0] further down to obtain its CUI;
# sty=True presumably attaches the semantic type to each entry — TODO confirm.
paper_cuis = mmi_txt_to_cui.mmi_to_cui(sty = True)

0:00:00


### Check the semantic types contained in *seed_paper*

In [10]:
# Tally the semantic types of seed_paper and preview only the first five entries.
a = mmi_txt_to_cui.check_sty_mmi(paper_cuis)
print(dict(list(a.items())[:5]))

{'qlco': (49, 'Qualitative Concept'), 'fndg': (38, 'Finding'), 'inpr': (35, 'Intellectual Product'), 'qnco': (34, 'Quantitative Concept'), 'ftcn': (30, 'Functional Concept')}


In [11]:
# Only the first field of each paper_cuis entry is passed on as the CUI;
# the second return value is discarded.
seed_paper, _ = umls_tables_processing.extracting_strings(
    [entry[0] for entry in paper_cuis],
    dict_conso,
)

0:00:00


## Building ***seed_paper_lite***

In [12]:
# Two values come back: the entries used to build seed_paper_lite, and oov_lite,
# which is displayed in a later cell.
paper_lite_cuis, oov_lite = mmi_txt_to_cui.mmi_lite_freetext(sty = True)

0:02:39


In [13]:
# De-duplicate the first field of every entry (the CUI); the resulting order is whatever
# the set happens to yield.
paper_lite_cuis_ = list({entry[0] for entry in paper_lite_cuis})

### Check the semantic types contained in *seed_paper_lite*

In [14]:
# Tally the semantic types of seed_paper_lite and preview only the first five entries.
c = mmi_txt_to_cui.check_sty_mmi(paper_lite_cuis)
print(dict(list(c.items())[:5]))

{'inpr': (26, 'Intellectual Product'), 'qlco': (21, 'Qualitative Concept'), 'fndg': (20, 'Finding'), 'idcn': (16, 'Idea or Concept'), 'topp': (13, 'Therapeutic or Preventive Procedure')}


In [15]:
oov_lite

['[84–88].\n', '[92, 93]. \n']

In [16]:
# Built from the de-duplicated CUI list; the second return value is discarded.
seed_paper_lite, _ = umls_tables_processing.extracting_strings(paper_lite_cuis_, dict_conso)

0:00:00


## Loading Models

### Importing the embeddings file names

In [17]:
def _embedding_files(subdir):
    """Return the names of the regular files in PATH_EMBEDDINGS/<subdir>, skipping README.md."""
    # The context manager closes the directory iterator deterministically instead of
    # leaving it to the garbage collector.
    with os.scandir(PATH_EMBEDDINGS + '/' + subdir) as entries:
        return [f.name for f in entries if f.is_file() and f.name != 'README.md']


# Each item pairs a sub-path ('/cuis/' or '/words/') with the file names found there.
# The evaluation loop uses the sub-path both to build the file path and to pick the
# evaluation branch, so the exact strings must not change.
cuis = ('/cuis/', _embedding_files('cuis'))
words = ('/words/', _embedding_files('words'))
embeddings = [cuis, words]
embeddings

[('/cuis/',
  ['claims_cuis_hs_300.txt.gz',
   'stanford_cuis_svd_300.txt.gz',
   'cui2vec_pretrained.txt',
   'DeVine_etal_200.txt']),
 ('/words/',
  ['PMC-w2v.bin',
   'Health_2.5mreviews.s200.w10.n5.v15.cbow.bin',
   'PubMed-and-PMC-w2v.bin',
   'GoogleNews-vectors-negative300.bin',
   'PubMed-w2v.bin',
   'tweetsvec.txt',
   'wikipedia-pubmed-and-PMC-w2v.bin'])]

### Creating a list of k values to test

In [18]:
# Values passed as k_most_similar in the evaluation loop; one results table is built per value.
ks = [5, 10, 20, 30, 40]
# Shorter alternative, left commented out (presumably for quick test runs).
#ks = [5, 10]

### Creating a list of seed tuples

In [19]:
# (label, seed) pairs: the label becomes the key in the results dictionary,
# the seed itself is what the evaluation loop passes to the measures.
seeds = [('seed_rel', seed_rel), ('seed_paper', seed_paper), ('seed_paper_lite', seed_paper_lite)]

### Building a dictionary of results for every embedding tested, across all seeds and k values

In [20]:
# Evaluate every embedding on every (k, seed) combination and collect the scores in
#   big_g[embedding name][k][seed label] =
#       [pos_dcg, neg_dcg, perc_dcg, oov, seed size, new_seed]
# where new_seed is [] for CUI embeddings and the second value returned by
# measures.occurred_labels for word embeddings.
a = datetime.datetime.now().replace(microsecond=0)  # start time; elapsed time is printed at the end
big_g = {}

for type_emb in embeddings:
    for emb in type_emb[1]:
        # Drop the reference to the previous model before loading the next one, so that
        # two embedding matrices are never held in memory at the same time.
        model = None
        # Files ending in '.bin' are read as binary word2vec files, everything else as text.
        model = KeyedVectors.load_word2vec_format(PATH_EMBEDDINGS+type_emb[0]+emb, binary=emb.endswith('.bin'))
        # Only the last extension is stripped, so 'x.txt.gz' is reported as 'x.txt'.
        name = os.path.splitext(emb)[0]
        big_g[name] = {}
        print('\n\n The name of embedding is: %s\n' % name)
        for k in ks:
            print('\n k_value: %s\n' % k)
            big_g[name][k] = {}
            for seed in seeds:
                if type_emb[0]=='/cuis/':
                    # CUI embeddings are queried with the seed's keys.
                    d = measures.occurred_concept(model, seed[1].keys(), k_most_similar=k)
                    new_seed = []
                elif type_emb[0]=='/words/':
                    # Word embeddings are queried with the whole seed.
                    d, new_seed = measures.occurred_labels(model, seed[1], k_most_similar=k)
                else:
                    # Fail loudly instead of silently scoring a stale `d` from a previous iteration.
                    raise ValueError('Unknown embedding type: %s' % type_emb[0])
                # Compute each measure once and reuse it for both the stored row and the log line.
                pos = measures.pos_dcg(d, normalization = True)
                neg = measures.neg_dcg(d, normalization = True)
                perc = measures.percentage_dcg(d)
                n_oov = measures.oov(d)
                n_seed = len(seed[1])
                big_g[name][k][seed[0]] = [pos, neg, perc, n_oov, n_seed, new_seed]
                print('{:s}: pos_dcg: {:.4f}, neg_dcg: {:.4f}, perc_dcg: {:.4f}, oov: {:d}, #seed: {:d}\n'.
                      format(seed[0], pos, neg, perc, n_oov, n_seed))
print(datetime.datetime.now().replace(microsecond=0)-a)



 The name of embedding is: claims_cuis_hs_300.txt


 k_value: 5

seed_rel: pos_dcg: 0.0566, neg_dcg: 2.8918, perc_dcg: 0.0781, oov: 209, #seed: 256

seed_paper: pos_dcg: 0.0000, neg_dcg: 2.9485, perc_dcg: 0.0000, oov: 389, #seed: 399

seed_paper_lite: pos_dcg: 0.0032, neg_dcg: 2.9453, perc_dcg: 0.0064, oov: 154, #seed: 157


 k_value: 10

seed_rel: pos_dcg: 0.0641, neg_dcg: 4.4794, perc_dcg: 0.1016, oov: 209, #seed: 256

seed_paper: pos_dcg: 0.0000, neg_dcg: 4.5436, perc_dcg: 0.0000, oov: 389, #seed: 399

seed_paper_lite: pos_dcg: 0.0032, neg_dcg: 4.5404, perc_dcg: 0.0064, oov: 154, #seed: 157


 k_value: 20

seed_rel: pos_dcg: 0.0804, neg_dcg: 6.9599, perc_dcg: 0.1680, oov: 209, #seed: 256

seed_paper: pos_dcg: 0.0012, neg_dcg: 7.0391, perc_dcg: 0.0050, oov: 389, #seed: 399

seed_paper_lite: pos_dcg: 0.0032, neg_dcg: 7.0371, perc_dcg: 0.0064, oov: 154, #seed: 157


 k_value: 30

seed_rel: pos_dcg: 0.0969, neg_dcg: 9.0647, perc_dcg: 0.2461, oov: 209, #seed: 256

seed_paper: pos_dcg: 

  return (m / dist).astype(REAL)


seed_rel: pos_dcg: 0.0000, neg_dcg: 2.9485, perc_dcg: 0.0000, oov: 256, #seed: 256

seed_paper: pos_dcg: 0.0000, neg_dcg: 2.9485, perc_dcg: 0.0000, oov: 399, #seed: 399

seed_paper_lite: pos_dcg: 0.0000, neg_dcg: 2.9485, perc_dcg: 0.0000, oov: 157, #seed: 157


 k_value: 10

seed_rel: pos_dcg: 0.0000, neg_dcg: 4.5436, perc_dcg: 0.0000, oov: 256, #seed: 256

seed_paper: pos_dcg: 0.0000, neg_dcg: 4.5436, perc_dcg: 0.0000, oov: 399, #seed: 399

seed_paper_lite: pos_dcg: 0.0000, neg_dcg: 4.5436, perc_dcg: 0.0000, oov: 157, #seed: 157


 k_value: 20

seed_rel: pos_dcg: 0.0000, neg_dcg: 7.0403, perc_dcg: 0.0000, oov: 256, #seed: 256

seed_paper: pos_dcg: 0.0000, neg_dcg: 7.0403, perc_dcg: 0.0000, oov: 399, #seed: 399

seed_paper_lite: pos_dcg: 0.0000, neg_dcg: 7.0403, perc_dcg: 0.0000, oov: 157, #seed: 157


 k_value: 30

seed_rel: pos_dcg: 0.0000, neg_dcg: 9.1616, perc_dcg: 0.0000, oov: 256, #seed: 256

seed_paper: pos_dcg: 0.0000, neg_dcg: 9.1616, perc_dcg: 0.0000, oov: 399, #seed: 399

se

0:01:23
seed_rel: pos_dcg: 0.5444, neg_dcg: 8.6172, perc_dcg: 1.2617, oov: 136, #seed: 256

0:03:25
seed_paper: pos_dcg: 1.2725, neg_dcg: 7.8890, perc_dcg: 2.9424, oov: 46, #seed: 399

0:00:59
seed_paper_lite: pos_dcg: 0.8521, neg_dcg: 8.3095, perc_dcg: 1.8408, oov: 18, #seed: 157


 k_value: 40

0:01:21
seed_rel: pos_dcg: 0.5734, neg_dcg: 10.5176, perc_dcg: 1.4141, oov: 136, #seed: 256

0:03:27
seed_paper: pos_dcg: 1.3523, neg_dcg: 9.7387, perc_dcg: 3.3759, oov: 46, #seed: 399

0:01:03
seed_paper_lite: pos_dcg: 0.8793, neg_dcg: 10.2117, perc_dcg: 1.9809, oov: 18, #seed: 157



 The name of embedding is: GoogleNews-vectors-negative300


 k_value: 5

0:01:12
seed_rel: pos_dcg: 0.2514, neg_dcg: 2.6970, perc_dcg: 0.3516, oov: 146, #seed: 256

0:03:02
seed_paper: pos_dcg: 0.8296, neg_dcg: 2.1189, perc_dcg: 1.2080, oov: 62, #seed: 399

0:00:58
seed_paper_lite: pos_dcg: 0.6567, neg_dcg: 2.2918, perc_dcg: 0.9554, oov: 20, #seed: 157


 k_value: 10

0:00:56
seed_rel: pos_dcg: 0.2893, neg_dcg: 

In [21]:
import pandas as pd
import numpy as np

# Labels of the five values shown per seed, in the order they are stored in big_g.
METRIC_LABELS = ['pos_dcg', 'neg_dcg', 'perc_dcg', 'oov', '#seed']

# One table per k: rows are embeddings, columns are a three-level header (k, seed, metric).
# Table dimensions are derived from big_g rather than hard-coded, and the global `seeds`
# list of (label, seed) tuples is left untouched so the evaluation cell can be re-run.
df_ks = []
names = list(big_g)
for k in ks:
    # Every embedding is evaluated on the same seeds, so the first one provides the labels.
    seed_names = list(big_g[names[0]][k])
    n_columns = len(seed_names) * len(METRIC_LABELS)
    header = [np.array([k] * n_columns),
              np.repeat(seed_names, len(METRIC_LABELS)),   # seed label repeated once per metric
              np.tile(METRIC_LABELS, len(seed_names))]     # metric labels cycled once per seed
    cells = []
    for emb_name in names:
        for seed_name in seed_names:
            # The sixth stored element (new_seed) is not tabulated.
            pos, neg, perc, n_oov, n_seed = big_g[emb_name][k][seed_name][:5]
            cells.append(['%.3f' % pos, '%.3f' % neg, '%.3f' % perc,
                          '%d' % n_oov, '%d' % n_seed])
    # The cells are grouped by embedding, so a row-major reshape gives one row per embedding.
    df_ks.append(pd.DataFrame(np.reshape(cells, (len(names), n_columns)),
                              index=np.array(names), columns=header))

In [22]:
df_ks[0]

Unnamed: 0_level_0,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
Unnamed: 0_level_1,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite
Unnamed: 0_level_2,pos_dcg,neg_dcg,perc_dcg,oov,#seed,pos_dcg,neg_dcg,perc_dcg,oov,#seed,pos_dcg,neg_dcg,perc_dcg,oov,#seed
claims_cuis_hs_300.txt,0.057,2.892,0.078,209,256,0.0,2.948,0.0,389,399,0.003,2.945,0.006,154,157
stanford_cuis_svd_300.txt,0.0,2.948,0.0,256,256,0.0,2.948,0.0,399,399,0.0,2.948,0.0,157,157
cui2vec_pretrained,0.132,2.816,0.203,120,256,0.163,2.785,0.273,214,399,0.046,2.903,0.083,102,157
DeVine_etal_200,0.128,2.82,0.203,136,256,0.157,2.792,0.241,226,399,0.059,2.889,0.07,106,157
PMC-w2v,0.304,2.645,0.43,137,256,0.812,2.136,1.223,47,399,0.574,2.374,0.834,18,157
Health_2.5mreviews.s200.w10.n5.v15.cbow,0.121,2.827,0.199,181,256,0.264,2.685,0.416,207,399,0.172,2.776,0.248,91,157
PubMed-and-PMC-w2v,0.363,2.586,0.523,136,256,0.861,2.088,1.268,46,399,0.604,2.344,0.86,18,157
GoogleNews-vectors-negative300,0.251,2.697,0.352,146,256,0.83,2.119,1.208,62,399,0.657,2.292,0.955,20,157
PubMed-w2v,0.341,2.608,0.465,136,256,0.862,2.086,1.281,46,399,0.729,2.22,0.987,18,157
tweetsvec,0.062,2.886,0.094,193,256,0.103,2.845,0.15,231,399,0.041,2.908,0.064,98,157


In [23]:
df_ks[1]

Unnamed: 0_level_0,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10
Unnamed: 0_level_1,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite
Unnamed: 0_level_2,pos_dcg,neg_dcg,perc_dcg,oov,#seed,pos_dcg,neg_dcg,perc_dcg,oov,#seed,pos_dcg,neg_dcg,perc_dcg,oov,#seed
claims_cuis_hs_300.txt,0.064,4.479,0.102,209,256,0.0,4.544,0.0,389,399,0.003,4.54,0.006,154,157
stanford_cuis_svd_300.txt,0.0,4.544,0.0,256,256,0.0,4.544,0.0,399,399,0.0,4.544,0.0,157,157
cui2vec_pretrained,0.182,4.362,0.359,120,256,0.219,4.325,0.446,214,399,0.058,4.485,0.121,102,157
DeVine_etal_200,0.167,4.376,0.324,136,256,0.182,4.362,0.321,226,399,0.071,4.472,0.108,106,157
PMC-w2v,0.384,4.16,0.695,137,256,0.981,3.563,1.772,47,399,0.667,3.876,1.115,18,157
Health_2.5mreviews.s200.w10.n5.v15.cbow,0.146,4.398,0.277,181,256,0.327,4.216,0.619,207,399,0.211,4.332,0.389,91,157
PubMed-and-PMC-w2v,0.425,4.118,0.742,136,256,0.995,3.549,1.702,46,399,0.676,3.867,1.083,18,157
GoogleNews-vectors-negative300,0.289,4.254,0.465,146,256,0.947,3.597,1.574,62,399,0.751,3.793,1.248,20,157
PubMed-w2v,0.404,4.14,0.684,136,256,0.988,3.556,1.687,46,399,0.811,3.732,1.248,18,157
tweetsvec,0.084,4.459,0.164,193,256,0.142,4.401,0.273,231,399,0.052,4.491,0.102,98,157


In [24]:
df_ks[2]

Unnamed: 0_level_0,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20
Unnamed: 0_level_1,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite
Unnamed: 0_level_2,pos_dcg,neg_dcg,perc_dcg,oov,#seed,pos_dcg,neg_dcg,perc_dcg,oov,#seed,pos_dcg,neg_dcg,perc_dcg,oov,#seed
claims_cuis_hs_300.txt,0.08,6.96,0.168,209,256,0.001,7.039,0.005,389,399,0.003,7.037,0.006,154,157
stanford_cuis_svd_300.txt,0.0,7.04,0.0,256,256,0.0,7.04,0.0,399,399,0.0,7.04,0.0,157,157
cui2vec_pretrained,0.247,6.794,0.617,120,256,0.275,6.766,0.669,214,399,0.069,6.971,0.166,102,157
DeVine_etal_200,0.241,6.799,0.617,136,256,0.225,6.815,0.494,226,399,0.078,6.962,0.134,106,157
PMC-w2v,0.463,6.578,1.039,137,256,1.19,5.85,2.644,47,399,0.828,6.213,1.752,18,157
Health_2.5mreviews.s200.w10.n5.v15.cbow,0.183,6.857,0.43,181,256,0.4,6.641,0.93,207,399,0.248,6.792,0.541,91,157
PubMed-and-PMC-w2v,0.5,6.541,1.047,136,256,1.169,5.872,2.426,46,399,0.796,6.244,1.573,18,157
GoogleNews-vectors-negative300,0.326,6.715,0.629,146,256,1.062,5.978,2.063,62,399,0.805,6.236,1.484,20,157
PubMed-w2v,0.467,6.573,0.953,136,256,1.152,5.888,2.378,46,399,0.91,6.13,1.637,18,157
tweetsvec,0.119,6.921,0.305,193,256,0.18,6.86,0.426,231,399,0.066,6.974,0.166,98,157


In [25]:
df_ks[3]

Unnamed: 0_level_0,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30
Unnamed: 0_level_1,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite
Unnamed: 0_level_2,pos_dcg,neg_dcg,perc_dcg,oov,#seed,pos_dcg,neg_dcg,perc_dcg,oov,#seed,pos_dcg,neg_dcg,perc_dcg,oov,#seed
claims_cuis_hs_300.txt,0.097,9.065,0.246,209,256,0.002,9.16,0.008,389,399,0.003,9.158,0.006,154,157
stanford_cuis_svd_300.txt,0.0,9.162,0.0,256,256,0.0,9.162,0.0,399,399,0.0,9.162,0.0,157,157
cui2vec_pretrained,0.304,8.858,0.887,120,256,0.326,8.835,0.912,214,399,0.085,9.076,0.242,102,157
DeVine_etal_200,0.289,8.872,0.844,136,256,0.257,8.905,0.644,226,399,0.086,9.075,0.172,106,157
PMC-w2v,0.51,8.652,1.281,137,256,1.322,7.84,3.308,47,399,0.869,8.293,1.955,18,157
Health_2.5mreviews.s200.w10.n5.v15.cbow,0.207,8.955,0.547,181,256,0.445,8.717,1.148,207,399,0.263,8.899,0.618,91,157
PubMed-and-PMC-w2v,0.544,8.617,1.262,136,256,1.273,7.889,2.942,46,399,0.852,8.31,1.841,18,157
GoogleNews-vectors-negative300,0.346,8.816,0.727,146,256,1.131,8.031,2.409,62,399,0.832,8.329,1.624,20,157
PubMed-w2v,0.5,8.662,1.113,136,256,1.254,7.908,2.885,46,399,0.953,8.209,1.86,18,157
tweetsvec,0.138,9.024,0.391,193,256,0.21,8.951,0.574,231,399,0.071,9.091,0.185,98,157


In [26]:
df_ks[4]

Unnamed: 0_level_0,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40
Unnamed: 0_level_1,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite
Unnamed: 0_level_2,pos_dcg,neg_dcg,perc_dcg,oov,#seed,pos_dcg,neg_dcg,perc_dcg,oov,#seed,pos_dcg,neg_dcg,perc_dcg,oov,#seed
claims_cuis_hs_300.txt,0.109,10.982,0.309,209,256,0.003,11.088,0.013,389,399,0.003,11.088,0.006,154,157
stanford_cuis_svd_300.txt,0.0,11.091,0.0,256,256,0.0,11.091,0.0,399,399,0.0,11.091,0.0,157,157
cui2vec_pretrained,0.353,10.738,1.141,120,256,0.358,10.733,1.073,214,399,0.094,10.997,0.287,102,157
DeVine_etal_200,0.327,10.764,1.039,136,256,0.277,10.814,0.749,226,399,0.09,11.001,0.191,106,157
PMC-w2v,0.537,10.554,1.422,137,256,1.437,9.654,3.935,47,399,0.904,10.187,2.14,18,157
Health_2.5mreviews.s200.w10.n5.v15.cbow,0.226,10.866,0.652,181,256,0.475,10.616,1.313,207,399,0.279,10.812,0.701,91,157
PubMed-and-PMC-w2v,0.573,10.518,1.414,136,256,1.352,9.739,3.376,46,399,0.879,10.212,1.981,18,157
GoogleNews-vectors-negative300,0.357,10.734,0.785,146,256,1.189,9.902,2.722,62,399,0.85,10.241,1.732,20,157
PubMed-w2v,0.529,10.562,1.266,136,256,1.336,9.755,3.313,46,399,0.999,10.092,2.102,18,157
tweetsvec,0.157,10.934,0.492,193,256,0.235,10.856,0.702,231,399,0.083,11.008,0.248,98,157
