In [1]:
import datetime, measures, mmi_txt_to_cui, umls_tables_processing
import os

from gensim.models import KeyedVectors, Word2Vec
from gensim.test.utils import datapath

### Constants

In [2]:
PATH_EMBEDDINGS = './Embeddings'

### Extracting all the labels for each CUI of the UMLS vocabulary

In [3]:
dict_conso = umls_tables_processing.cui_strings()

0:00:45


In [4]:
len(dict_conso)

3772450

In [5]:
dict_conso['C1963065']

['Apnea, CTCAE', 'Apnea']

## Building  ***seed_rel***

### Extracting all the CUIs related to COPD (only one hop) 

In [6]:
copd_cuis = umls_tables_processing.related_cuis_concept()

256
0:03:27


In [8]:
seed_rel, _ = umls_tables_processing.extracting_strings(copd_cuis, dict_conso)

0:00:00


## Building ***seed_paper***

In [9]:
paper_cuis = mmi_txt_to_cui.mmi_to_cui()

0:00:00


In [10]:
seed_paper, _ = umls_tables_processing.extracting_strings(paper_cuis, dict_conso)

0:00:00


## Building ***seed_paper_lite***

In [11]:
paper_lite_cuis, oov_lite = mmi_txt_to_cui.mmi_lite_freetext()

0:02:33


In [12]:
paper_lite_cuis = list(set(paper_lite_cuis))

In [13]:
oov_lite

['[84–88].\n', '[92, 93]. \n']

In [14]:
seed_paper_lite, _ = umls_tables_processing.extracting_strings(paper_lite_cuis, dict_conso)

0:00:00


## Loading Models

### Importing the embeddings file names

In [15]:
cuis = ('/cuis/', [f.name for f in os.scandir(PATH_EMBEDDINGS+'/cuis') if (f.is_file())&(f.name != 'README.md')])
words = ('/words/', [f.name for f in os.scandir(PATH_EMBEDDINGS+'/words') if (f.is_file())&(f.name != 'README.md')])
embeddings = [cuis, words]
embeddings

[('/cuis/',
  ['claims_cuis_hs_300.txt.gz',
   'stanford_cuis_svd_300.txt.gz',
   'cui2vec_pretrained.txt',
   'DeVine_etal_200.txt']),
 ('/words/',
  ['PMC-w2v.bin',
   'Health_2.5mreviews.s200.w10.n5.v15.cbow.bin',
   'PubMed-and-PMC-w2v.bin',
   'GoogleNews-vectors-negative300.bin',
   'PubMed-w2v.bin',
   'tweetsvec.txt',
   'wikipedia-pubmed-and-PMC-w2v.bin'])]

### Creating a list of k values to test

In [16]:
ks = [5, 10, 20, 30, 40]
#ks = [5]

### Creating a list of seed tuples

In [17]:
seeds = [('seed_rel', seed_rel), ('seed_paper', seed_paper), ('seed_paper_lite', seed_paper_lite)]

### Creating a dictionary of all the embeddings tested, over the seeds and k values.

In [18]:
a = datetime.datetime.now().replace(microsecond=0)
big_g = {}

for k in ks:
    big_g[k] = {}
    print('\n\n k_value: %s\n' % k)
    for type_emb in embeddings:
        for emb in type_emb[1]:
            model = KeyedVectors.load_word2vec_format(PATH_EMBEDDINGS+type_emb[0]+emb, binary=emb.endswith('.bin'))
            name = os.path.splitext(emb)[0]
            big_g[k][name] = {}
            print('\n The name of embedding is: %s\n' % name)
            for seed in seeds:
                if type_emb[0]=='/cuis/':
                    d = measures.occurred_concept(model, seed[1].keys(), k_most_similar=k)
                elif type_emb[0]=='/words/':
                    d = measures.occurred_labels(model, seed[1], k_most_similar=k)
                print('{:s}: pos_dcg: {:.2f}, neg_dcg: {:.2f}, perc_dcg: {:.4f}, oov: {:d}, #seed: {:d}\n'.format(seed[0], measures.pos_dcg(d), measures.neg_dcg(d), measures.percentage_dcg(d), measures.oov(d), len(seed[1])))
                big_g[k][name][seed[0]] = [measures.pos_dcg(d), measures.neg_dcg(d), measures.percentage_dcg(d), measures.oov(d), len(seed[1])]
print(datetime.datetime.now().replace(microsecond=0)-a)        



 k_value: 5


 The name of embedding is: claims_cuis_hs_300.txt

seed_rel: pos_dcg: 14.50, neg_dcg: 740.30, perc_dcg: 0.0156, oov: 209, #seed: 256

seed_paper: pos_dcg: 0.00, neg_dcg: 1176.44, perc_dcg: 0.0000, oov: 389, #seed: 399

seed_paper_lite: pos_dcg: 0.50, neg_dcg: 462.41, perc_dcg: 0.0013, oov: 154, #seed: 157


 The name of embedding is: stanford_cuis_svd_300.txt

seed_rel: pos_dcg: 0.00, neg_dcg: 754.81, perc_dcg: 0.0000, oov: 256, #seed: 256

seed_paper: pos_dcg: 0.00, neg_dcg: 1176.44, perc_dcg: 0.0000, oov: 399, #seed: 399

seed_paper_lite: pos_dcg: 0.00, neg_dcg: 462.91, perc_dcg: 0.0000, oov: 157, #seed: 157



  return (m / dist).astype(REAL)



 The name of embedding is: cui2vec_pretrained

seed_rel: pos_dcg: 33.82, neg_dcg: 720.98, perc_dcg: 0.0406, oov: 120, #seed: 256

seed_paper: pos_dcg: 65.16, neg_dcg: 1111.27, perc_dcg: 0.0546, oov: 214, #seed: 399

seed_paper_lite: pos_dcg: 7.21, neg_dcg: 455.69, perc_dcg: 0.0166, oov: 102, #seed: 157


 The name of embedding is: DeVine_etal_200

seed_rel: pos_dcg: 32.89, neg_dcg: 721.92, perc_dcg: 0.0406, oov: 136, #seed: 256

seed_paper: pos_dcg: 62.48, neg_dcg: 1113.96, perc_dcg: 0.0481, oov: 226, #seed: 399

seed_paper_lite: pos_dcg: 9.28, neg_dcg: 453.63, perc_dcg: 0.0140, oov: 106, #seed: 157


 The name of embedding is: PMC-w2v

0:00:49
seed_rel: pos_dcg: 77.78, neg_dcg: 677.03, perc_dcg: 0.0859, oov: 179, #seed: 256

0:01:55
seed_paper: pos_dcg: 324.07, neg_dcg: 852.37, perc_dcg: 0.2446, oov: 124, #seed: 399

0:00:38
seed_paper_lite: pos_dcg: 90.19, neg_dcg: 372.71, perc_dcg: 0.1669, oov: 64, #seed: 157


 The name of embedding is: Health_2.5mreviews.s200.w10.n5.v15.cbow

0:0

0:00:02
seed_paper: pos_dcg: 159.44, neg_dcg: 2649.63, perc_dcg: 0.0465, oov: 249, #seed: 399

0:00:01
seed_paper_lite: pos_dcg: 38.90, neg_dcg: 1066.42, perc_dcg: 0.0271, oov: 114, #seed: 157


 The name of embedding is: PubMed-and-PMC-w2v

0:01:35
seed_rel: pos_dcg: 127.93, neg_dcg: 1674.38, perc_dcg: 0.0523, oov: 160, #seed: 256

0:03:34
seed_paper: pos_dcg: 466.24, neg_dcg: 2342.83, perc_dcg: 0.1213, oov: 90, #seed: 399

0:01:06
seed_paper_lite: pos_dcg: 125.04, neg_dcg: 980.29, perc_dcg: 0.0787, oov: 43, #seed: 157


 The name of embedding is: GoogleNews-vectors-negative300

0:01:14
seed_rel: pos_dcg: 83.35, neg_dcg: 1718.96, perc_dcg: 0.0314, oov: 185, #seed: 256

0:03:24
seed_paper: pos_dcg: 423.92, neg_dcg: 2385.14, perc_dcg: 0.1031, oov: 116, #seed: 399

0:01:05
seed_paper_lite: pos_dcg: 126.35, neg_dcg: 978.97, perc_dcg: 0.0742, oov: 47, #seed: 157


 The name of embedding is: PubMed-w2v

0:00:51
seed_rel: pos_dcg: 119.53, neg_dcg: 1682.78, perc_dcg: 0.0477, oov: 164, #seed: 

0:02:01
seed_paper: pos_dcg: 533.22, neg_dcg: 3892.10, perc_dcg: 0.0828, oov: 79, #seed: 399

0:00:36
seed_paper_lite: pos_dcg: 156.85, neg_dcg: 1584.44, perc_dcg: 0.0525, oov: 36, #seed: 157


 The name of embedding is: tweetsvec

0:00:01
seed_rel: pos_dcg: 40.23, neg_dcg: 2799.07, perc_dcg: 0.0123, oov: 215, #seed: 256

0:00:01
seed_paper: pos_dcg: 93.78, neg_dcg: 4331.54, perc_dcg: 0.0175, oov: 276, #seed: 399

0:00:00
seed_paper_lite: pos_dcg: 13.03, neg_dcg: 1728.26, perc_dcg: 0.0062, oov: 132, #seed: 157


 The name of embedding is: wikipedia-pubmed-and-PMC-w2v

0:02:13
seed_rel: pos_dcg: 138.09, neg_dcg: 2701.22, perc_dcg: 0.0337, oov: 156, #seed: 256

0:04:53
seed_paper: pos_dcg: 523.94, neg_dcg: 3901.38, perc_dcg: 0.0833, oov: 83, #seed: 399

0:01:30
seed_paper_lite: pos_dcg: 133.45, neg_dcg: 1607.85, perc_dcg: 0.0454, oov: 40, #seed: 157

3:02:48


In [54]:
import pandas as pd
import numpy as np

df_ks = []
for k in big_g.keys():
    names = list(big_g[k].keys())
    seeds = list(big_g[k][names[0]].keys())
    header = [np.array([k]*15),
              np.array([seeds[0],seeds[0],seeds[0],seeds[0],seeds[0],
                        seeds[1],seeds[1],seeds[1],seeds[1],seeds[1],
                        seeds[2],seeds[2],seeds[2],seeds[2],seeds[2]]), 
              np.array(['pos_dcg', 'neg_dcg', 'perc_dcg', 'oov', '#seed',
                        'pos_dcg', 'neg_dcg', 'perc_dcg', 'oov', '#seed',
                        'pos_dcg', 'neg_dcg', 'perc_dcg', 'oov', '#seed'])] 
    tmp = [ ['%.2f'%(big_g[k][name][seed][0]),'%.2f'%(big_g[k][name][seed][1]),
             '%.4f'%(big_g[k][name][seed][2]), '%d'%(big_g[k][name][seed][3]),
             '%d'%(big_g[k][name][seed][4])] for name in names for seed in seeds]
    df_ks.append(pd.DataFrame(np.reshape(tmp, (11, 15)), index=np.array(names), columns = header))

In [57]:
df_ks[0]

Unnamed: 0_level_0,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
Unnamed: 0_level_1,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite
Unnamed: 0_level_2,pos_dcg,neg_dcg,perc_dcg,oov,#seed,pos_dcg,neg_dcg,perc_dcg,oov,#seed,pos_dcg,neg_dcg,perc_dcg,oov,#seed
claims_cuis_hs_300.txt,14.5,740.3,0.0156,209,256,0.0,1176.44,0.0,389,399,0.5,462.41,0.0013,154,157
stanford_cuis_svd_300.txt,0.0,754.81,0.0,256,256,0.0,1176.44,0.0,399,399,0.0,462.91,0.0,157,157
cui2vec_pretrained,33.82,720.98,0.0406,120,256,65.16,1111.27,0.0546,214,399,7.21,455.69,0.0166,102,157
DeVine_etal_200,32.89,721.92,0.0406,136,256,62.48,1113.96,0.0481,226,399,9.28,453.63,0.014,106,157
PMC-w2v,77.78,677.03,0.0859,179,256,324.07,852.37,0.2446,124,399,90.19,372.71,0.1669,64,157
Health_2.5mreviews.s200.w10.n5.v15.cbow,31.09,723.71,0.0398,220,256,105.14,1071.29,0.0832,291,399,27.04,435.87,0.0497,128,157
PubMed-and-PMC-w2v,92.83,661.98,0.1047,169,256,343.41,833.02,0.2536,120,399,94.88,368.03,0.172,58,157
GoogleNews-vectors-negative300,64.37,690.44,0.0703,193,256,331.0,845.44,0.2416,138,399,103.09,359.81,0.1911,66,157
PubMed-w2v,87.28,667.53,0.093,171,256,344.0,832.43,0.2561,109,399,114.43,348.48,0.1975,51,157
tweetsvec,15.89,738.92,0.0187,243,256,41.26,1135.17,0.0301,343,399,6.39,456.52,0.0127,145,157


In [58]:
df_ks[1]

Unnamed: 0_level_0,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10
Unnamed: 0_level_1,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite
Unnamed: 0_level_2,pos_dcg,neg_dcg,perc_dcg,oov,#seed,pos_dcg,neg_dcg,perc_dcg,oov,#seed,pos_dcg,neg_dcg,perc_dcg,oov,#seed
claims_cuis_hs_300.txt,16.41,1146.74,0.0102,209,256,0.0,1812.88,0.0,389,399,0.5,712.84,0.0006,154,157
stanford_cuis_svd_300.txt,0.0,1163.15,0.0,256,256,0.0,1812.88,0.0,399,399,0.0,713.34,0.0,157,157
cui2vec_pretrained,46.54,1116.61,0.0359,120,256,87.22,1725.66,0.0446,214,399,9.16,704.18,0.0121,102,157
DeVine_etal_200,42.85,1120.3,0.0324,136,256,72.6,1740.28,0.0321,226,399,11.21,702.13,0.0108,106,157
PMC-w2v,98.22,1064.93,0.0695,163,256,391.25,1421.63,0.1772,96,399,104.79,608.55,0.1115,60,157
Health_2.5mreviews.s200.w10.n5.v15.cbow,37.33,1125.82,0.0277,213,256,130.59,1682.29,0.0619,266,399,33.16,680.18,0.0389,122,157
PubMed-and-PMC-w2v,108.89,1054.26,0.0742,166,256,396.85,1416.03,0.1702,98,399,106.15,607.19,0.1083,52,157
GoogleNews-vectors-negative300,74.07,1089.09,0.0465,188,256,377.75,1435.13,0.1574,125,399,117.86,595.48,0.1248,56,157
PubMed-w2v,103.37,1059.78,0.0684,169,256,394.03,1418.85,0.1687,97,399,127.39,585.94,0.1248,46,157
tweetsvec,21.56,1141.59,0.0164,233,256,56.79,1756.09,0.0273,321,399,8.22,705.12,0.0102,139,157


In [59]:
df_ks[2]

Unnamed: 0_level_0,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20
Unnamed: 0_level_1,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite
Unnamed: 0_level_2,pos_dcg,neg_dcg,perc_dcg,oov,#seed,pos_dcg,neg_dcg,perc_dcg,oov,#seed,pos_dcg,neg_dcg,perc_dcg,oov,#seed
claims_cuis_hs_300.txt,20.58,1781.73,0.0084,209,256,0.48,2808.59,0.0003,389,399,0.5,1104.82,0.0003,154,157
stanford_cuis_svd_300.txt,0.0,1802.31,0.0,256,256,0.0,2809.07,0.0,399,399,0.0,1105.32,0.0,157,157
cui2vec_pretrained,63.11,1739.19,0.0309,120,256,109.59,2699.48,0.0335,214,399,10.86,1094.46,0.0083,102,157
DeVine_etal_200,61.73,1740.58,0.0309,136,256,89.8,2719.27,0.0247,226,399,12.26,1093.06,0.0067,106,157
PMC-w2v,118.43,1683.88,0.052,159,256,474.88,2334.18,0.1322,83,399,129.94,975.38,0.0876,50,157
Health_2.5mreviews.s200.w10.n5.v15.cbow,46.88,1755.43,0.0215,205,256,159.44,2649.63,0.0465,249,399,38.9,1066.42,0.0271,114,157
PubMed-and-PMC-w2v,127.93,1674.38,0.0523,160,256,466.24,2342.83,0.1213,90,399,125.04,980.29,0.0787,43,157
GoogleNews-vectors-negative300,83.35,1718.96,0.0314,185,256,423.92,2385.14,0.1031,116,399,126.35,978.97,0.0742,47,157
PubMed-w2v,119.53,1682.78,0.0477,164,256,459.77,2349.3,0.1189,87,399,142.89,962.43,0.0818,46,157
tweetsvec,30.55,1771.76,0.0152,225,256,71.74,2737.32,0.0213,301,399,10.42,1094.9,0.0083,137,157


In [60]:
df_ks[3]

Unnamed: 0_level_0,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30
Unnamed: 0_level_1,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite
Unnamed: 0_level_2,pos_dcg,neg_dcg,perc_dcg,oov,#seed,pos_dcg,neg_dcg,perc_dcg,oov,#seed,pos_dcg,neg_dcg,perc_dcg,oov,#seed
claims_cuis_hs_300.txt,24.81,2320.56,0.0082,209,256,0.7,3654.77,0.0003,389,399,0.5,1437.87,0.0002,154,157
stanford_cuis_svd_300.txt,0.0,2345.36,0.0,256,256,0.0,3655.47,0.0,399,399,0.0,1438.37,0.0,157,157
cui2vec_pretrained,77.79,2267.58,0.0296,120,256,130.27,3525.2,0.0304,214,399,13.36,1425.01,0.0081,102,157
DeVine_etal_200,74.03,2271.33,0.0281,136,256,102.49,3552.98,0.0215,226,399,13.54,1424.83,0.0057,106,157
PMC-w2v,130.49,2214.88,0.0427,156,256,527.35,3128.12,0.1103,72,399,136.43,1301.94,0.0652,47,157
Health_2.5mreviews.s200.w10.n5.v15.cbow,52.99,2292.37,0.0182,200,256,177.5,3477.97,0.0383,240,399,41.29,1397.08,0.0206,113,157
PubMed-and-PMC-w2v,139.37,2205.99,0.0421,159,256,507.75,3147.72,0.0981,84,399,133.77,1304.59,0.0614,41,157
GoogleNews-vectors-negative300,88.54,2256.83,0.0242,182,256,451.07,3204.4,0.0803,113,399,130.67,1307.69,0.0541,47,157
PubMed-w2v,127.99,2217.37,0.0371,163,256,500.31,3155.16,0.0962,83,399,149.61,1288.75,0.062,42,157
tweetsvec,35.22,2310.14,0.013,219,256,83.89,3571.58,0.0191,284,399,11.08,1427.29,0.0062,134,157


In [61]:
df_ks[4]

Unnamed: 0_level_0,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40
Unnamed: 0_level_1,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite
Unnamed: 0_level_2,pos_dcg,neg_dcg,perc_dcg,oov,#seed,pos_dcg,neg_dcg,perc_dcg,oov,#seed,pos_dcg,neg_dcg,perc_dcg,oov,#seed
claims_cuis_hs_300.txt,27.93,2811.38,0.0077,209,256,1.08,4424.24,0.0003,389,399,0.5,1740.79,0.0002,154,157
stanford_cuis_svd_300.txt,0.0,2839.3,0.0,256,256,0.0,4425.32,0.0,399,399,0.0,1741.29,0.0,157,157
cui2vec_pretrained,90.32,2748.98,0.0285,120,256,142.92,4282.4,0.0268,214,399,14.72,1726.57,0.0072,102,157
DeVine_etal_200,83.73,2755.57,0.026,136,256,110.62,4314.7,0.0187,226,399,14.11,1727.18,0.0048,106,157
PMC-w2v,137.58,2701.73,0.0355,155,256,573.28,3852.04,0.0984,71,399,141.92,1599.37,0.0535,42,157
Health_2.5mreviews.s200.w10.n5.v15.cbow,57.73,2781.58,0.0163,200,256,189.37,4235.95,0.0328,231,399,43.75,1697.54,0.0175,112,157
PubMed-and-PMC-w2v,146.79,2692.52,0.0354,155,256,539.59,3885.74,0.0844,81,399,138.05,1603.24,0.0495,38,157
GoogleNews-vectors-negative300,91.3,2748.01,0.0196,180,256,474.27,3951.06,0.068,104,399,133.49,1607.8,0.0433,45,157
PubMed-w2v,135.36,2703.95,0.0316,162,256,533.22,3892.1,0.0828,79,399,156.85,1584.44,0.0525,36,157
tweetsvec,40.23,2799.07,0.0123,215,256,93.78,4331.54,0.0175,276,399,13.03,1728.26,0.0062,132,157
