In [5]:
import datetime, measures, mmi_txt_to_cui, umls_tables_processing, utils
import numpy as np
import os
import pandas as pd

from gensim.models import KeyedVectors, Word2Vec
from gensim.test.utils import datapath

### Constants

In [4]:
# Root directory (relative to the notebook) that holds the embedding files;
# the 'cuis' and 'words' sub-directories are scanned under it.
PATH_EMBEDDINGS = './Embeddings'

### Extracting all the labels for each CUI of the UMLS vocabulary

In [6]:
# Build the lookup indexed by CUI (e.g. dict_conso['C1963065'] is queried in a later cell).
dict_conso = umls_tables_processing.cui_strings()

0:01:27


In [7]:
# Number of entries in the CUI lookup.
len(dict_conso)

3772450

In [8]:
# Spot-check the value stored for a single CUI.
dict_conso['C1963065']

['Apnea, CTCAE', 'Apnea']

## Building  ***seed_rel***

### Extracting all the CUIs related to COPD (one hop only) and their relationships

In [9]:
# Concepts related to the target concept, queried with two_way=True and
# without label extraction; the mapping is keyed by CUI.
copd_dict = umls_tables_processing.concepts_related_to_concept(
    two_way=True,
    extract_labels=False,
)
# Keep just the CUIs (the keys of the mapping) as a list.
copd_cuis = [cui for cui in copd_dict.keys()]

0:01:07


#### Check the proportion of CUIs that keep at least one relation: `number of CUIs with at least one relation / total number of CUIs found`

In [9]:
# Repeat the extraction with polishing_rels=True, then print
# "<CUIs with a non-empty relation collection>/<all CUIs found>".
_ = umls_tables_processing.concepts_related_to_concept(
    two_way=True,
    polishing_rels=True,
    extract_labels=False,
)
n_with_relations = sum(1 for rels in _.values() if len(rels) > 0)
print(str(n_with_relations)+'/'+str(len(_)))

Relation '' discarded 
101/256


#### Check which concepts have the empty relation

In [7]:
# Repeat the extraction with polishing_rels=False, then print
# "<CUIs whose relation collection contains ''>/<all CUIs found>".
_ = umls_tables_processing.concepts_related_to_concept(
    two_way=True,
    polishing_rels=False,
    extract_labels=False,
)
n_with_empty_relation = sum(1 for rels in _.values() if '' in rels)
print(str(n_with_empty_relation)+'/'+str(len(_)))

0:01:10
177/256


### Check the semantic types contained in *seed_rel*

In [8]:
# Semantic types for the COPD-related CUIs, converted and tallied by the
# mmi_txt_to_cui helpers.
eval_rel = umls_tables_processing.extracting_stys(copd_cuis)
sty_mmi = mmi_txt_to_cui.convert_sty_stymmi(eval_rel)
t = mmi_txt_to_cui.check_sty_mmi(sty_mmi)
# Show only the first five entries of the tally.
print(dict(list(t.items())[:5]))

0:09:41
{'dsyn': (105, 'Disease or Syndrome'), 'ftcn': (25, 'Functional Concept'), 'patf': (12, 'Pathologic Function'), 'hlca': (12, 'Health Care Activity'), 'topp': (11, 'Therapeutic or Preventive Procedure')}


In [10]:
# Build seed_rel from the COPD-related CUIs; the second return value is discarded.
# seed_rel is later used as a mapping (its .keys() are passed to measures.occurred_concept).
seed_rel, _ = umls_tables_processing.extracting_strings(copd_cuis)

0:00:00


## Building ***seed_paper***

In [11]:
# Entries for the paper seed; later cells read element [0] of every entry
# and pass the whole list to check_sty_mmi.
paper_cuis = mmi_txt_to_cui.mmi_to_cui(sty = True)

0:00:00


### Check the semantic types contained in *seed_paper*

In [12]:
# Tally the semantic types of the paper seed and show the first five entries.
a = mmi_txt_to_cui.check_sty_mmi(paper_cuis)
first_five = list(a.items())[:5]
print(dict(first_five))

{'qlco': (49, 'Qualitative Concept'), 'fndg': (38, 'Finding'), 'inpr': (35, 'Intellectual Product'), 'qnco': (34, 'Quantitative Concept'), 'ftcn': (30, 'Functional Concept')}


In [13]:
# Element [0] of every paper entry is what extracting_strings receives;
# its second return value is discarded.
paper_first_elements = [entry[0] for entry in paper_cuis]
seed_paper, _ = umls_tables_processing.extracting_strings(paper_first_elements)

0:00:00


## Building ***seed_paper_lite***

In [14]:
# Entries for the lite paper seed; the second return value is kept as oov_lite
# so it can be inspected in a later cell.
paper_lite_cuis, oov_lite = mmi_txt_to_cui.mmi_lite_freetext(sty = True)

0:03:35


In [15]:
# Deduplicate element [0] of every entry; going through a set does not preserve order.
paper_lite_cuis_ = list(set(entry[0] for entry in paper_lite_cuis))

### Check the semantic types contained in *seed_paper_lite*

In [14]:
# Tally the semantic types of the lite paper seed and show the first five entries.
c = mmi_txt_to_cui.check_sty_mmi(paper_lite_cuis)
first_five = list(c.items())[:5]
print(dict(first_five))

{'inpr': (26, 'Intellectual Product'), 'qlco': (21, 'Qualitative Concept'), 'fndg': (20, 'Finding'), 'idcn': (16, 'Idea or Concept'), 'topp': (13, 'Therapeutic or Preventive Procedure')}


In [15]:
# Inspect the second value returned by mmi_lite_freetext.
oov_lite

['[84–88].\n', '[92, 93]. \n']

In [16]:
# Build seed_paper_lite from the deduplicated list; the second return value is discarded.
seed_paper_lite, _ = umls_tables_processing.extracting_strings(paper_lite_cuis_)

0:00:00


## Loading Models

### Importing the embeddings file names

In [17]:
def _embedding_files(subdir):
    """Return the names of the regular files in PATH_EMBEDDINGS/<subdir>, skipping README.md."""
    return [entry.name
            for entry in os.scandir(PATH_EMBEDDINGS + '/' + subdir)
            if entry.is_file() and entry.name != 'README.md']


# Each group is a pair: (sub-directory wrapped in slashes, file names found in it).
cuis = ('/cuis/', _embedding_files('cuis'))
words = ('/words/', _embedding_files('words'))
embeddings = [cuis, words]
embeddings

[('/cuis/',
  ['claims_cuis_hs_300.txt.gz',
   'stanford_cuis_svd_300.txt.gz',
   'cui2vec_pretrained.txt',
   'DeVine_etal_200.txt']),
 ('/words/',
  ['PMC-w2v.bin',
   'Health_2.5mreviews.s200.w10.n5.v15.cbow.bin',
   'PubMed-and-PMC-w2v.bin',
   'GoogleNews-vectors-negative300.bin',
   'PubMed-w2v.bin',
   'tweetsvec.txt',
   'wikipedia-pubmed-and-PMC-w2v.bin'])]

### Creating a list of k values to test

In [18]:
# k values (passed as k_most_similar) at which every embedding is evaluated.
ks = [5, 10, 20, 30, 40]
# Shorter alternative, kept for quick runs:
#ks = [5, 10]

### Creating a list of seed tuples

In [18]:
# Pair every seed with its display name: (name, seed object).
seeds = [('seed_rel', seed_rel), ('seed_paper', seed_paper), ('seed_paper_lite', seed_paper_lite)]

### Creating a dictionary of all the embeddings tested, over the seeds and k values.

In [20]:
def _seed_scores(d, k, seed_size, extra):
    """Assemble one result row: [pos_dcg, neg_dcg, perc_dcg, oov, #seed, extra]."""
    return [measures.pos_dcg(d, normalization=True, norm_fact=measures.max_dcg(k)),
            measures.neg_dcg(d, normalization=True, norm_fact=measures.max_dcg(k)),
            measures.percentage_dcg(d, k=k),
            measures.oov(d),
            seed_size,
            extra]


# Evaluate every embedding file on every seed for each k in ks.
# Layout: big_g[embedding name][k][seed name] -> row built by _seed_scores.
a = datetime.datetime.now().replace(microsecond=0)
big_g = {}

for type_emb in embeddings:
    subdir, file_names = type_emb
    for emb in file_names:
        # Files ending in '.bin' are loaded with binary=True, everything else as text.
        model = KeyedVectors.load_word2vec_format(PATH_EMBEDDINGS + subdir + emb,
                                                  binary=emb.endswith('.bin'))
        # Only the last extension is stripped ('x.txt.gz' -> 'x.txt').
        name = os.path.splitext(emb)[0]
        big_g[name] = {}
        print('\n\n The name of embedding is: %s\n' % name)
        for k in ks:
            print('\n k_value: %s\n' % k)
            big_g[name][k] = {}
            for seed in seeds:
                seed_name, seed_data = seed
                if subdir == '/cuis/':
                    # CUI embeddings are queried with the seed keys.
                    d = measures.occurred_concept(model, seed_data.keys(), k_most_similar=k)
                    big_g[name][k][seed_name] = _seed_scores(d, k, len(seed_data), [])
                elif subdir == '/words/':
                    # Word embeddings are queried with the whole seed object; the
                    # second return value is stored as the last element of the row.
                    d, new_seed = measures.occurred_labels(model, seed_data, k_most_similar=k)
                    big_g[name][k][seed_name] = _seed_scores(d, k, len(seed_data), new_seed)
                row = big_g[name][k][seed_name]
                print('{:s}: pos_dcg: {:.4f}, neg_dcg: {:.4f}, perc_dcg: {:.4f}, oov: {:d}, #seed: {:d}\n'.
                      format(seed_name, *row[:5]))
print(datetime.datetime.now().replace(microsecond=0)-a)



 The name of embedding is: claims_cuis_hs_300.txt


 k_value: 5

seed_rel: pos_dcg: 0.0192, neg_dcg: 0.9808, perc_dcg: 0.0156, oov: 209, #seed: 256

seed_paper: pos_dcg: 0.0000, neg_dcg: 1.0000, perc_dcg: 0.0000, oov: 389, #seed: 399

seed_paper_lite: pos_dcg: 0.0011, neg_dcg: 0.9989, perc_dcg: 0.0013, oov: 154, #seed: 157


 k_value: 10

seed_rel: pos_dcg: 0.0141, neg_dcg: 0.9859, perc_dcg: 0.0102, oov: 209, #seed: 256

seed_paper: pos_dcg: 0.0000, neg_dcg: 1.0000, perc_dcg: 0.0000, oov: 389, #seed: 399

seed_paper_lite: pos_dcg: 0.0007, neg_dcg: 0.9993, perc_dcg: 0.0006, oov: 154, #seed: 157


 k_value: 20

seed_rel: pos_dcg: 0.0114, neg_dcg: 0.9886, perc_dcg: 0.0084, oov: 209, #seed: 256

seed_paper: pos_dcg: 0.0002, neg_dcg: 0.9998, perc_dcg: 0.0003, oov: 389, #seed: 399

seed_paper_lite: pos_dcg: 0.0005, neg_dcg: 0.9995, perc_dcg: 0.0003, oov: 154, #seed: 157


 k_value: 30

seed_rel: pos_dcg: 0.0106, neg_dcg: 0.9894, perc_dcg: 0.0082, oov: 209, #seed: 256

seed_paper: pos_dcg: 

  return (m / dist).astype(REAL)




 The name of embedding is: cui2vec_pretrained


 k_value: 5

seed_rel: pos_dcg: 0.0448, neg_dcg: 0.9552, perc_dcg: 0.0406, oov: 120, #seed: 256

seed_paper: pos_dcg: 0.0554, neg_dcg: 0.9446, perc_dcg: 0.0546, oov: 214, #seed: 399

seed_paper_lite: pos_dcg: 0.0156, neg_dcg: 0.9844, perc_dcg: 0.0166, oov: 102, #seed: 157


 k_value: 10

seed_rel: pos_dcg: 0.0400, neg_dcg: 0.9600, perc_dcg: 0.0359, oov: 120, #seed: 256

seed_paper: pos_dcg: 0.0481, neg_dcg: 0.9519, perc_dcg: 0.0446, oov: 214, #seed: 399

seed_paper_lite: pos_dcg: 0.0128, neg_dcg: 0.9872, perc_dcg: 0.0121, oov: 102, #seed: 157


 k_value: 20

seed_rel: pos_dcg: 0.0350, neg_dcg: 0.9650, perc_dcg: 0.0309, oov: 120, #seed: 256

seed_paper: pos_dcg: 0.0390, neg_dcg: 0.9610, perc_dcg: 0.0335, oov: 214, #seed: 399

seed_paper_lite: pos_dcg: 0.0098, neg_dcg: 0.9902, perc_dcg: 0.0083, oov: 102, #seed: 157


 k_value: 30

seed_rel: pos_dcg: 0.0332, neg_dcg: 0.9668, perc_dcg: 0.0296, oov: 120, #seed: 256

seed_paper: pos_dcg: 0.03

0:03:25
seed_paper: pos_dcg: 0.1509, neg_dcg: 0.8491, perc_dcg: 0.1031, oov: 62, #seed: 399

0:01:06
seed_paper_lite: pos_dcg: 0.1143, neg_dcg: 0.8857, perc_dcg: 0.0742, oov: 20, #seed: 157


 k_value: 30

0:01:05
seed_rel: pos_dcg: 0.0378, neg_dcg: 0.9622, perc_dcg: 0.0242, oov: 146, #seed: 256

0:03:24
seed_paper: pos_dcg: 0.1234, neg_dcg: 0.8766, perc_dcg: 0.0803, oov: 62, #seed: 399

0:01:05
seed_paper_lite: pos_dcg: 0.0908, neg_dcg: 0.9092, perc_dcg: 0.0541, oov: 20, #seed: 157


 k_value: 40

0:01:04
seed_rel: pos_dcg: 0.0322, neg_dcg: 0.9678, perc_dcg: 0.0196, oov: 146, #seed: 256

0:03:25
seed_paper: pos_dcg: 0.1072, neg_dcg: 0.8928, perc_dcg: 0.0680, oov: 62, #seed: 399

0:01:05
seed_paper_lite: pos_dcg: 0.0767, neg_dcg: 0.9233, perc_dcg: 0.0433, oov: 20, #seed: 157



 The name of embedding is: PubMed-w2v


 k_value: 5

0:00:52
seed_rel: pos_dcg: 0.1156, neg_dcg: 0.8844, perc_dcg: 0.0930, oov: 136, #seed: 256

0:02:05
seed_paper: pos_dcg: 0.2924, neg_dcg: 0.7076, perc_dcg: 0.

### Saving the obtained dictionary of all the embeddings tested, over the seeds and k values.

In [23]:
# Persist big_g and print how long the save took (whole seconds).
a = datetime.datetime.now().replace(microsecond=0)
utils.inputs_save(big_g, 'Utilities/big_g')
elapsed = datetime.datetime.now().replace(microsecond=0) - a
print(elapsed)

0:00:00


### Tabulating the dictionary data

In [61]:
# Column labels of the per-seed metric group, in the order produced by _format_scores.
METRIC_LABELS = ['pos_dcg', 'neg_dcg', 'perc_dcg', 'iov%', 'oov', '#seed']


def _format_scores(scores):
    """Format one big_g row as the six display strings of a seed's column group.

    `scores` is indexed as [pos_dcg, neg_dcg, perc_dcg, oov, #seed, ...].
    'iov%' is the share of seed elements that are not out-of-vocabulary.
    """
    pos, neg, perc, n_oov, n_seed = scores[:5]
    # Guard the ratio so an empty seed does not abort the whole table.
    iov_pct = ((n_seed - n_oov) / n_seed) * 100 if n_seed else 0.0
    return ['%.3f' % pos,
            '%.3f' % neg,
            '%.3f' % perc,
            '%.2f' % iov_pct + '%',
            '%d' % n_oov,
            '%d' % n_seed]


# One table per k: rows are embeddings, columns are (k, seed, metric).
# The seed names are read from big_g itself and kept in `seed_names`, so the
# `seeds` list of (name, seed) tuples used by the evaluation cells is left
# untouched; the table shape follows the data instead of fixed 11 x 18 sizes.
df_ks = []
names = list(big_g.keys())
for k in ks:
    seed_names = list(big_g[names[0]][k].keys())
    columns = pd.MultiIndex.from_product([[k], seed_names, METRIC_LABELS])
    rows = [[cell
             for seed_name in seed_names
             for cell in _format_scores(big_g[emb_name][k][seed_name])]
            for emb_name in names]
    df_ks.append(pd.DataFrame(rows, index=np.array(names), columns=columns))

In [62]:
# Results table for the first k value (ks[0]).
df_ks[0]

Unnamed: 0_level_0,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
Unnamed: 0_level_1,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite
Unnamed: 0_level_2,pos_dcg,neg_dcg,perc_dcg,iov%,oov,#seed,pos_dcg,neg_dcg,perc_dcg,iov%,oov,#seed,pos_dcg,neg_dcg,perc_dcg,iov%,oov,#seed
claims_cuis_hs_300.txt,0.019,0.981,0.016,18.36%,209,256,0.0,1.0,0.0,2.51%,389,399,0.001,0.999,0.001,1.91%,154,157
stanford_cuis_svd_300.txt,0.0,1.0,0.0,0.00%,256,256,0.0,1.0,0.0,0.00%,399,399,0.0,1.0,0.0,0.00%,157,157
cui2vec_pretrained,0.045,0.955,0.041,53.12%,120,256,0.055,0.945,0.055,46.37%,214,399,0.016,0.984,0.017,35.03%,102,157
DeVine_etal_200,0.044,0.956,0.041,46.88%,136,256,0.053,0.947,0.048,43.36%,226,399,0.02,0.98,0.014,32.48%,106,157
PMC-w2v,0.103,0.897,0.086,46.48%,137,256,0.275,0.725,0.245,88.22%,47,399,0.195,0.805,0.167,88.54%,18,157
Health_2.5mreviews.s200.w10.n5.v15.cbow,0.041,0.959,0.04,29.30%,181,256,0.089,0.911,0.083,48.12%,207,399,0.058,0.942,0.05,42.04%,91,157
PubMed-and-PMC-w2v,0.123,0.877,0.105,46.88%,136,256,0.292,0.708,0.254,88.47%,46,399,0.205,0.795,0.172,88.54%,18,157
GoogleNews-vectors-negative300,0.085,0.915,0.07,42.97%,146,256,0.281,0.719,0.242,84.46%,62,399,0.223,0.777,0.191,87.26%,20,157
PubMed-w2v,0.116,0.884,0.093,46.88%,136,256,0.292,0.708,0.256,88.47%,46,399,0.247,0.753,0.197,88.54%,18,157
tweetsvec,0.021,0.979,0.019,24.61%,193,256,0.035,0.965,0.03,42.11%,231,399,0.014,0.986,0.013,37.58%,98,157


In [63]:
# Results table for the second k value (ks[1]).
df_ks[1]

Unnamed: 0_level_0,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10
Unnamed: 0_level_1,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite
Unnamed: 0_level_2,pos_dcg,neg_dcg,perc_dcg,iov%,oov,#seed,pos_dcg,neg_dcg,perc_dcg,iov%,oov,#seed,pos_dcg,neg_dcg,perc_dcg,iov%,oov,#seed
claims_cuis_hs_300.txt,0.014,0.986,0.01,18.36%,209,256,0.0,1.0,0.0,2.51%,389,399,0.001,0.999,0.001,1.91%,154,157
stanford_cuis_svd_300.txt,0.0,1.0,0.0,0.00%,256,256,0.0,1.0,0.0,0.00%,399,399,0.0,1.0,0.0,0.00%,157,157
cui2vec_pretrained,0.04,0.96,0.036,53.12%,120,256,0.048,0.952,0.045,46.37%,214,399,0.013,0.987,0.012,35.03%,102,157
DeVine_etal_200,0.037,0.963,0.032,46.88%,136,256,0.04,0.96,0.032,43.36%,226,399,0.016,0.984,0.011,32.48%,106,157
PMC-w2v,0.084,0.916,0.07,46.48%,137,256,0.216,0.784,0.177,88.22%,47,399,0.147,0.853,0.111,88.54%,18,157
Health_2.5mreviews.s200.w10.n5.v15.cbow,0.032,0.968,0.028,29.30%,181,256,0.072,0.928,0.062,48.12%,207,399,0.046,0.954,0.039,42.04%,91,157
PubMed-and-PMC-w2v,0.094,0.906,0.074,46.88%,136,256,0.219,0.781,0.17,88.47%,46,399,0.149,0.851,0.108,88.54%,18,157
GoogleNews-vectors-negative300,0.064,0.936,0.046,42.97%,146,256,0.208,0.792,0.157,84.46%,62,399,0.165,0.835,0.125,87.26%,20,157
PubMed-w2v,0.089,0.911,0.068,46.88%,136,256,0.217,0.783,0.169,88.47%,46,399,0.179,0.821,0.125,88.54%,18,157
tweetsvec,0.019,0.981,0.016,24.61%,193,256,0.031,0.969,0.027,42.11%,231,399,0.012,0.988,0.01,37.58%,98,157


In [64]:
# Results table for the third k value (ks[2]).
df_ks[2]

Unnamed: 0_level_0,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20
Unnamed: 0_level_1,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite
Unnamed: 0_level_2,pos_dcg,neg_dcg,perc_dcg,iov%,oov,#seed,pos_dcg,neg_dcg,perc_dcg,iov%,oov,#seed,pos_dcg,neg_dcg,perc_dcg,iov%,oov,#seed
claims_cuis_hs_300.txt,0.011,0.989,0.008,18.36%,209,256,0.0,1.0,0.0,2.51%,389,399,0.0,1.0,0.0,1.91%,154,157
stanford_cuis_svd_300.txt,0.0,1.0,0.0,0.00%,256,256,0.0,1.0,0.0,0.00%,399,399,0.0,1.0,0.0,0.00%,157,157
cui2vec_pretrained,0.035,0.965,0.031,53.12%,120,256,0.039,0.961,0.033,46.37%,214,399,0.01,0.99,0.008,35.03%,102,157
DeVine_etal_200,0.034,0.966,0.031,46.88%,136,256,0.032,0.968,0.025,43.36%,226,399,0.011,0.989,0.007,32.48%,106,157
PMC-w2v,0.066,0.934,0.052,46.48%,137,256,0.169,0.831,0.132,88.22%,47,399,0.118,0.882,0.088,88.54%,18,157
Health_2.5mreviews.s200.w10.n5.v15.cbow,0.026,0.974,0.021,29.30%,181,256,0.057,0.943,0.046,48.12%,207,399,0.035,0.965,0.027,42.04%,91,157
PubMed-and-PMC-w2v,0.071,0.929,0.052,46.88%,136,256,0.166,0.834,0.121,88.47%,46,399,0.113,0.887,0.079,88.54%,18,157
GoogleNews-vectors-negative300,0.046,0.954,0.031,42.97%,146,256,0.151,0.849,0.103,84.46%,62,399,0.114,0.886,0.074,87.26%,20,157
PubMed-w2v,0.066,0.934,0.048,46.88%,136,256,0.164,0.836,0.119,88.47%,46,399,0.129,0.871,0.082,88.54%,18,157
tweetsvec,0.017,0.983,0.015,24.61%,193,256,0.026,0.974,0.021,42.11%,231,399,0.009,0.991,0.008,37.58%,98,157


In [65]:
# Results table for the fourth k value (ks[3]).
df_ks[3]

Unnamed: 0_level_0,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30
Unnamed: 0_level_1,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite
Unnamed: 0_level_2,pos_dcg,neg_dcg,perc_dcg,iov%,oov,#seed,pos_dcg,neg_dcg,perc_dcg,iov%,oov,#seed,pos_dcg,neg_dcg,perc_dcg,iov%,oov,#seed
claims_cuis_hs_300.txt,0.011,0.989,0.008,18.36%,209,256,0.0,1.0,0.0,2.51%,389,399,0.0,1.0,0.0,1.91%,154,157
stanford_cuis_svd_300.txt,0.0,1.0,0.0,0.00%,256,256,0.0,1.0,0.0,0.00%,399,399,0.0,1.0,0.0,0.00%,157,157
cui2vec_pretrained,0.033,0.967,0.03,53.12%,120,256,0.036,0.964,0.03,46.37%,214,399,0.009,0.991,0.008,35.03%,102,157
DeVine_etal_200,0.032,0.968,0.028,46.88%,136,256,0.028,0.972,0.021,43.36%,226,399,0.009,0.991,0.006,32.48%,106,157
PMC-w2v,0.056,0.944,0.043,46.48%,137,256,0.144,0.856,0.11,88.22%,47,399,0.095,0.905,0.065,88.54%,18,157
Health_2.5mreviews.s200.w10.n5.v15.cbow,0.023,0.977,0.018,29.30%,181,256,0.049,0.951,0.038,48.12%,207,399,0.029,0.971,0.021,42.04%,91,157
PubMed-and-PMC-w2v,0.059,0.941,0.042,46.88%,136,256,0.139,0.861,0.098,88.47%,46,399,0.093,0.907,0.061,88.54%,18,157
GoogleNews-vectors-negative300,0.038,0.962,0.024,42.97%,146,256,0.123,0.877,0.08,84.46%,62,399,0.091,0.909,0.054,87.26%,20,157
PubMed-w2v,0.055,0.945,0.037,46.88%,136,256,0.137,0.863,0.096,88.47%,46,399,0.104,0.896,0.062,88.54%,18,157
tweetsvec,0.015,0.985,0.013,24.61%,193,256,0.023,0.977,0.019,42.11%,231,399,0.008,0.992,0.006,37.58%,98,157


In [66]:
# Results table for the fifth k value (ks[4]).
df_ks[4]

Unnamed: 0_level_0,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40
Unnamed: 0_level_1,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite,seed_paper_lite
Unnamed: 0_level_2,pos_dcg,neg_dcg,perc_dcg,iov%,oov,#seed,pos_dcg,neg_dcg,perc_dcg,iov%,oov,#seed,pos_dcg,neg_dcg,perc_dcg,iov%,oov,#seed
claims_cuis_hs_300.txt,0.01,0.99,0.008,18.36%,209,256,0.0,1.0,0.0,2.51%,389,399,0.0,1.0,0.0,1.91%,154,157
stanford_cuis_svd_300.txt,0.0,1.0,0.0,0.00%,256,256,0.0,1.0,0.0,0.00%,399,399,0.0,1.0,0.0,0.00%,157,157
cui2vec_pretrained,0.032,0.968,0.029,53.12%,120,256,0.032,0.968,0.027,46.37%,214,399,0.008,0.992,0.007,35.03%,102,157
DeVine_etal_200,0.029,0.971,0.026,46.88%,136,256,0.025,0.975,0.019,43.36%,226,399,0.008,0.992,0.005,32.48%,106,157
PMC-w2v,0.048,0.952,0.036,46.48%,137,256,0.13,0.87,0.098,88.22%,47,399,0.082,0.918,0.054,88.54%,18,157
Health_2.5mreviews.s200.w10.n5.v15.cbow,0.02,0.98,0.016,29.30%,181,256,0.043,0.957,0.033,48.12%,207,399,0.025,0.975,0.018,42.04%,91,157
PubMed-and-PMC-w2v,0.052,0.948,0.035,46.88%,136,256,0.122,0.878,0.084,88.47%,46,399,0.079,0.921,0.05,88.54%,18,157
GoogleNews-vectors-negative300,0.032,0.968,0.02,42.97%,146,256,0.107,0.893,0.068,84.46%,62,399,0.077,0.923,0.043,87.26%,20,157
PubMed-w2v,0.048,0.952,0.032,46.88%,136,256,0.12,0.88,0.083,88.47%,46,399,0.09,0.91,0.053,88.54%,18,157
tweetsvec,0.014,0.986,0.012,24.61%,193,256,0.021,0.979,0.018,42.11%,231,399,0.007,0.993,0.006,37.58%,98,157


### Creating a dictionary of all the embeddings tested, over the max k value, corresponding to the number of IoV elements.

In [19]:
# Evaluate every embedding on every seed with k set to the number of
# in-vocabulary (IoV) seed elements for that embedding.
# Layout: big_k[embedding name][seed name] -> [perc_dcg, k_iov, oov, #seed, []].
a = datetime.datetime.now().replace(microsecond=0)
big_k = {}

for type_emb in embeddings:
    subdir, file_names = type_emb
    for emb in file_names:
        # Files ending in '.bin' are loaded with binary=True, everything else as text.
        model = KeyedVectors.load_word2vec_format(PATH_EMBEDDINGS+subdir+emb, binary=emb.endswith('.bin'))
        name = os.path.splitext(emb)[0]
        big_k[name] = {}
        print('\n\n The name of embedding is: %s\n' % name)

        # The vocabulary depends only on the model, so extract it once per
        # embedding instead of once per seed.
        # NOTE(review): assumes discarding_labels_oov does not modify Vemb -- confirm.
        Vemb = utils.extract_w2v_vocab(model)
        # The set is only needed for the CUI intersection below.
        vocab_set = set(Vemb) if subdir == '/cuis/' else None

        for seed in seeds:
            seed_name, seed_data = seed
            if subdir == '/cuis/':
                # IoV count: seed keys that are present in the vocabulary.
                k_iov = len(vocab_set.intersection(seed_data.keys()))
                # A query needs k >= 1, so an embedding with no IoV element is
                # run (and reported) with k = 1.
                k_iov = max(k_iov, 1)
                d = measures.occurred_concept(model, seed_data.keys(), k_most_similar=k_iov)
            elif subdir == '/words/':
                processed_seed = umls_tables_processing.discarding_labels_oov(Vemb, seed_data)
                # IoV count: seed entries left with a non-empty value after filtering.
                k_iov = sum(1 for labels in processed_seed.values() if len(labels) > 0)
                k_iov = max(k_iov, 1)
                d, _ = measures.occurred_labels(model, processed_seed, k_most_similar=k_iov)
            else:
                # Fail with a clear message instead of a KeyError at the print below.
                raise ValueError('Unknown embedding group: %s' % subdir)

            big_k[name][seed_name] = [measures.percentage_dcg(d, k=k_iov),
                                      k_iov,
                                      measures.oov(d),
                                      len(seed_data), []]
            print('{:s}: perc_dcg: {:.4f}, iov/k-NN: {:d}, oov: {:d}, #seed: {:d}\n'.
                  format(seed_name, *big_k[name][seed_name][:4]))
print(datetime.datetime.now().replace(microsecond=0)-a)



 The name of embedding is: claims_cuis_hs_300.txt

seed_rel: perc_dcg: 0.0078, iov/k-NN: 47, oov: 209, #seed: 256

seed_paper: perc_dcg: 0.0000, iov/k-NN: 10, oov: 389, #seed: 399

seed_paper_lite: perc_dcg: 0.0021, iov/k-NN: 3, oov: 154, #seed: 157



 The name of embedding is: stanford_cuis_svd_300.txt

seed_rel: perc_dcg: 0.0000, iov/k-NN: 1, oov: 256, #seed: 256

seed_paper: perc_dcg: 0.0000, iov/k-NN: 1, oov: 399, #seed: 399

seed_paper_lite: perc_dcg: 0.0000, iov/k-NN: 1, oov: 157, #seed: 157



  return (m / dist).astype(REAL)




 The name of embedding is: cui2vec_pretrained

seed_rel: perc_dcg: 0.0198, iov/k-NN: 136, oov: 120, #seed: 256

seed_paper: perc_dcg: 0.0155, iov/k-NN: 185, oov: 214, #seed: 399

seed_paper_lite: perc_dcg: 0.0060, iov/k-NN: 55, oov: 102, #seed: 157



 The name of embedding is: DeVine_etal_200

seed_rel: perc_dcg: 0.0177, iov/k-NN: 120, oov: 136, #seed: 256

seed_paper: perc_dcg: 0.0099, iov/k-NN: 173, oov: 226, #seed: 399

seed_paper_lite: perc_dcg: 0.0042, iov/k-NN: 51, oov: 106, #seed: 157



 The name of embedding is: PMC-w2v

0:00:00
0:00:55
seed_rel: perc_dcg: 0.0205, iov/k-NN: 119, oov: 137, #seed: 256

0:00:01
0:02:34
seed_paper: perc_dcg: 0.0452, iov/k-NN: 352, oov: 47, #seed: 399

0:00:01
0:01:05
seed_paper_lite: perc_dcg: 0.0244, iov/k-NN: 139, oov: 18, #seed: 157



 The name of embedding is: Health_2.5mreviews.s200.w10.n5.v15.cbow

0:00:00
0:00:00
seed_rel: perc_dcg: 0.0119, iov/k-NN: 75, oov: 181, #seed: 256

0:00:00
0:00:02
seed_paper: perc_dcg: 0.0193, iov/k-NN: 192, 

In [31]:
# Column labels of the per-seed metric group (first three elements of a big_k row).
KMAX_METRIC_LABELS = ['perc_dcg', 'iov/k-NN', 'oov']

# Single table for the k = IoV evaluation: rows are embeddings, columns are
# (seed name with its size, metric). Seed names and sizes are read from big_k
# (the size is stored at index 3 of every row), so the header and the table
# shape follow the data instead of hard-coded sizes and an 11 x 9 reshape.
df_kmax = []
names = list(big_k.keys())
seed_names = list(big_k[names[0]].keys())
seed_headers = ['%s #%d' % (seed_name, big_k[names[0]][seed_name][3])
                for seed_name in seed_names]
columns = pd.MultiIndex.from_product([seed_headers, KMAX_METRIC_LABELS])

rows = [[cell
         for seed_name in seed_names
         for cell in ('%.3f' % big_k[emb_name][seed_name][0],
                      '%d' % big_k[emb_name][seed_name][1],
                      '%d' % big_k[emb_name][seed_name][2])]
        for emb_name in names]
df_kmax.append(pd.DataFrame(rows, index=np.array(names), columns=columns))

In [32]:
# Show the single k = IoV results table.
df_kmax[0]

Unnamed: 0_level_0,seed_rel #256,seed_rel #256,seed_rel #256,seed_paper #399,seed_paper #399,seed_paper #399,seed_paper_lite #157,seed_paper_lite #157,seed_paper_lite #157
Unnamed: 0_level_1,perc_dcg,iov/k-NN,oov,perc_dcg,iov/k-NN,oov,perc_dcg,iov/k-NN,oov
claims_cuis_hs_300.txt,0.008,47,209,0.0,10,389,0.002,3,154
stanford_cuis_svd_300.txt,0.0,1,256,0.0,1,399,0.0,1,157
cui2vec_pretrained,0.02,136,120,0.015,185,214,0.006,55,102
DeVine_etal_200,0.018,120,136,0.01,173,226,0.004,51,106
PMC-w2v,0.021,119,137,0.045,352,47,0.024,139,18
Health_2.5mreviews.s200.w10.n5.v15.cbow,0.012,75,181,0.019,192,207,0.013,66,91
PubMed-and-PMC-w2v,0.019,120,136,0.032,353,46,0.022,139,18
GoogleNews-vectors-negative300,0.012,110,146,0.019,337,62,0.016,137,20
PubMed-w2v,0.018,120,136,0.028,353,46,0.022,139,18
tweetsvec,0.012,63,193,0.011,168,231,0.005,59,98


In [33]:
# Persist big_k and print how long the save took (whole seconds).
a = datetime.datetime.now().replace(microsecond=0)
utils.inputs_save(big_k, 'Utilities/big_k')
elapsed = datetime.datetime.now().replace(microsecond=0) - a
print(elapsed)

0:00:00
