In [1]:
import datetime, measures, mmi_txt_to_cui, relatedness_pipeline, umls_tables_processing, utils
import numpy as np
import os
import pandas as pd

from gensim.models import KeyedVectors, Word2Vec
from gensim.test.utils import datapath

0:00:11


### Relationships evaluation

Specific relationships

In [2]:
n_rela = umls_tables_processing.count_relationships()

844
0:00:50


General relationships

In [3]:
n_rel = umls_tables_processing.count_relationships(rel_type = 'REL')

10
0:00:48


In [4]:
n_rel

{'AQ', 'CHD', 'PAR', 'QB', 'RB', 'RN', 'RO', 'RQ', 'SIB', 'SY'}

### Constants

In [5]:
PATH_EMBEDDINGS = './Embeddings'

### Extracting all the labels for each CUI of the UMLS vocabulary

Creation of a python dictionary corresponding to UMLS vocabulary.

Our dictionary has *CUIs* as keys and *label concepts* as values 

`all_labels = True` is a switch for considering all the labels for each concept-*CUI*

In [6]:
dict_conso = umls_tables_processing.cui_strings(all_labels = True)

0:00:31


In [7]:
dict_conso['C1963065']

['Apnea, CTCAE', 'Apnea']

`all_labels = False` selects only the best-ranked UMLS label for each concept-*CUI*.

In [8]:
dict_conso_pref = umls_tables_processing.cui_strings(all_labels = False)

0:00:27


In [9]:
print('The dimension of the two dictionaries is the same: ' + str(len(dict_conso) == len(dict_conso_pref)))
print('The two dictionaries have a dimension of ' +str(len(dict_conso))+' elements'+'\n')

The dimension of the two dictionaries is the same: True
The two dictionaries have a dimension of 3772450 elements



# Building seed

#### Extracting all the CUIs related to COPD (only one hop) and its relationships

The connection to `COPD` can be bi-directional: `concept_x` --> `COPD` and also `COPD` --> `concept_x`

This behaviour is enabled by the switch `two_way = True`. Only the `CUIs` are extracted.

If the corresponding labels are also wanted, set `extract_labels = True`.

In [10]:
copd_dict = umls_tables_processing.concepts_related_to_concept(two_way = True, extract_labels = False )
copd_cuis = list(copd_dict.keys())

Building seed time: 0:00:50


## ***seed_rel***

In [11]:
seed_rel, _ = umls_tables_processing.extracting_strings(copd_cuis)

Time for extracting labels: 0:00:00


#### Check on the relationships proportions in the list: `number of CUI with particular relations / total number of found CUIs`

In [9]:
# Re-extract the COPD neighbourhood with polishing_rels = True and print
# "<CUIs whose relation collection is non-empty>/<all CUIs found>".
# A descriptive name is used instead of `_`, which IPython reserves for the last cell output.
copd_polished_rels = umls_tables_processing.concepts_related_to_concept(two_way = True, polishing_rels = True, extract_labels = False )
n_with_relations = sum(1 for rels in copd_polished_rels.values() if len(rels) > 0)
print(str(n_with_relations)+'/'+str(len(copd_polished_rels)))

Relation '' discarded 
101/256


#### Check which concepts have the empty relation

In [7]:
# Extract the COPD neighbourhood without polishing the relations and print
# "<CUIs whose relation collection contains the empty relation ''>/<all CUIs found>".
# A descriptive name is used instead of `_`, which IPython reserves for the last cell output.
copd_raw_rels = umls_tables_processing.concepts_related_to_concept(two_way = True, polishing_rels = False, extract_labels = False )
n_with_empty_relation = sum(1 for rels in copd_raw_rels.values() if '' in rels)
print(str(n_with_empty_relation)+'/'+str(len(copd_raw_rels)))

0:01:10
177/256


### Check the semantic types contained in the *seed_rel*

In [17]:
eval_rel = umls_tables_processing.extracting_stys(copd_cuis)
t = mmi_txt_to_cui.check_sty_mmi(mmi_txt_to_cui.convert_sty_stymmi(eval_rel))
print({k: j for k,j in list(t.items())[:5]})

0:07:59
{'dsyn': (105, 'Disease or Syndrome'), 'ftcn': (25, 'Functional Concept'), 'patf': (12, 'Pathologic Function'), 'hlca': (12, 'Health Care Activity'), 'topp': (11, 'Therapeutic or Preventive Procedure')}


## ***seed_paper***

In [12]:
paper_cuis = mmi_txt_to_cui.mmi_to_cui(sty = True)

0:00:00


In [13]:
seed_paper, _ = umls_tables_processing.extracting_strings([i[0] for i in paper_cuis])

Time for extracting labels: 0:00:00


### Check the semantic types contained in the *seed_paper*

In [20]:
a = mmi_txt_to_cui.check_sty_mmi(paper_cuis)
print({k: j for k,j in list(a.items())[:5]})

{'qlco': (49, 'Qualitative Concept'), 'fndg': (38, 'Finding'), 'inpr': (35, 'Intellectual Product'), 'qnco': (34, 'Quantitative Concept'), 'ftcn': (30, 'Functional Concept')}


### Check how many concepts are shared by the two previous seeds

In [21]:
print('Length seed_rel: ' + str(len(seed_rel)))
print('Length seed_paper: ' + str(len(seed_paper)))

Length seed_rel: 256
Length seed_paper: 399


In [22]:
print('The elements shared by two seeds are: ' + str(len(set(seed_rel.keys()).intersection(set(seed_paper.keys())))))

The elements shared by two seeds are: 18


# Building a union of the two seeds:

## ***seed_union***

In [14]:
seed_union = {**seed_rel, **seed_paper}

In [15]:
# Sanity check: the union holds every CUI exactly once, so its size must equal the sum of
# both seeds minus the CUIs they share (computed here instead of hard-coding the count).
n_shared_cuis = len(seed_rel.keys() & seed_paper.keys())
len(seed_union) == len(seed_rel) + len(seed_paper) - n_shared_cuis

True

## Building ***seed_paper_lite*** (DEPRECATED)

Since this section was implemented, MetaMap Lite has stopped working through the API service; given its poor mapping performance, the code has not been updated.

In [None]:
paper_lite_cuis, oov_lite = mmi_txt_to_cui.mmi_lite_freetext(sty = True)

In [None]:
paper_lite_cuis_ = list(set([i[0] for i in paper_lite_cuis]))

### Check the semantic types contained in the *seed_paper_lite*

In [14]:
c = mmi_txt_to_cui.check_sty_mmi(paper_lite_cuis)
print({k: j for k,j in list(c.items())[:5]})

{'inpr': (26, 'Intellectual Product'), 'qlco': (21, 'Qualitative Concept'), 'fndg': (20, 'Finding'), 'idcn': (16, 'Idea or Concept'), 'topp': (13, 'Therapeutic or Preventive Procedure')}


In [15]:
oov_lite

['[84–88].\n', '[92, 93]. \n']

In [None]:
seed_paper_lite, _ = umls_tables_processing.extracting_strings(paper_lite_cuis_)

## Loading Models

### Importing the embeddings file names

In [16]:
def _embedding_files(subdir):
    """Return the names of the regular files in PATH_EMBEDDINGS/<subdir>, skipping README.md."""
    # The context manager closes the directory iterator as soon as the listing is built.
    with os.scandir(PATH_EMBEDDINGS+'/'+subdir) as entries:
        return [entry.name for entry in entries if entry.is_file() and entry.name != 'README.md']


# Each tuple pairs a sub-directory tag with the embedding file names found inside it.
cuis = ('/cuis/', _embedding_files('cuis'))
words = ('/words/', _embedding_files('words'))
embeddings = [cuis, words]

### Creating a list of k values to test

In [17]:
ks = [5, 10, 20, 30, 40]
#ks = [5, 10]

### Creating a list of seed tuples

In [18]:
seeds = [('seed_rel', seed_rel), ('seed_paper', seed_paper), ('seed_union', seed_union)]

### Creating a dictionary of all the embeddings tested, over the seeds and k values.

Logger instance, for keeping track of processing and switch constants instance.

In [19]:
logger = utils.setup_custom_logger('myapp')
logger.info('Start\n')

max_k_switch = True

2022-04-02 18:45:54 INFO     Start



In [None]:
# Run the pipeline over all the embeddings, k values and seeds, with all_labels = False
# and the 'max' aggregation.
big_g = relatedness_pipeline.regular_ks_loop(embeddings,
                                             ks,
                                             seeds,
                                             logger,
                                             max_k_switch,
                                             all_labels = False,
                                             aggregation = 'max')

# Save the results under a name stamped with the current time (microseconds dropped).
# A descriptive name replaces `a`, which an earlier cell already uses for the semantic-type counts.
run_timestamp = datetime.datetime.now().replace(microsecond=0)
utils.inputs_save(big_g, 'Utilities/big_g'+str(run_timestamp))

## Support method, for DataFrame visualization

In [54]:
def table(big_g, ks):
    """Build one results DataFrame per requested value of k.

    Parameters
    ----------
    big_g : dict
        Nested results indexed as ``big_g[embedding_name][k][seed_name]``; each
        leaf is a sequence whose items 0-5 are rendered under the columns
        pos_dcg, neg_dcg, perc_dcg, iov, oov and #seed, in that order.
    ks : iterable
        Second-level keys of ``big_g`` to tabulate (e.g. ``5``, ``10``, ``'max_k'``).

    Returns
    -------
    list of pandas.DataFrame
        One frame per element of ``ks``, with one row per embedding name and a
        three-level column header (k, seed name, metric name). Cells are
        strings: the first three metrics formatted with ``'%.3f'``, the last
        three with ``'%d'``.

    Notes
    -----
    The table shape is derived from ``big_g`` itself, so any number of
    embeddings and seeds is supported (the previous version only worked for
    exactly 11 embeddings and 3 seeds).
    """
    metrics = ['pos_dcg', 'neg_dcg', 'perc_dcg', 'iov', 'oov', '#seed']
    formats = ['%.3f', '%.3f', '%.3f', '%d', '%d', '%d']
    names = list(big_g.keys())

    df_ks = []
    for k in ks:
        # The seed names are read from the first embedding; every embedding is
        # indexed with the same seed names below.
        seed_names = list(big_g[names[0]][k].keys())
        n_cols = len(seed_names) * len(metrics)

        # Three parallel arrays describing each column: k, seed, metric.
        header = [np.array([k] * n_cols),
                  np.repeat(seed_names, len(metrics)),
                  np.tile(metrics, len(seed_names))]

        # One formatted group of six cells per (embedding, seed) pair, in row-major order.
        cells = [[fmt % (big_g[name][k][seed][pos]) for pos, fmt in enumerate(formats)]
                 for name in names for seed in seed_names]

        df_ks.append(pd.DataFrame(np.reshape(cells, (len(names), n_cols)),
                                  index=np.array(names),
                                  columns=header))
    return df_ks



In [55]:
df_ks = table(big_g, [5,10,20,30,40, 'max_k'])

## Tables for each *k*

In [56]:
df_ks[0]

Unnamed: 0_level_0,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
Unnamed: 0_level_1,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_union,seed_union,seed_union,seed_union,seed_union,seed_union
Unnamed: 0_level_2,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed
claims_cuis_hs_300.txt,0.019,0.981,0.016,47,209,256,0.0,1.0,0.0,10,389,399,0.009,0.991,0.007,53,584,637
stanford_cuis_svd_300.txt,0.0,1.0,0.0,0,256,256,0.0,1.0,0.0,0,399,399,0.0,1.0,0.0,0,637,637
cui2vec_pretrained,0.045,0.955,0.041,136,120,256,0.055,0.945,0.055,185,214,399,0.056,0.944,0.055,309,328,637
DeVine_etal_200,0.044,0.956,0.041,120,136,256,0.053,0.947,0.048,173,226,399,0.052,0.948,0.047,281,356,637
PMC-w2v,0.022,0.978,0.016,119,137,256,0.083,0.917,0.072,352,47,399,0.062,0.938,0.053,458,179,637
Health_2.5mreviews.s200.w10.n5.v15.cbow,0.014,0.986,0.016,75,181,256,0.022,0.978,0.022,192,207,399,0.02,0.98,0.022,256,381,637
PubMed-and-PMC-w2v,0.033,0.967,0.028,120,136,256,0.082,0.918,0.074,353,46,399,0.065,0.935,0.058,460,177,637
GoogleNews-vectors-negative300,0.015,0.985,0.013,110,146,256,0.048,0.952,0.04,337,62,399,0.037,0.963,0.032,434,203,637
PubMed-w2v,0.03,0.97,0.026,120,136,256,0.089,0.911,0.077,353,46,399,0.068,0.932,0.059,460,177,637
tweetsvec,0.02,0.98,0.018,63,193,256,0.019,0.981,0.018,168,231,399,0.024,0.976,0.021,221,416,637


In [57]:
df_ks[1]

Unnamed: 0_level_0,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10
Unnamed: 0_level_1,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_union,seed_union,seed_union,seed_union,seed_union,seed_union
Unnamed: 0_level_2,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed
claims_cuis_hs_300.txt,0.014,0.986,0.01,47,209,256,0.0,1.0,0.0,10,389,399,0.007,0.993,0.005,53,584,637
stanford_cuis_svd_300.txt,0.0,1.0,0.0,0,256,256,0.0,1.0,0.0,0,399,399,0.0,1.0,0.0,0,637,637
cui2vec_pretrained,0.04,0.96,0.036,136,120,256,0.048,0.952,0.045,185,214,399,0.049,0.951,0.045,309,328,637
DeVine_etal_200,0.037,0.963,0.032,120,136,256,0.04,0.96,0.032,173,226,399,0.042,0.958,0.035,281,356,637
PMC-w2v,0.018,0.982,0.013,119,137,256,0.068,0.932,0.055,352,47,399,0.051,0.949,0.042,458,179,637
Health_2.5mreviews.s200.w10.n5.v15.cbow,0.013,0.987,0.014,75,181,256,0.02,0.98,0.019,192,207,399,0.019,0.981,0.019,256,381,637
PubMed-and-PMC-w2v,0.026,0.974,0.021,120,136,256,0.064,0.936,0.053,353,46,399,0.052,0.948,0.043,460,177,637
GoogleNews-vectors-negative300,0.012,0.988,0.011,110,146,256,0.036,0.964,0.028,337,62,399,0.029,0.971,0.023,434,203,637
PubMed-w2v,0.025,0.975,0.021,120,136,256,0.067,0.933,0.052,353,46,399,0.053,0.947,0.041,460,177,637
tweetsvec,0.016,0.984,0.013,63,193,256,0.019,0.981,0.018,168,231,399,0.022,0.978,0.02,221,416,637


In [58]:
df_ks[2]

Unnamed: 0_level_0,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20
Unnamed: 0_level_1,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_union,seed_union,seed_union,seed_union,seed_union,seed_union
Unnamed: 0_level_2,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed
claims_cuis_hs_300.txt,0.011,0.989,0.008,47,209,256,0.0,1.0,0.0,10,389,399,0.006,0.994,0.004,53,584,637
stanford_cuis_svd_300.txt,0.0,1.0,0.0,0,256,256,0.0,1.0,0.0,0,399,399,0.0,1.0,0.0,0,637,637
cui2vec_pretrained,0.035,0.965,0.031,136,120,256,0.039,0.961,0.033,185,214,399,0.042,0.958,0.037,309,328,637
DeVine_etal_200,0.034,0.966,0.031,120,136,256,0.032,0.968,0.025,173,226,399,0.036,0.964,0.03,281,356,637
PMC-w2v,0.016,0.984,0.013,119,137,256,0.055,0.945,0.044,352,47,399,0.043,0.957,0.035,458,179,637
Health_2.5mreviews.s200.w10.n5.v15.cbow,0.013,0.987,0.013,75,181,256,0.018,0.982,0.016,192,207,399,0.018,0.982,0.017,256,381,637
PubMed-and-PMC-w2v,0.02,0.98,0.015,120,136,256,0.05,0.95,0.038,353,46,399,0.04,0.96,0.031,460,177,637
GoogleNews-vectors-negative300,0.01,0.99,0.009,110,146,256,0.027,0.973,0.019,337,62,399,0.022,0.978,0.016,434,203,637
PubMed-w2v,0.02,0.98,0.016,120,136,256,0.053,0.947,0.04,353,46,399,0.042,0.958,0.032,460,177,637
tweetsvec,0.014,0.986,0.012,63,193,256,0.016,0.984,0.015,168,231,399,0.02,0.98,0.018,221,416,637


In [59]:
df_ks[3]

Unnamed: 0_level_0,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30
Unnamed: 0_level_1,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_union,seed_union,seed_union,seed_union,seed_union,seed_union
Unnamed: 0_level_2,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed
claims_cuis_hs_300.txt,0.011,0.989,0.008,47,209,256,0.0,1.0,0.0,10,389,399,0.005,0.995,0.004,53,584,637
stanford_cuis_svd_300.txt,0.0,1.0,0.0,0,256,256,0.0,1.0,0.0,0,399,399,0.0,1.0,0.0,0,637,637
cui2vec_pretrained,0.033,0.967,0.03,136,120,256,0.036,0.964,0.03,185,214,399,0.039,0.961,0.035,309,328,637
DeVine_etal_200,0.032,0.968,0.028,120,136,256,0.028,0.972,0.021,173,226,399,0.032,0.968,0.027,281,356,637
PMC-w2v,0.014,0.986,0.011,119,137,256,0.049,0.951,0.038,352,47,399,0.038,0.962,0.031,458,179,637
Health_2.5mreviews.s200.w10.n5.v15.cbow,0.012,0.988,0.012,75,181,256,0.016,0.984,0.015,192,207,399,0.017,0.983,0.016,256,381,637
PubMed-and-PMC-w2v,0.017,0.983,0.012,120,136,256,0.043,0.957,0.031,353,46,399,0.034,0.966,0.026,460,177,637
GoogleNews-vectors-negative300,0.009,0.991,0.007,110,146,256,0.023,0.977,0.016,337,62,399,0.019,0.981,0.014,434,203,637
PubMed-w2v,0.017,0.983,0.013,120,136,256,0.045,0.955,0.033,353,46,399,0.036,0.964,0.027,460,177,637
tweetsvec,0.012,0.988,0.01,63,193,256,0.015,0.985,0.014,168,231,399,0.018,0.982,0.016,221,416,637


In [60]:
df_ks[4]

Unnamed: 0_level_0,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40
Unnamed: 0_level_1,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_union,seed_union,seed_union,seed_union,seed_union,seed_union
Unnamed: 0_level_2,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed
claims_cuis_hs_300.txt,0.01,0.99,0.008,47,209,256,0.0,1.0,0.0,10,389,399,0.005,0.995,0.004,53,584,637
stanford_cuis_svd_300.txt,0.0,1.0,0.0,0,256,256,0.0,1.0,0.0,0,399,399,0.0,1.0,0.0,0,637,637
cui2vec_pretrained,0.032,0.968,0.029,136,120,256,0.032,0.968,0.027,185,214,399,0.036,0.964,0.032,309,328,637
DeVine_etal_200,0.029,0.971,0.026,120,136,256,0.025,0.975,0.019,173,226,399,0.029,0.971,0.024,281,356,637
PMC-w2v,0.012,0.988,0.009,119,137,256,0.045,0.955,0.035,352,47,399,0.035,0.965,0.028,458,179,637
Health_2.5mreviews.s200.w10.n5.v15.cbow,0.011,0.989,0.01,75,181,256,0.015,0.985,0.013,192,207,399,0.016,0.984,0.014,256,381,637
PubMed-and-PMC-w2v,0.015,0.985,0.01,120,136,256,0.038,0.962,0.027,353,46,399,0.031,0.969,0.022,460,177,637
GoogleNews-vectors-negative300,0.008,0.992,0.006,110,146,256,0.02,0.98,0.014,337,62,399,0.017,0.983,0.012,434,203,637
PubMed-w2v,0.015,0.985,0.011,120,136,256,0.04,0.96,0.029,353,46,399,0.032,0.968,0.024,460,177,637
tweetsvec,0.012,0.988,0.01,63,193,256,0.014,0.986,0.012,168,231,399,0.017,0.983,0.015,221,416,637


### `max_k` corresponds to `|IV|` or all the seed elements inside the vocabulary

In [61]:
df_ks[5]

Unnamed: 0_level_0,max_k,max_k,max_k,max_k,max_k,max_k,max_k,max_k,max_k,max_k,max_k,max_k,max_k,max_k,max_k,max_k,max_k,max_k
Unnamed: 0_level_1,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_union,seed_union,seed_union,seed_union,seed_union,seed_union
Unnamed: 0_level_2,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed
claims_cuis_hs_300.txt,0.01,0.99,0.008,47,209,256,0.0,1.0,0.0,10,389,399,0.005,0.995,0.004,53,584,637
stanford_cuis_svd_300.txt,0.0,1.0,0.0,1,256,256,0.0,1.0,0.0,1,399,399,0.0,1.0,0.0,1,637,637
cui2vec_pretrained,0.023,0.977,0.02,136,120,256,0.019,0.981,0.015,185,214,399,0.02,0.98,0.017,309,328,637
DeVine_etal_200,0.021,0.979,0.018,120,136,256,0.014,0.986,0.01,173,226,399,0.014,0.986,0.011,281,356,637
PMC-w2v,0.007,0.993,0.005,119,137,256,0.021,0.979,0.017,352,47,399,0.015,0.985,0.012,458,179,637
Health_2.5mreviews.s200.w10.n5.v15.cbow,0.008,0.992,0.007,75,181,256,0.01,0.99,0.009,192,207,399,0.009,0.991,0.008,256,381,637
PubMed-and-PMC-w2v,0.009,0.991,0.006,120,136,256,0.015,0.985,0.011,353,46,399,0.011,0.989,0.008,460,177,637
GoogleNews-vectors-negative300,0.005,0.995,0.004,110,146,256,0.006,0.994,0.004,337,62,399,0.005,0.995,0.003,434,203,637
PubMed-w2v,0.009,0.991,0.006,120,136,256,0.015,0.985,0.01,353,46,399,0.01,0.99,0.007,460,177,637
tweetsvec,0.011,0.989,0.01,63,193,256,0.009,0.991,0.008,168,231,399,0.012,0.988,0.01,221,416,637


### An analysis of the embeddings cardinality is performed:

In [47]:
cardinality_vemb = relatedness_pipeline.cardinality_embeddings()

0:10:24


In [48]:
cardinality_vemb

{'claims_cuis_hs_300.txt': 14852,
 'stanford_cuis_svd_300.txt': 22705,
 'cui2vec_pretrained': 109053,
 'DeVine_etal_200': 52102,
 'PMC-w2v': 2515686,
 'Health_2.5mreviews.s200.w10.n5.v15.cbow': 73644,
 'PubMed-and-PMC-w2v': 4087446,
 'GoogleNews-vectors-negative300': 3000000,
 'PubMed-w2v': 2351706,
 'tweetsvec': 26278,
 'wikipedia-pubmed-and-PMC-w2v': 5443656}

In [57]:
# Save the cardinalities computed above; the original passed `tmp`, which is not
# defined at notebook level and would fail on a fresh kernel.
utils.inputs_save(cardinality_vemb, 'Utilities/cardinality_vembs')

The previous tables show the results stored in the file: `Utilities/big_g2022-04-02 21:45:19.pickle`.

The heuristic chosen for picking the representative label of each concept is `max`, i.e. the label ranked best by UMLS.

In [None]:
# Run the pipeline over all the embeddings, k values and seeds, with all_labels = False
# and the 'med' aggregation.
# NOTE(review): the 'med' tables below are identical to the 'max' tables; with
# all_labels = False there is presumably a single label per concept, so both
# aggregations may coincide. Confirm whether all_labels = True was intended here.
big_g_med = relatedness_pipeline.regular_ks_loop(embeddings,
                                             ks,
                                             seeds,
                                             logger,
                                             max_k_switch,
                                             all_labels = False,
                                             aggregation = 'med')

# Save the results under a name stamped with the current time (microseconds dropped).
# A descriptive name replaces `a`, which an earlier cell already uses for the semantic-type counts.
run_timestamp_med = datetime.datetime.now().replace(microsecond=0)
utils.inputs_save(big_g_med, 'Utilities/big_g_medoid_'+str(run_timestamp_med))

In [62]:
df_ks_med = table(big_g_med, [5,10,20,30,40, 'max_k'])

In [63]:
df_ks_med[0]

Unnamed: 0_level_0,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
Unnamed: 0_level_1,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_union,seed_union,seed_union,seed_union,seed_union,seed_union
Unnamed: 0_level_2,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed
claims_cuis_hs_300.txt,0.019,0.981,0.016,47,209,256,0.0,1.0,0.0,10,389,399,0.009,0.991,0.007,53,584,637
stanford_cuis_svd_300.txt,0.0,1.0,0.0,0,256,256,0.0,1.0,0.0,0,399,399,0.0,1.0,0.0,0,637,637
cui2vec_pretrained,0.045,0.955,0.041,136,120,256,0.055,0.945,0.055,185,214,399,0.056,0.944,0.055,309,328,637
DeVine_etal_200,0.044,0.956,0.041,120,136,256,0.053,0.947,0.048,173,226,399,0.052,0.948,0.047,281,356,637
PMC-w2v,0.022,0.978,0.016,119,137,256,0.083,0.917,0.072,352,47,399,0.062,0.938,0.053,458,179,637
Health_2.5mreviews.s200.w10.n5.v15.cbow,0.014,0.986,0.016,75,181,256,0.022,0.978,0.022,192,207,399,0.02,0.98,0.022,256,381,637
PubMed-and-PMC-w2v,0.033,0.967,0.028,120,136,256,0.082,0.918,0.074,353,46,399,0.065,0.935,0.058,460,177,637
GoogleNews-vectors-negative300,0.015,0.985,0.013,110,146,256,0.048,0.952,0.04,337,62,399,0.037,0.963,0.032,434,203,637
PubMed-w2v,0.03,0.97,0.026,120,136,256,0.089,0.911,0.077,353,46,399,0.068,0.932,0.059,460,177,637
tweetsvec,0.02,0.98,0.018,63,193,256,0.019,0.981,0.018,168,231,399,0.024,0.976,0.021,221,416,637


In [64]:
df_ks_med[1]

Unnamed: 0_level_0,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10
Unnamed: 0_level_1,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_union,seed_union,seed_union,seed_union,seed_union,seed_union
Unnamed: 0_level_2,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed
claims_cuis_hs_300.txt,0.014,0.986,0.01,47,209,256,0.0,1.0,0.0,10,389,399,0.007,0.993,0.005,53,584,637
stanford_cuis_svd_300.txt,0.0,1.0,0.0,0,256,256,0.0,1.0,0.0,0,399,399,0.0,1.0,0.0,0,637,637
cui2vec_pretrained,0.04,0.96,0.036,136,120,256,0.048,0.952,0.045,185,214,399,0.049,0.951,0.045,309,328,637
DeVine_etal_200,0.037,0.963,0.032,120,136,256,0.04,0.96,0.032,173,226,399,0.042,0.958,0.035,281,356,637
PMC-w2v,0.018,0.982,0.013,119,137,256,0.068,0.932,0.055,352,47,399,0.051,0.949,0.042,458,179,637
Health_2.5mreviews.s200.w10.n5.v15.cbow,0.013,0.987,0.014,75,181,256,0.02,0.98,0.019,192,207,399,0.019,0.981,0.019,256,381,637
PubMed-and-PMC-w2v,0.026,0.974,0.021,120,136,256,0.064,0.936,0.053,353,46,399,0.052,0.948,0.043,460,177,637
GoogleNews-vectors-negative300,0.012,0.988,0.011,110,146,256,0.036,0.964,0.028,337,62,399,0.029,0.971,0.023,434,203,637
PubMed-w2v,0.025,0.975,0.021,120,136,256,0.067,0.933,0.052,353,46,399,0.053,0.947,0.041,460,177,637
tweetsvec,0.016,0.984,0.013,63,193,256,0.019,0.981,0.018,168,231,399,0.022,0.978,0.02,221,416,637


In [65]:
df_ks_med[2]

Unnamed: 0_level_0,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20
Unnamed: 0_level_1,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_union,seed_union,seed_union,seed_union,seed_union,seed_union
Unnamed: 0_level_2,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed
claims_cuis_hs_300.txt,0.011,0.989,0.008,47,209,256,0.0,1.0,0.0,10,389,399,0.006,0.994,0.004,53,584,637
stanford_cuis_svd_300.txt,0.0,1.0,0.0,0,256,256,0.0,1.0,0.0,0,399,399,0.0,1.0,0.0,0,637,637
cui2vec_pretrained,0.035,0.965,0.031,136,120,256,0.039,0.961,0.033,185,214,399,0.042,0.958,0.037,309,328,637
DeVine_etal_200,0.034,0.966,0.031,120,136,256,0.032,0.968,0.025,173,226,399,0.036,0.964,0.03,281,356,637
PMC-w2v,0.016,0.984,0.013,119,137,256,0.055,0.945,0.044,352,47,399,0.043,0.957,0.035,458,179,637
Health_2.5mreviews.s200.w10.n5.v15.cbow,0.013,0.987,0.013,75,181,256,0.018,0.982,0.016,192,207,399,0.018,0.982,0.017,256,381,637
PubMed-and-PMC-w2v,0.02,0.98,0.015,120,136,256,0.05,0.95,0.038,353,46,399,0.04,0.96,0.031,460,177,637
GoogleNews-vectors-negative300,0.01,0.99,0.009,110,146,256,0.027,0.973,0.019,337,62,399,0.022,0.978,0.016,434,203,637
PubMed-w2v,0.02,0.98,0.016,120,136,256,0.053,0.947,0.04,353,46,399,0.042,0.958,0.032,460,177,637
tweetsvec,0.014,0.986,0.012,63,193,256,0.016,0.984,0.015,168,231,399,0.02,0.98,0.018,221,416,637


In [66]:
df_ks_med[3]

Unnamed: 0_level_0,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30
Unnamed: 0_level_1,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_union,seed_union,seed_union,seed_union,seed_union,seed_union
Unnamed: 0_level_2,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed
claims_cuis_hs_300.txt,0.011,0.989,0.008,47,209,256,0.0,1.0,0.0,10,389,399,0.005,0.995,0.004,53,584,637
stanford_cuis_svd_300.txt,0.0,1.0,0.0,0,256,256,0.0,1.0,0.0,0,399,399,0.0,1.0,0.0,0,637,637
cui2vec_pretrained,0.033,0.967,0.03,136,120,256,0.036,0.964,0.03,185,214,399,0.039,0.961,0.035,309,328,637
DeVine_etal_200,0.032,0.968,0.028,120,136,256,0.028,0.972,0.021,173,226,399,0.032,0.968,0.027,281,356,637
PMC-w2v,0.014,0.986,0.011,119,137,256,0.049,0.951,0.038,352,47,399,0.038,0.962,0.031,458,179,637
Health_2.5mreviews.s200.w10.n5.v15.cbow,0.012,0.988,0.012,75,181,256,0.016,0.984,0.015,192,207,399,0.017,0.983,0.016,256,381,637
PubMed-and-PMC-w2v,0.017,0.983,0.012,120,136,256,0.043,0.957,0.031,353,46,399,0.034,0.966,0.026,460,177,637
GoogleNews-vectors-negative300,0.009,0.991,0.007,110,146,256,0.023,0.977,0.016,337,62,399,0.019,0.981,0.014,434,203,637
PubMed-w2v,0.017,0.983,0.013,120,136,256,0.045,0.955,0.033,353,46,399,0.036,0.964,0.027,460,177,637
tweetsvec,0.012,0.988,0.01,63,193,256,0.015,0.985,0.014,168,231,399,0.018,0.982,0.016,221,416,637


In [67]:
df_ks_med[4]

Unnamed: 0_level_0,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40
Unnamed: 0_level_1,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_union,seed_union,seed_union,seed_union,seed_union,seed_union
Unnamed: 0_level_2,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed
claims_cuis_hs_300.txt,0.01,0.99,0.008,47,209,256,0.0,1.0,0.0,10,389,399,0.005,0.995,0.004,53,584,637
stanford_cuis_svd_300.txt,0.0,1.0,0.0,0,256,256,0.0,1.0,0.0,0,399,399,0.0,1.0,0.0,0,637,637
cui2vec_pretrained,0.032,0.968,0.029,136,120,256,0.032,0.968,0.027,185,214,399,0.036,0.964,0.032,309,328,637
DeVine_etal_200,0.029,0.971,0.026,120,136,256,0.025,0.975,0.019,173,226,399,0.029,0.971,0.024,281,356,637
PMC-w2v,0.012,0.988,0.009,119,137,256,0.045,0.955,0.035,352,47,399,0.035,0.965,0.028,458,179,637
Health_2.5mreviews.s200.w10.n5.v15.cbow,0.011,0.989,0.01,75,181,256,0.015,0.985,0.013,192,207,399,0.016,0.984,0.014,256,381,637
PubMed-and-PMC-w2v,0.015,0.985,0.01,120,136,256,0.038,0.962,0.027,353,46,399,0.031,0.969,0.022,460,177,637
GoogleNews-vectors-negative300,0.008,0.992,0.006,110,146,256,0.02,0.98,0.014,337,62,399,0.017,0.983,0.012,434,203,637
PubMed-w2v,0.015,0.985,0.011,120,136,256,0.04,0.96,0.029,353,46,399,0.032,0.968,0.024,460,177,637
tweetsvec,0.012,0.988,0.01,63,193,256,0.014,0.986,0.012,168,231,399,0.017,0.983,0.015,221,416,637


In [68]:
df_ks_med[5]

Unnamed: 0_level_0,max_k,max_k,max_k,max_k,max_k,max_k,max_k,max_k,max_k,max_k,max_k,max_k,max_k,max_k,max_k,max_k,max_k,max_k
Unnamed: 0_level_1,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_rel,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_paper,seed_union,seed_union,seed_union,seed_union,seed_union,seed_union
Unnamed: 0_level_2,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed,pos_dcg,neg_dcg,perc_dcg,iov,oov,#seed
claims_cuis_hs_300.txt,0.01,0.99,0.008,47,209,256,0.0,1.0,0.0,10,389,399,0.005,0.995,0.004,53,584,637
stanford_cuis_svd_300.txt,0.0,1.0,0.0,1,256,256,0.0,1.0,0.0,1,399,399,0.0,1.0,0.0,1,637,637
cui2vec_pretrained,0.023,0.977,0.02,136,120,256,0.019,0.981,0.015,185,214,399,0.02,0.98,0.017,309,328,637
DeVine_etal_200,0.021,0.979,0.018,120,136,256,0.014,0.986,0.01,173,226,399,0.014,0.986,0.011,281,356,637
PMC-w2v,0.007,0.993,0.005,119,137,256,0.021,0.979,0.017,352,47,399,0.015,0.985,0.012,458,179,637
Health_2.5mreviews.s200.w10.n5.v15.cbow,0.008,0.992,0.007,75,181,256,0.01,0.99,0.009,192,207,399,0.009,0.991,0.008,256,381,637
PubMed-and-PMC-w2v,0.009,0.991,0.006,120,136,256,0.015,0.985,0.011,353,46,399,0.011,0.989,0.008,460,177,637
GoogleNews-vectors-negative300,0.005,0.995,0.004,110,146,256,0.006,0.994,0.004,337,62,399,0.005,0.995,0.003,434,203,637
PubMed-w2v,0.009,0.991,0.006,120,136,256,0.015,0.985,0.01,353,46,399,0.01,0.99,0.007,460,177,637
tweetsvec,0.011,0.989,0.01,63,193,256,0.009,0.991,0.008,168,231,399,0.012,0.988,0.01,221,416,637


The previous tables show the results stored in the file: `Utilities/big_g_medoid_2022-04-02 23:59:58.pickle`.

The heuristic chosen for picking the representative label of each concept is `med`, i.e. the medoid among all the in-vocabulary labels of that concept, where distances are computed with the *cosine distance*.