In [1]:
import analogy_pipeline as ap
from colors import colors
import data_visualization as dv
import datetime
import measures, umls_tables_processing, utils
from collections import defaultdict

from gensim.models import KeyedVectors, Word2Vec
from gensim.test.utils import datapath

import matplotlib
import matplotlib.cm as cm
import matplotlib.pyplot as plt

import numpy as np
import os
import pandas as pd


0:00:18
0:00:00


Matplotlib created a temporary config/cache directory at /tmp/matplotlib-lvtst66x because the default path (/home/salvatore/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


0:00:00


### Processing the MRREL table: exploring the relationships and evaluating which ones are the most suitable

In [37]:
# Count the `RELA` relationships; the return value is bound to `_` so that
# the cell only shows what the function prints and does not echo the result.
_ = umls_tables_processing.count_relationships(rel_type = 'RELA')

844
0:01:06


This is the number of distinct `RELA` relationships in the whole UMLS: they correspond to the specific relationships.

In [3]:
# Count the `REL` relationships; the call is the last expression of the cell,
# so its return value is echoed below the printed count.
umls_tables_processing.count_relationships(rel_type = 'REL')

10
0:02:41


{'AQ', 'CHD', 'PAR', 'QB', 'RB', 'RN', 'RO', 'RQ', 'SIB', 'SY'}

These are the `REL` relationships and their total number: they correspond to generic relationships.

The following lines tell us which relationships link the 256 elements of the seed to the concept `COPD`: this is an evolution of the original seed-building method.

In [4]:
# Relationships linking the seed concepts to the reference concept, taken in
# both directions. NOTE(review): switch_key='con' presumably keys the result
# by concept -- confirm in umls_tables_processing.
double_rel = umls_tables_processing.concepts_related_to_concept(
    two_way=True,
    polishing_rels=True,
    switch_key='con',
)

Relation '' discarded 


In [5]:
# Same extraction as for the two-way case, but restricted to one direction
# (two_way=False).
single_rel = umls_tables_processing.concepts_related_to_concept(
    two_way=False,
    polishing_rels=True,
    switch_key='con',
)

Relation '' discarded 


The two cells above distinguish the two-way relationships from the one-way ones.

#### Double ways

In [6]:
# Distinct relationship names found across all the values of `double_rel`.
# A descriptive name is used instead of `_`: in IPython `_` is the variable
# holding the last cell output, so rebinding it by hand hides that feature.
double_rel_names = {rel for rels in double_rel.values() for rel in rels}
print(double_rel_names)
print(len(double_rel_names))

{'disease_has_associated_gene', 'may_be_treated_by', 'disease_has_associated_anatomic_site', 'focus_of', 'is_associated_anatomic_site_of', 'fragments_for_synonyms_of', 'contraindicated_with_disease', 'has_fragments_for_synonyms', 'was_a', 'classified_as', 'same_as', 'has_finding_site', 'possibly_equivalent_to', 'inverse_was_a', 'inverse_isa', 'finding_site_of', 'has_associated_morphology', 'entry_version_of', 'see_from', 'has_course', 'has_cdrh_parent', 'has_focus', 'replaces', 'has_expanded_form', 'expanded_form_of', 'cdrh_parent_of', 'has_entry_version', 'associated_with_malfunction_of_gene_product', 'associated_morphology_of', 'subset_includes_concept', 'clinical_course_of', 'use', 'related_to', 'isa', 'used_for', 'gene_associated_with_disease', 'has_answer', 'has_contraindicated_drug', 'answer_to', 'has_associated_finding', 'replaced_by', 'has_clinical_course', 'mapped_from', 'concept_in_subset', 'may_treat', 'classifies', 'gene_product_malfunction_associated_with_disease', 'see', 

#### One way

In [7]:
# Distinct relationship names found across all the values of `single_rel`.
# A descriptive name is used instead of `_`: in IPython `_` is the variable
# holding the last cell output, so rebinding it by hand hides that feature.
single_rel_names = {rel for rels in single_rel.values() for rel in rels}
print(single_rel_names)
print(len(single_rel_names))

{'is_associated_anatomic_site_of', 'contraindicated_with_disease', 'has_fragments_for_synonyms', 'was_a', 'classified_as', 'same_as', 'possibly_equivalent_to', 'inverse_was_a', 'inverse_isa', 'finding_site_of', 'has_focus', 'entry_version_of', 'see_from', 'has_cdrh_parent', 'has_expanded_form', 'expanded_form_of', 'cdrh_parent_of', 'has_entry_version', 'associated_morphology_of', 'subset_includes_concept', 'clinical_course_of', 'use', 'related_to', 'isa', 'used_for', 'gene_associated_with_disease', 'has_answer', 'has_associated_finding', 'replaced_by', 'mapped_from', 'may_treat', 'gene_product_malfunction_associated_with_disease', 'see', 'mapped_to', 'has_manifestation', 'course_of'}
36


#### Building the seed for analogic evaluation: we obtain lists of CUIs for each relationship in which they're involved with COPD

In [4]:
# Seed keyed by relationship (switch_key='rel'), two-way, with labels extracted.
seed_analog_both = umls_tables_processing.concepts_related_to_concept(
    two_way=True,
    switch_key='rel',
    extract_labels=True,
)
# Number of entries in the seed
print(len(seed_analog_both))

0:02:15
256
Time for extracting labels: 0:00:00
Extracting time: 0:02:15
Building seed time: 0:03:37
54


In [4]:
# Seed keyed by relationship (switch_key='rel'), one-way, without label extraction.
seed_analog_one = umls_tables_processing.concepts_related_to_concept(
    two_way=False,
    switch_key='rel',
)
# Number of entries in the seed
print(len(seed_analog_one))

0:00:52
37


Instead of building the list of COPD-related CUIs from scratch, `seed_analog_one` is re-elaborated to return a list of concepts.

`seed_analog_one` is used simply because its labels have not been extracted, which makes it easier to handle.

In [6]:
# Collect the per-relationship groups of the one-way seed ...
tmp = list(seed_analog_one.values())
# ... and flatten them into a list of unique concepts.
concepts = list({concept for group in tmp for concept in group})

When counting the number of RELA relationships, the second approach returns one more relation: this is due to the presence of the empty relation `''`

The `''` relation is problematic because it contains plenty of key concepts for our case and it is closer to the notion of `relatedness`

The two sets L and K are built: the only constraint for the first is the presence of COPD-related relationships.

In [7]:
# Names of every relationship present in the two-way seed (its keys).
all_copd_relations = list(seed_analog_both)

### A subset of the 53 relationships extracted from the seed is selected

#### Used relationships (RELA)

They are chosen by hand

In [9]:
# Build a NEW list made of the hand-picked relationships plus the empty
# relation ''. Appending to `umls_tables_processing.USEFUL_RELA` through an
# alias would modify the module-level list itself: every later cell reading
# `umls_tables_processing.USEFUL_RELA` would then also see '', and each
# re-run of this cell would append one more ''.
useful_rela = list(umls_tables_processing.USEFUL_RELA) + ['']
useful_rela

['associated_finding_of',
 'associated_morphology_of',
 'associated_with_malfunction_of_gene_product',
 'clinical_course_of',
 'contraindicated_with_disease',
 'course_of',
 'disease_has_associated_anatomic_site',
 'disease_has_associated_gene',
 'finding_site_of',
 'gene_associated_with_disease',
 'gene_product_malfunction_associated_with_disease',
 'has_associated_finding',
 'has_associated_morphology',
 'has_clinical_course',
 'has_contraindicated_drug',
 'has_course',
 'has_finding_site',
 'has_manifestation',
 'is_associated_anatomic_site_of',
 'manifestation_of',
 'may_be_treated_by',
 'may_treat',
 '']

The experimentation was split between CUI-only and word-only embeddings: this was mainly a technicality to allow the computation to run over several sessions.

### Type of embeddings: choose between `both`, `cuis` or `words`

In [2]:
# Available choices for the type of embeddings to process.
embedding_type = [
    'both',
    'cuis',
    'words',
]

Our implementation allows the computation of `3CosAdd`, `3CosMul` and `PairDirection` according to the formulation by `Levy, Omer, and Yoav Goldberg. "Linguistic regularities in sparse and explicit word representations." Proceedings of the eighteenth conference on computational natural language learning. 2014.`

### Type of measures: choose between `all`, `add` for 3CosAdd, `mul` for 3CosMul and `pair` for PairDirection

In [3]:
# Measure key -> [measure function, numeric parameter].
# NOTE(review): the meaning of the second element (10 / 0.0001) is not visible
# in this notebook -- confirm against `measures.analogy_compute`.
metrics = dict(
    add=[measures.cos3add, 10],
    mul=[measures.cos3mul, 0.0001],
    pair=[measures.pair_direction, 0.0001],
)

Starting from one seed list (we chose the `COPD` seed), a set is built of all the pairs in which at least one of the two elements is inside the seed. The pairs have to be linked by one of the relationships in `umls_tables_processing.USEFUL_RELA`

### Instantiate `W_umls` analogy seed

In [4]:
# CUIs 
concepts = umls_tables_processing.concepts_related_to_concept(concept = umls_tables_processing.COPD, two_way = True)
W_umls = umls_tables_processing.count_pairs(umls_tables_processing.USEFUL_RELA, cuis_list = concepts)

Building seed time: 0:00:58
22
Building pairs set time: 0:01:28


`L_umls` temporary variable

In [5]:
# `L_umls` is bound to the very same object as `W_umls` (no copy is made),
# so an in-place change made through one name is visible through the other.
L_umls = W_umls

### Constants

In [6]:
# Root folder of the embeddings (relative to the notebook's working directory).
PATH_EMBEDDINGS = './Embeddings'
# Container for the analogy results, filled per embedding name further below.
analog_comp_dict = dict()

#### Loading embedding

In [7]:
def _embedding_files(subdir):
    """Return the names of the regular files found in PATH_EMBEDDINGS/<subdir>.

    The 'README.md' file is skipped. Names are returned in the order yielded
    by `os.scandir`. The directory iterator is used as a context manager so
    that it is closed as soon as the listing is complete.
    """
    with os.scandir(PATH_EMBEDDINGS + '/' + subdir) as entries:
        # logical `and` instead of the bitwise `&` on booleans
        return [f.name for f in entries if f.is_file() and f.name != 'README.md']


# Each entry is (sub-path inside PATH_EMBEDDINGS, list of embedding file names).
cuis = ('/cuis/', _embedding_files('cuis'))
words = ('/words/', _embedding_files('words'))
embeddings = [cuis, words]

#### Check `analogy_pipeline.py`: multiprocessing implementation, for simultaneous loading and evaluation of embeddings

What follows is only a toy example of our analogical assessment pipeline, because running the whole pipeline in a Jupyter notebook has a high computational cost and takes a long time.

The following lines of code are taken from `analogy_pipeline.py` and they correspond to the method `analog_loop`

For assessing the procedure, only the embedding `claims_cuis` from `Choi, Youngduck, Chill Yi-I. Chiu, and David Sontag. "Learning low-dimensional representations of medical concepts." AMIA Summits on Translational Science Proceedings 2016 (2016): 41.` is considered

In [9]:
# File name of the single embedding used in this toy example
emb = 'claims_cuis_hs_300.txt.gz'
# Location of that file relative to PATH_EMBEDDINGS (CUI embeddings sub-folder)
path = '/cuis/' + emb

Note: the following lines of code correspond to the processing of CUI embeddings. See `analog_loop` in the `analogy_pipeline.py` script for further processing details.

In [28]:
def _now():
    """Current time truncated to whole seconds (used for coarse timing)."""
    return datetime.datetime.now().replace(microsecond=0)


# The embedding is identified by its file name
name = emb

# Read the vectors in word2vec format; binary mode only for '.bin' files
model = KeyedVectors.load_word2vec_format(PATH_EMBEDDINGS+path, binary=emb.endswith('.bin'))
print('\n\n The name of embedding is: %s\n' % name)

# Result containers for this embedding
analog_comp_dict[name] = {}
dict_t = {name: {}}

# One analogy evaluation per relationship
for rela in umls_tables_processing.USEFUL_RELA:
    print('\n The RELA is: %s\n' % rela)

    # Start of the timing for this relationship
    c = _now()

    # Pair sets for this relationship as returned by `measures.k_n_l_iov`
    l0, k0 = measures.k_n_l_iov(
        L_umls[rela],
        W_umls[rela],
        model,
        logger=None,
        emb_type='cui',
    )

    # Run the analogy measures and keep the outcome under this relationship
    tmp = measures.analogy_compute(
        l0,
        k0,
        model,
        metrics,
        logger=None,
        emb_type='cui',
    )
    dict_t[name][rela] = tmp

    # Report how long this relationship took
    elapsed = _now() - c
    print('The time for RELA ' + str(rela) + ', for embedding '+str(name)+' is '+str(elapsed))




 The name of embedding is: claims_cuis_hs_300.txt


 The RELA is: associated_finding_of

(2, 437)
L=k
0:00:00
15
(10, 2)
(10, 2)
0:00:00
At couple number 1/10

0:00:00
At couple number 2/10

0:00:00
At couple number 3/10

0:00:00
At couple number 4/10

0:00:00
At couple number 5/10

0:00:00
At couple number 6/10

0:00:00
At couple number 7/10

0:00:00
At couple number 8/10

0:00:00
At couple number 9/10

0:00:00
At couple number 10/10

0:00:00
The time for RELA associated_finding_of, for embedding claims_cuis_hs_300.txt is 0:00:00

 The RELA is: associated_morphology_of

(2, 5253)
L=k
0:00:00
28
(24, 2)
(24, 2)
0:00:00
At couple number 2/24

0:00:00
At couple number 4/24

0:00:00
At couple number 6/24

0:00:00
At couple number 8/24

0:00:00
At couple number 10/24

0:00:00
At couple number 12/24

0:00:00
At couple number 14/24

0:00:00
At couple number 16/24

0:00:00
At couple number 18/24

0:00:00
At couple number 20/24

0:00:00
At couple number 22/24

0:00:00
At couple number 24/24


### Loading the vocabulary of UMLS: CUI as key and preferred label as value

In [12]:
# CUI -> label lookup; below it is indexed as dict_conso[cui][0] to get a label.
# NOTE(review): all_labels=False presumably keeps only the preferred label -- confirm.
dict_conso = umls_tables_processing.cui_strings(all_labels = False)


0:01:13


#### Showing relationships and couples: toy example

In [14]:
# Print only the first N_EXAMPLES pairs of the 'may_treat' relationship.
# The loop stops as soon as enough pairs have been shown instead of walking
# through the whole collection while printing nothing.
N_EXAMPLES = 10
for n, pair in enumerate(W_umls['may_treat']):
    if n >= N_EXAMPLES:
        break
    # pair[0] and pair[1] are CUIs; dict_conso[cui][0] is the label shown
    print(dict_conso[pair[0]][0] +' - may treat - '+ dict_conso[pair[1]][0])

epinephrine hydrochloride - may treat - Airway Obstruction
epinephrine - may treat - Airway Obstruction
epinephrine sulfate - may treat - Airway Obstruction
epinephryl borate - may treat - Airway Obstruction
epinephrine bitartrate - may treat - Airway Obstruction
racepinephrine hydrochloride - may treat - Airway Obstruction
galantamine - may treat - Alzheimer's Disease
memantine - may treat - Alzheimer's Disease
selegiline - may treat - Alzheimer's Disease
tacrine - may treat - Alzheimer's Disease


### Data loading for data visualization

In [2]:
# Load the outcome of the analogy pipeline for the three measures.
# NOTE(review): the effect of all_=True is not visible in this notebook -- confirm in analogy_pipeline.
tmp = ap.processing_analog_pipe_outcome(operations = ['add', 'mul', 'pair'], all_ = True)

0:00:30
0:00:00
0:00:05
0:00:44
0:00:00
0:00:00
0:00:27
0:00:25
0:00:07
0:01:00
0:00:00


In [15]:
# Build the result tables for the 'add' measure, labelled '3CosAdd';
# `app` is indexed below with one table per position.
app = dv.table_analog_results(tmp, 'add', '3CosAdd')


In [17]:
# 3CosAdd table at position 0 (PMC-w2v in the saved output)
app[0]

Unnamed: 0_level_0,3CosAdd | PMC-w2v,3CosAdd | PMC-w2v,3CosAdd | PMC-w2v,3CosAdd | PMC-w2v
Unnamed: 0_level_1,Mar,AR,|IV|,|W|
associated_finding_of,0.0,0,9,248
associated_morphology_of,4e-05,189,97,2286
associated_with_malfunction_of_gene_product,0.0142,15,27,33
clinical_course_of,0.0,0,71,1903
contraindicated_with_disease,0.00054,357,232,811
course_of,0.00016,242,68,1236
disease_has_associated_anatomic_site,1e-05,65,78,2387
disease_has_associated_gene,0.01647,368,105,150
finding_site_of,3e-05,584,214,4559
gene_associated_with_disease,0.00107,24,105,150


In [18]:
# 3CosAdd table at position 1 (claims_cuis_hs_300.txt in the saved output)
app[1]

Unnamed: 0_level_0,3CosAdd | claims_cuis_hs_300.txt,3CosAdd | claims_cuis_hs_300.txt,3CosAdd | claims_cuis_hs_300.txt,3CosAdd | claims_cuis_hs_300.txt
Unnamed: 0_level_1,Mar,AR,|IV|,|W|
associated_finding_of,2e-05,1,10,248
associated_morphology_of,0.0,0,24,2286
associated_with_malfunction_of_gene_product,0.0,0,0,33
clinical_course_of,0.0,0,0,1903
contraindicated_with_disease,0.00049,321,122,811
course_of,0.0,0,0,1236
disease_has_associated_anatomic_site,0.0,0,0,2387
disease_has_associated_gene,0.0,0,0,150
finding_site_of,0.0,0,0,4559
gene_associated_with_disease,0.0,0,0,150


In [19]:
# 3CosAdd table at position 2 (DeVine_etal_200 in the saved output)
app[2]

Unnamed: 0_level_0,3CosAdd | DeVine_etal_200,3CosAdd | DeVine_etal_200,3CosAdd | DeVine_etal_200,3CosAdd | DeVine_etal_200
Unnamed: 0_level_1,Mar,AR,|IV|,|W|
associated_finding_of,0.00042,26,23,248
associated_morphology_of,0.00036,1896,527,2286
associated_with_malfunction_of_gene_product,0.0,0,6,33
clinical_course_of,0.0,0,316,1903
contraindicated_with_disease,0.00251,1649,400,811
course_of,0.00138,2101,315,1236
disease_has_associated_anatomic_site,0.00049,2812,598,2387
disease_has_associated_gene,0.0,0,0,150
finding_site_of,0.00023,4728,1030,4559
gene_associated_with_disease,0.0,0,0,150


In [20]:
# 3CosAdd table at position 3 (PubMed-and-PMC-w2v in the saved output)
app[3]

Unnamed: 0_level_0,3CosAdd | PubMed-and-PMC-w2v,3CosAdd | PubMed-and-PMC-w2v,3CosAdd | PubMed-and-PMC-w2v,3CosAdd | PubMed-and-PMC-w2v
Unnamed: 0_level_1,Mar,AR,|IV|,|W|
associated_finding_of,2e-05,1,9,248
associated_morphology_of,7e-05,364,105,2286
associated_with_malfunction_of_gene_product,0.01799,19,27,33
clinical_course_of,0.0,0,75,1903
contraindicated_with_disease,0.00088,575,241,811
course_of,0.00016,245,72,1236
disease_has_associated_anatomic_site,2e-05,102,81,2387
disease_has_associated_gene,0.01606,359,108,150
finding_site_of,5e-05,943,230,4559
gene_associated_with_disease,0.00094,21,108,150


In [21]:
# 3CosAdd table at position 4 (stanford_cuis_svd_300.txt in the saved output)
app[4]

Unnamed: 0_level_0,3CosAdd | stanford_cuis_svd_300.txt,3CosAdd | stanford_cuis_svd_300.txt,3CosAdd | stanford_cuis_svd_300.txt,3CosAdd | stanford_cuis_svd_300.txt
Unnamed: 0_level_1,Mar,AR,|IV|,|W|
associated_finding_of,0,0,0,248
associated_morphology_of,0,0,0,2286
associated_with_malfunction_of_gene_product,0,0,0,33
clinical_course_of,0,0,0,1903
contraindicated_with_disease,0,0,0,811
course_of,0,0,0,1236
disease_has_associated_anatomic_site,0,0,0,2387
disease_has_associated_gene,0,0,0,150
finding_site_of,0,0,0,4559
gene_associated_with_disease,0,0,0,150


In [22]:
# 3CosAdd table at position 5 (tweetsvec in the saved output)
app[5]

Unnamed: 0_level_0,3CosAdd | tweetsvec,3CosAdd | tweetsvec,3CosAdd | tweetsvec,3CosAdd | tweetsvec
Unnamed: 0_level_1,Mar,AR,|IV|,|W|
associated_finding_of,0.0,0,4,248
associated_morphology_of,0.0,1,20,2286
associated_with_malfunction_of_gene_product,0.0,0,0,33
clinical_course_of,0.0,0,5,1903
contraindicated_with_disease,2e-05,11,44,811
course_of,0.0,0,8,1236
disease_has_associated_anatomic_site,0.0,21,15,2387
disease_has_associated_gene,0.0,0,1,150
finding_site_of,0.0,42,35,4559
gene_associated_with_disease,0.0,0,1,150


In [23]:
# 3CosAdd table at position 6 (wikipedia-pubmed-and-PMC-w2v in the saved output)
app[6]

Unnamed: 0_level_0,3CosAdd | wikipedia-pubmed-and-PMC-w2v,3CosAdd | wikipedia-pubmed-and-PMC-w2v,3CosAdd | wikipedia-pubmed-and-PMC-w2v,3CosAdd | wikipedia-pubmed-and-PMC-w2v
Unnamed: 0_level_1,Mar,AR,|IV|,|W|
associated_finding_of,3e-05,2,9,248
associated_morphology_of,4e-05,222,106,2286
associated_with_malfunction_of_gene_product,0.0161,17,27,33
clinical_course_of,0.0,0,76,1903
contraindicated_with_disease,0.00085,561,241,811
course_of,0.00015,228,72,1236
disease_has_associated_anatomic_site,1e-05,80,81,2387
disease_has_associated_gene,0.01512,338,108,150
finding_site_of,3e-05,724,232,4559
gene_associated_with_disease,0.00103,23,108,150


In [24]:
# 3CosAdd table at position 7 (GoogleNews-vectors-negative300 in the saved output)
app[7]

Unnamed: 0_level_0,3CosAdd | GoogleNews-vectors-negative300,3CosAdd | GoogleNews-vectors-negative300,3CosAdd | GoogleNews-vectors-negative300,3CosAdd | GoogleNews-vectors-negative300
Unnamed: 0_level_1,Mar,AR,|IV|,|W|
associated_finding_of,0.0,0,8,248
associated_morphology_of,2e-05,95,78,2286
associated_with_malfunction_of_gene_product,0.01231,13,19,33
clinical_course_of,0.0,0,46,1903
contraindicated_with_disease,0.00014,91,163,811
course_of,3e-05,52,47,1236
disease_has_associated_anatomic_site,1e-05,39,66,2387
disease_has_associated_gene,0.00966,216,71,150
finding_site_of,1e-05,235,166,4559
gene_associated_with_disease,0.00018,4,71,150


In [25]:
# 3CosAdd table at position 8 (cui2vec_pretrained in the saved output)
app[8]

Unnamed: 0_level_0,3CosAdd | cui2vec_pretrained,3CosAdd | cui2vec_pretrained,3CosAdd | cui2vec_pretrained,3CosAdd | cui2vec_pretrained
Unnamed: 0_level_1,Mar,AR,|IV|,|W|
associated_finding_of,0.00318,195,58,248
associated_morphology_of,0.00091,4778,583,2286
associated_with_malfunction_of_gene_product,0.00284,3,14,33
clinical_course_of,0.0,5,445,1903
contraindicated_with_disease,0.00284,1863,438,811
course_of,0.00182,2774,409,1236
disease_has_associated_anatomic_site,0.00058,3302,692,2387
disease_has_associated_gene,0.0,0,1,150
finding_site_of,0.00106,21935,1281,4559
gene_associated_with_disease,0.0,0,1,150


In [26]:
# 3CosAdd table at position 9 (PubMed-w2v in the saved output)
app[9]

Unnamed: 0_level_0,3CosAdd | PubMed-w2v,3CosAdd | PubMed-w2v,3CosAdd | PubMed-w2v,3CosAdd | PubMed-w2v
Unnamed: 0_level_1,Mar,AR,|IV|,|W|
associated_finding_of,0.0,0,9,248
associated_morphology_of,5e-05,241,105,2286
associated_with_malfunction_of_gene_product,0.01799,19,27,33
clinical_course_of,0.0,0,73,1903
contraindicated_with_disease,0.00092,607,241,811
course_of,0.00011,172,71,1236
disease_has_associated_anatomic_site,2e-05,100,81,2387
disease_has_associated_gene,0.023,514,100,150
finding_site_of,4e-05,782,229,4559
gene_associated_with_disease,0.00107,24,100,150


In [27]:
# 3CosAdd table at position 10 (Health_2.5mreviews.s200.w10.n5.v15.cbow in the saved output)
app[10]

Unnamed: 0_level_0,3CosAdd | Health_2.5mreviews.s200.w10.n5.v15.cbow,3CosAdd | Health_2.5mreviews.s200.w10.n5.v15.cbow,3CosAdd | Health_2.5mreviews.s200.w10.n5.v15.cbow,3CosAdd | Health_2.5mreviews.s200.w10.n5.v15.cbow
Unnamed: 0_level_1,Mar,AR,|IV|,|W|
associated_finding_of,0.0,0,4,248
associated_morphology_of,0.0,16,31,2286
associated_with_malfunction_of_gene_product,0.0,0,1,33
clinical_course_of,0.0,0,11,1903
contraindicated_with_disease,0.00013,84,99,811
course_of,0.0,3,15,1236
disease_has_associated_anatomic_site,1e-05,43,22,2387
disease_has_associated_gene,0.00018,4,3,150
finding_site_of,1e-05,153,63,4559
gene_associated_with_disease,4e-05,1,3,150
