In [1]:
%load_ext autoreload
%autoreload 2


In [2]:
"""Computing the score 
    score(e;q,m) = P (e|q,m) ∝ P (e|m)P (q|e) 
"""
from termcolor import colored
from tabulate import tabulate
import pickle5 as pickle

import time
from dkoulinker.entity_linker import EntityLinker, get_mentions_ner
from flair.models import SequenceTagger
from dkoulinker.entity_ranking import DictionaryRanking, QueryEntityRanking
from dkoulinker.utils import _print_colorful_text


In [3]:
#loading dicitonary of commonness,
print('Loading mention2pem dictionary ...')
handle = open('../data/pem/pem.pickle', 'rb')
mention2pem = pickle.load(handle)


print('Loading entity description dictionary ...')
handle_desc = open('../data/pem/entity2description.pickle', 'rb')
entity2description = pickle.load(handle_desc)
print('NUmber of entities: ', len(entity2description))

print('Loading dictionary of term frequency ...')
handle_desc = open('../data/pem/mention_freq.pickle', 'rb')
mention2freq = pickle.load(handle_desc)
print('Number of term in the collection: ', len(mention2freq))

#given by create_term_req
collection_size_terms = len(mention2pem)


Loading mention2pem dictionary ...
Loading entity description dictionary ...
NUmber of entities:  2680002
Loading dictionary of term frequency ...
Number of term in the collection:  3506008


In [6]:
list(mention2pem['reproduction'].keys())

['GO:0000003']

In [7]:
list(mention2pem['reproduction'].values())


[1.0]

In [6]:
entity2description['GO:0005578']


'#UNK'

In [7]:
entity2description['GO:0031012']


('structure',
 'cells',
 'biochemical',
 'tissues',
 'biomechanical',
 'lying',
 'proteinaceous',
 'structural',
 'extracellular',
 'support',
 'matrisome',
 'cues',
 'matrix',
 'external',
 'provides')

In [4]:
# load the NER tagger
tagger = SequenceTagger.load(
    '../resources/taggers/sota-ner-flair/best-model.pt')


2021-09-27 15:34:54,484 loading file ../resources/taggers/sota-ner-flair/best-model.pt


In [5]:
text='quaternary ammonium salt that is the monoiodide. sexual reproduction'
get_mentions_ner(text,tagger)

(['quaternary ammonium salt'],
 [{'text': 'quaternary ammonium salt', 'start_pos': 0, 'end_pos': 24}])

In [6]:
dictionarysearch_strategy = DictionaryRanking(mention2pem) 
queryranking_strategy = QueryEntityRanking(
    entity2description=entity2description,
    mention_freq=mention2freq,
    mention2pem=mention2pem
    )
e_linker = EntityLinker(
    ranking_strategy=queryranking_strategy,
    ner_model=tagger,
    mention2pem=mention2pem,
    prune_overlapping_method='large_text'
    )


In [8]:
text = 'a quaternary ammonium salt that is the monoiodide'
samples = e_linker.get_mentions_by_tokens_and_dict(text)
_print_colorful_text(text,samples)
# samples


a quaternary [41m[30mammonium[0m [42m[30msalt[0m that is the monoiodide



In [9]:
from dkoulinker.utils import create_html_entities

In [11]:
text = 'a quaternary ammonium salt that is the monoiodide'
samples = e_linker.get_mentions_by_tokens_and_dict(text)
create_html_entities(text, samples)


'<div class="entities" style="line-height: 2.5 direction: ltr">a quaternary <mark class="entity" style="background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">ammonium <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">BIO</span>\n     <mark class="entity" style="background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">salt <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">BIO</span>\n     that is the monoiodide</div>'

In [12]:
text='sexual reproduction'
samples = e_linker.get_mentions_by_tokens_and_dict(text)
_print_colorful_text(text, samples)



sexual [41m[30mreproduction[0m



In [13]:
text='Quaternary ammonium salt that is the monoiodide'
samples = e_linker.get_mentions_by_tokens_and_dict(text)
_print_colorful_text(text, samples)
samples


Quaternary [41m[30mammonium[0m [42m[30msalt[0m that is the monoiodide



[{'text': 'ammonium', 'start_pos': 11, 'end_pos': 19},
 {'text': 'salt', 'start_pos': 20, 'end_pos': 24}]

In [14]:
text = 'Quaternary ammonium salt that is the monoiodide. Sexual reproduction'
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions


Quaternary [41m[30mammonium salt[0m that is the monoiodide. Sexual [42m[30mreproduction[0m



[{'text': 'ammonium salt',
  'end_pos': 24,
  'start_pos': 11,
  'entities': [('CHEBI:47704', 35.25677820125997)],
  'best_entity': ('CHEBI:47704', 35.25677820125997)},
 {'text': 'reproduction',
  'end_pos': 68,
  'start_pos': 56,
  'entities': [('GO:0000003', 5.114009083705022)],
  'best_entity': ('GO:0000003', 5.114009083705022)}]

In [15]:
text='quaternary ammonium salt that is the monoiodide that can be found in some species with sexual reproduction reduced'
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions



[41m[30mquaternary ammonium salt[0m that is the monoiodide that can be found in some [42m[30mspecies[0m with sexual [43m[30mreproduction[0m [44m[30mreduced[0m



[{'text': 'quaternary ammonium salt',
  'end_pos': 24,
  'start_pos': 0,
  'entities': [('CHEBI:35273', 8.303132223775105)],
  'best_entity': ('CHEBI:35273', 8.303132223775105)},
 {'start_pos': 74,
  'text': 'species',
  'end_pos': 81,
  'entities': [('OBI:0100026', 0.998766954377312),
   ('MCCV_000003', 0.002679679196868865),
   ('NCBITaxon:species', 0.0013398395984344324)],
  'best_entity': ('OBI:0100026', 0.998766954377312)},
 {'end_pos': 106,
  'text': 'reproduction',
  'start_pos': 94,
  'entities': [('GO:0000003', 2.968299120520138)],
  'best_entity': ('GO:0000003', 2.968299120520138)},
 {'text': 'reduced',
  'end_pos': 114,
  'start_pos': 107,
  'entities': [('PATO:0000587', 1.919248235352937),
   ('PATO:0001997', 1.2794988235686249)],
  'best_entity': ('PATO:0000587', 1.919248235352937)}]

In [16]:
text='A molecular entity having a net electric.'
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions



A [41m[30mmolecular entity[0m having a [42m[30mnet[0m electric.



[{'start_pos': 2,
  'end_pos': 18,
  'text': 'molecular entity',
  'entities': [('CHEBI:23367', 2.667553377723604)],
  'best_entity': ('CHEBI:23367', 2.667553377723604)},
 {'text': 'net',
  'end_pos': 31,
  'start_pos': 28,
  'entities': [('GO:0140644', 6.664883535521932)],
  'best_entity': ('GO:0140644', 6.664883535521932)}]

In [176]:
text="there's a molecular entity with electric charge"
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions



there's a [41m[30mmolecular entity[0m with electric charge



[{'end_pos': 26,
  'text': 'molecular entity',
  'start_pos': 10,
  'entities': [('CHEBI:23367', 4.356820601798991)],
  'best_entity': ('CHEBI:23367', 4.356820601798991)}]

In [177]:
text = "theres a molecular entity with electric charge"
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions



there[41m[30ms[0m a [42m[30mmolecular entity[0m with electric charge



[{'text': 's',
  'start_pos': 5,
  'end_pos': 6,
  'entities': [('CHEBI:29999', 2.95759679036954),
   ('CHEBI:17115', 0.3924854915524766),
   ('CHEBI:17909', 0.5062741561321537),
   ('CHEBI:26833', 5.185600936147622),
   ('CHEBI:36368', 0.4814570585727065),
   ('UO:0000010', 0.4064118505055581)],
  'best_entity': ('CHEBI:26833', 5.185600936147622)},
 {'start_pos': 9,
  'end_pos': 25,
  'text': 'molecular entity',
  'entities': [('CHEBI:23367', 3.245904530608999)],
  'best_entity': ('CHEBI:23367', 3.245904530608999)}]

In [178]:
text = 'The fluoride salt of beryllium (+2 oxidation state). In the solid state it exists as a glass, with four-coordinate Be(2+) tetrahedral centres and two-coordinate fluoride centres. As a gas it adopts a linear triatomic structure and in the liquid state a fluctuating tetrahedral structure. In protein crystallography it is used as a mimic of phosphate.'
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions



The [41m[30mfluoride salt[0m of [42m[30mberyllium[0m (+2 oxidation [43m[30mstate[0m). In the [44m[30msolid[0m [45m[30mstate[0m it exists as a [46m[30mglass[0m, with four-[41m[30mcoordinate[0m Be(2+) tetrahedral centres and two-[42m[30mcoordinate[0m [43m[30mfluoride[0m centres. As a [44m[30mgas[0m it adopts a [45m[30mlinear[0m triatomic [46m[30mstructure[0m and in the [41m[30mliquid[0m [42m[30mstate[0m a [43m[30mfluctuating[0m tetrahedral [44m[30mstructure[0m. In [45m[30mprotein[0m crystallography it is used as a mimic of [46m[30mphosphate[0m.



[{'text': 'fluoride salt',
  'start_pos': 4,
  'end_pos': 17,
  'entities': [('CHEBI:24060', 2.216991408264466)],
  'best_entity': ('CHEBI:24060', 2.216991408264466)},
 {'start_pos': 21,
  'text': 'beryllium',
  'end_pos': 30,
  'entities': [('CHEBI:33783', 0.49010475837672285),
   ('CHEBI:30501', 0.9624891694868001)],
  'best_entity': ('CHEBI:30501', 0.9624891694868001)},
 {'text': 'state',
  'start_pos': 45,
  'end_pos': 50,
  'entities': [('SIO:000662', 1.671060798158725)],
  'best_entity': ('SIO:000662', 1.671060798158725)},
 {'start_pos': 60,
  'end_pos': 65,
  'text': 'solid',
  'entities': [('PATO:0001736', 1.4661171813441671)],
  'best_entity': ('PATO:0001736', 1.4661171813441671)},
 {'text': 'state',
  'start_pos': 66,
  'end_pos': 71,
  'entities': [('SIO:000662', 1.671060798158725)],
  'best_entity': ('SIO:000662', 1.671060798158725)},
 {'end_pos': 92,
  'text': 'glass',
  'start_pos': 87,
  'entities': [('CHEBI:131189', 1.5247850072847873)],
  'best_entity': ('CHEBI:131189'

In [179]:
text = 'The fluoride salt of beryllium (+2 oxidation state)'
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions



The [41m[30mfluoride salt[0m of [42m[30mberyllium[0m (+2 oxidation [43m[30mstate[0m)



[{'text': 'fluoride salt',
  'start_pos': 4,
  'end_pos': 17,
  'entities': [('CHEBI:24060', 24.157725392041723)],
  'best_entity': ('CHEBI:24060', 24.157725392041723)},
 {'start_pos': 21,
  'text': 'beryllium',
  'end_pos': 30,
  'entities': [('CHEBI:33783', 1.557827766492502),
   ('CHEBI:30501', 2.8963909526248814)],
  'best_entity': ('CHEBI:30501', 2.8963909526248814)},
 {'text': 'state',
  'start_pos': 45,
  'end_pos': 50,
  'entities': [('SIO:000662', 7.7977445606475895)],
  'best_entity': ('SIO:000662', 7.7977445606475895)}]

In [180]:
text='In the solid state it exists as a glass, with four-coordinate Be(2+) tetrahedral centres and two-coordinate fluoride centres.'
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions



In the [41m[30msolid[0m [42m[30mstate[0m it exists as a [43m[30mglass[0m, with four-[44m[30mcoordinate[0m Be(2+) tetrahedral centres and two-[45m[30mcoordinate[0m [46m[30mfluoride[0m centres.



[{'start_pos': 7,
  'end_pos': 12,
  'text': 'solid',
  'entities': [('PATO:0001736', 2.6483466469593147)],
  'best_entity': ('PATO:0001736', 2.6483466469593147)},
 {'text': 'state',
  'start_pos': 13,
  'end_pos': 18,
  'entities': [('SIO:000662', 3.695018687518561)],
  'best_entity': ('SIO:000662', 3.695018687518561)},
 {'end_pos': 39,
  'start_pos': 34,
  'text': 'glass',
  'entities': [('CHEBI:131189', 2.926504503864421)],
  'best_entity': ('CHEBI:131189', 2.926504503864421)},
 {'start_pos': 51,
  'end_pos': 61,
  'text': 'coordinate',
  'entities': [('SIO:000071', 3.0959191617417847)],
  'best_entity': ('SIO:000071', 3.0959191617417847)},
 {'end_pos': 107,
  'text': 'coordinate',
  'start_pos': 97,
  'entities': [('SIO:000071', 3.0959191617417847)],
  'best_entity': ('SIO:000071', 3.0959191617417847)},
 {'end_pos': 116,
  'start_pos': 108,
  'text': 'fluoride',
  'entities': [('CHEBI:17051', 2.1396226507321248),
   ('CHEBI:29228', 0.49089569909171404)],
  'best_entity': ('CHEBI:17

In [181]:
text = ' As a gas it adopts a linear triatomic structure and in the liquid state a fluctuating tetrahedral structure.'
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions



 As a [41m[30mgas[0m it adopts a [42m[30mlinear[0m triatomic [43m[30mstructure[0m and in the [44m[30mliquid[0m [45m[30mstate[0m a [46m[30mfluctuating[0m tetrahedral [41m[30mstructure[0m.



[{'end_pos': 9,
  'text': 'gas',
  'start_pos': 6,
  'entities': [('GO:0034005', 1.3338438543366664),
   ('PATO:0001737', 1.3646683726681752)],
  'best_entity': ('PATO:0001737', 1.3646683726681752)},
 {'text': 'linear',
  'end_pos': 28,
  'start_pos': 22,
  'entities': [('PATO:0001199', 2.5466378518061417)],
  'best_entity': ('PATO:0001199', 2.5466378518061417)},
 {'text': 'structure',
  'end_pos': 48,
  'start_pos': 39,
  'entities': [('PATO:0000141', 2.0477501927955255),
   ('SIO:000600', 1.9116027456112858)],
  'best_entity': ('PATO:0000141', 2.0477501927955255)},
 {'text': 'liquid',
  'end_pos': 66,
  'start_pos': 60,
  'entities': [('PATO:0001735', 3.4693673511428025)],
  'best_entity': ('PATO:0001735', 3.4693673511428025)},
 {'text': 'state',
  'start_pos': 67,
  'end_pos': 72,
  'entities': [('SIO:000662', 3.695018687518561)],
  'best_entity': ('SIO:000662', 3.695018687518561)},
 {'text': 'fluctuating',
  'start_pos': 75,
  'end_pos': 86,
  'entities': [('PATO:0002374', 3.257496

In [182]:
text = 'In protein crystallography it is used as a mimic of phosphate.'
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions



In [41m[30mprotein[0m crystallography it is used as a mimic of [42m[30mphosphate[0m.



[{'start_pos': 3,
  'text': 'protein',
  'end_pos': 10,
  'entities': [('CHEBI:16541', 1.3167385107698184),
   ('CHEBI:36080', 74.92643092660032),
   ('PR:000000001', 769.3784359599088),
   ('SIO:010043', 1.9957936081969099)],
  'best_entity': ('PR:000000001', 769.3784359599088)},
 {'end_pos': 61,
  'start_pos': 52,
  'text': 'phosphate',
  'entities': [('CHEBI:35780', 2.2874678944170217),
   ('CHEBI:18367', 1.2603223872408613),
   ('CHEBI:43474', 0.594929892066507),
   ('CHEBI:26020', 402.9568072100091),
   ('CHEBI:26078', 0.5398832131867083)],
  'best_entity': ('CHEBI:26020', 402.9568072100091)}]

In [183]:
text = """The process in which an antigen-presenting cell expresses peptide antigen in 
association with an MHC protein complex on its cell surface, including proteolysis and 
transport steps for the peptide antigen both prior to and following assembly with the MHC protein complex.
 The peptide antigen is typically, but not always, processed from an endogenous or exogenous protein."""

mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions



The [41m[30mprocess[0m in which an [42m[30mantigen[0m-presenting [43m[30mcell[0m expresses [44m[30mpeptide[0m [45m[30mantigen[0m in 
[46m[30massociation[0m with an MHC [41m[30mprotein[0m [42m[30mcomplex[0m on its [43m[30mcell[0m [44m[30msurface[0m, including [45m[30mproteolysis[0m and 
[46m[30mtransport[0m steps for the [41m[30mpeptide[0m [42m[30mantigen[0m both prior to and following assembly with the MHC [43m[30mprotein[0m [44m[30mcomplex[0m.
 The [45m[30mpeptide antigen[0m is typically, but not always, processed from an endogenous or exogenous [46m[30mprotein[0m.



[{'text': 'process',
  'start_pos': 4,
  'end_pos': 11,
  'entities': [('BFO:0000015', 1.2460255469628874),
   ('UBERON:0004529', 0.08467235649729467),
   ('ProcessualEntity', 0.046147671421455524),
   ('SIO:000006', 0.5114700249211321),
   ('ZFA:0001637', 0.017610971339665847)],
  'best_entity': ('BFO:0000015', 1.2460255469628874)},
 {'end_pos': 31,
  'start_pos': 24,
  'text': 'antigen',
  'entities': [('CHEBI:59132', 2.3005372039812224),
   ('SIO:010419', 0.27572366689345784)],
  'best_entity': ('CHEBI:59132', 2.3005372039812224)},
 {'start_pos': 43,
  'text': 'cell',
  'end_pos': 47,
  'entities': [('CL:0000000', 1.3578640961840265),
   ('GO:0005623', 0.668631010120962),
   ('SIO:010001', 0.005618747984209765),
   ('ZFA:0009000', 0.6449675683419837)],
  'best_entity': ('CL:0000000', 1.3578640961840265)},
 {'end_pos': 65,
  'start_pos': 58,
  'text': 'peptide',
  'entities': [('CHEBI:16670', 1.068335250280117),
   ('PR:000018263', 0.0703576678098971),
   ('PR:000018264', 0.000391964

In [184]:
tiger = 'A tiger is a carnovire of the mammal family.'
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions



The [41m[30mprocess[0m in which an [42m[30mantigen[0m-presenting [43m[30mcell[0m expresses [44m[30mpeptide[0m [45m[30mantigen[0m in 
[46m[30massociation[0m with an MHC [41m[30mprotein[0m [42m[30mcomplex[0m on its [43m[30mcell[0m [44m[30msurface[0m, including [45m[30mproteolysis[0m and 
[46m[30mtransport[0m steps for the [41m[30mpeptide[0m [42m[30mantigen[0m both prior to and following assembly with the MHC [43m[30mprotein[0m [44m[30mcomplex[0m.
 The [45m[30mpeptide antigen[0m is typically, but not always, processed from an endogenous or exogenous [46m[30mprotein[0m.



[{'text': 'process',
  'start_pos': 4,
  'end_pos': 11,
  'entities': [('BFO:0000015', 1.2460255469628874),
   ('UBERON:0004529', 0.08467235649729467),
   ('ProcessualEntity', 0.046147671421455524),
   ('SIO:000006', 0.5114700249211321),
   ('ZFA:0001637', 0.017610971339665847)],
  'best_entity': ('BFO:0000015', 1.2460255469628874)},
 {'end_pos': 31,
  'start_pos': 24,
  'text': 'antigen',
  'entities': [('CHEBI:59132', 2.3005372039812224),
   ('SIO:010419', 0.27572366689345784)],
  'best_entity': ('CHEBI:59132', 2.3005372039812224)},
 {'start_pos': 43,
  'text': 'cell',
  'end_pos': 47,
  'entities': [('CL:0000000', 1.3578640961840265),
   ('GO:0005623', 0.668631010120962),
   ('SIO:010001', 0.005618747984209765),
   ('ZFA:0009000', 0.6449675683419837)],
  'best_entity': ('CL:0000000', 1.3578640961840265)},
 {'end_pos': 65,
  'start_pos': 58,
  'text': 'peptide',
  'entities': [('CHEBI:16670', 1.068335250280117),
   ('PR:000018263', 0.0703576678098971),
   ('PR:000018264', 0.000391964

In [185]:

text = """Natural gas consists primarily of methane that is clean, non-toxic, and has abundant natural reserves. 
However, methane is also a greenhouse gas whose greenhouse effect is more than 20 times than that of carbon dioxide. 
The conversion of methane into other value-added chemicals has been an important research area in the field of catalysis for many years. 
One of the most challenging processes of high industrial importance is the conversion of methane to methanol (CHEBI:17790), 
a simple alcohol that is liquid under ambient conditions and can be easily stored and transported compared to methane [1]. 
Methanol is used as an important chemical raw material to make products such as paints and plastics and as an additive to gasoline [2]."""

mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions



Natural [41m[30mgas[0m consists primarily of [42m[30mmethane[0m that is clean, non-[43m[30mtoxic[0m, and has abundant natural reserves. 
However, [44m[30mmethane[0m is also a greenhouse [45m[30mgas[0m whose greenhouse effect is more than 20 times than that of [46m[30mcarbon[0m dioxide. 
The conversion of [41m[30mmethane[0m into other value-added chemicals has been an important research [42m[30marea[0m in the [43m[30mfield[0m of catalysis for many years. 
One of the most challenging [44m[30mprocesses[0m of high industrial importance is the conversion of [45m[30mmethane[0m to [46m[30mmethanol[0m (CHEBI:17790), 
a [41m[30msimple[0m [42m[30malcohol[0m that is [43m[30mliquid[0m under ambient conditions and can be easily stored and transported compared to [44m[30mmethane[0m [1]. 
Methanol is used as an important chemical raw material to make products such as paints and plastics and as an additive to gasoline [2].



[{'text': 'gas',
  'end_pos': 11,
  'start_pos': 8,
  'entities': [('GO:0034005', 0.6022665026762707),
   ('PATO:0001737', 0.604881766020379)],
  'best_entity': ('PATO:0001737', 0.604881766020379)},
 {'end_pos': 41,
  'start_pos': 34,
  'text': 'methane',
  'entities': [('CHEBI:16183', 1.952654927831163)],
  'best_entity': ('CHEBI:16183', 1.952654927831163)},
 {'text': 'toxic',
  'start_pos': 61,
  'end_pos': 66,
  'entities': [('SIO:001009', 1.2813017861774334)],
  'best_entity': ('SIO:001009', 1.2813017861774334)},
 {'end_pos': 120,
  'start_pos': 113,
  'text': 'methane',
  'entities': [('CHEBI:16183', 1.952654927831163)],
  'best_entity': ('CHEBI:16183', 1.952654927831163)},
 {'end_pos': 145,
  'text': 'gas',
  'start_pos': 142,
  'entities': [('GO:0034005', 0.6022665026762707),
   ('PATO:0001737', 0.604881766020379)],
  'best_entity': ('PATO:0001737', 0.604881766020379)},
 {'end_pos': 211,
  'text': 'carbon',
  'start_pos': 205,
  'entities': [('CHEBI:33415', 0.9326872728620195),
