In [1]:
%load_ext autoreload
%autoreload 2


In [2]:
"""Computing the score 
    score(e;q,m) = P (e|q,m) ∝ P (e|m)P (q|e) 
"""
from termcolor import colored
from tabulate import tabulate
import pickle5 as pickle

import time
from dkoulinker.entity_linker import EntityLinker, get_mentions_ner
from flair.models import SequenceTagger
from dkoulinker.entity_ranking import DictionaryRanking, QueryEntityRanking
from dkoulinker.utils import _print_colorful_text


In [3]:
# Load the pickled lookup tables used by the rest of the notebook.
# Each file is opened in a `with` block so its handle is closed as soon as
# the object has been read (the handles were previously left open).
# NOTE: pickle.load can execute arbitrary code while unpickling -- only point
# these paths at files you trust.

# Dictionary of commonness, keyed by mention string.
print('Loading mention2pem dictionary ...')
with open('../data/pem/pem.pickle', 'rb') as handle:
    mention2pem = pickle.load(handle)


print('Loading entity description dictionary ...')
with open('../data/pem/entity2description.pickle', 'rb') as handle_desc:
    entity2description = pickle.load(handle_desc)
print('NUmber of entities: ', len(entity2description))

print('Loading dictionary of term frequency ...')
with open('../data/pem/mention_freq.pickle', 'rb') as handle_desc:
    mention2freq = pickle.load(handle_desc)
print('Number of term in the collection: ', len(mention2freq))

# given by create_term_req
# NOTE(review): this counts entries of mention2pem, not mention2freq -- confirm
# that is the intended collection size.
collection_size_terms = len(mention2pem)


Loading mention2pem dictionary ...
Loading entity description dictionary ...
NUmber of entities:  2680002
Loading dictionary of term frequency ...
Number of term in the collection:  3506008


In [4]:
# Show the keys (candidate entity ids) stored under the mention 'reproduction'.
list(mention2pem['reproduction'].keys())

['GO:0000003']

In [5]:
# Show the values stored for each candidate of 'reproduction'
# (presumably the commonness P(e|m) -- confirm against the dictionary builder).
list(mention2pem['reproduction'].values())


[1.0]

In [6]:
# Look up the stored description for entity GO:0005578.
entity2description['GO:0005578']


'#UNK'

In [7]:
# Look up the stored description for entity GO:0031012.
entity2description['GO:0031012']


('structure',
 'cells',
 'biochemical',
 'tissues',
 'biomechanical',
 'lying',
 'proteinaceous',
 'structural',
 'extracellular',
 'support',
 'matrisome',
 'cues',
 'matrix',
 'external',
 'provides')

In [4]:
# Load the Flair SequenceTagger checkpoint from disk; `tagger` is the NER
# model handed to get_mentions_ner and to EntityLinker in later cells.
tagger = SequenceTagger.load(
    '../resources/taggers/sota-ner-flair/best-model.pt')


2021-11-26 15:23:38,700 loading file ../resources/taggers/sota-ner-flair/best-model.pt


In [5]:
# Run mention detection with the NER tagger alone on a two-sentence sample
# and display what get_mentions_ner returns.
text='quaternary ammonium salt that is the monoiodide. sexual reproduction'
get_mentions_ner(text,tagger)

(['quaternary ammonium salt'],
 [{'text': 'quaternary ammonium salt', 'start_pos': 0, 'end_pos': 24}])

In [25]:
# Dictionary-only ranking strategy; built here but not passed to the linker below.
dictionarysearch_strategy = DictionaryRanking(mention2pem)

# Query-aware ranking strategy, fed with the three dictionaries loaded above.
queryranking_strategy = QueryEntityRanking(
    entity2description=entity2description,
    mention_freq=mention2freq,
    mention2pem=mention2pem,
    p_t_thetae_method='bayesian',  # smoothing method
)

# The linker combines the NER tagger, the dictionaries and the query-aware ranker.
e_linker = EntityLinker(
    ranking_strategy=queryranking_strategy,
    entity2description=entity2description,
    ner_model=tagger,
    mention2pem=mention2pem,
    prune_overlapping_method='large_text',
    use_ner_dict=True,
)


In [9]:
def search():
    """Call ``e_linker.get_mentions_by_tokens_and_dict`` on a fixed sample sentence.

    The result of the call is discarded and the function returns ``None``;
    it is the workload measured by the ``%%time`` / ``%timeit`` cells.
    """
    sample_sentence = 'a quaternary ammonium salt that is the monoiodide'
    e_linker.get_mentions_by_tokens_and_dict(sample_sentence)


In [10]:
%%time
search()

CPU times: user 67 µs, sys: 330 µs, total: 397 µs
Wall time: 401 µs


In [8]:
%timeit search()


23 µs ± 230 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [60]:
# Run the full linking pipeline on a short sample; the result is kept in
# `samples` for the cells that follow.
text = 'a quaternary ammonium salt that is the monoiodide'
samples = e_linker.link_entities(text)


In [61]:
# Display the raw linking result held in `samples`.
samples

[{'end_pos': 26,
  'start_pos': 2,
  'text': 'quaternary ammonium salt',
  'entities': [['CHEBI:35273',
    117.02924314033649,
    'hydrogens nitrogen salt bonded quaternary ammonium salts nh4(+))y(- univalent organyl compounds replaced usually groups compound derivatives']],
  'best_entity': ('CHEBI:35273', 117.02924314033649)}]

In [62]:

# Print `text` with the spans listed in `samples` highlighted.
_print_colorful_text(text,samples)



a [41m[30mquaternary ammonium salt[0m that is the monoiodide



In [27]:
# Link entities in a paper-title style sentence and print it with the linked
# spans highlighted; the result stays in `samples`.
text = 'Altered gut microbiome composition in HIV infection: causes, effects and potential intervention.'
samples = e_linker.link_entities(text)
_print_colorful_text(text, samples)


terms_freq[term] 14
self.len_terms_collection 3506008
p_t_theta 3.9929857611275005e-06
p_t_Eps 3.9931454805579454e-06
terms_freq[term] 44
self.len_terms_collection 3506008
p_t_theta 1.2549383820686427e-05
p_t_Eps 1.2549885796039255e-05
terms_freq[term] 2
self.len_terms_collection 3506008
p_t_theta 1.0570026553303288e-05
p_t_Eps 5.704493543654208e-07
terms_freq[term] 42
self.len_terms_collection 3506008
p_t_theta 1.1978957283382502e-05
p_t_Eps 1.1979436441673835e-05
terms_freq[term] 3
self.len_terms_collection 3506008
p_t_theta 8.556398059558929e-07
p_t_Eps 8.556740315481311e-07
mention microbiome
P(e|m) 1.0
P(q|e) 1.338994983561869
terms_freq[term] 14
self.len_terms_collection 3506008
p_t_theta 1.3990767050159417e-05
p_t_Eps 3.9931454805579454e-06
terms_freq[term] 44
self.len_terms_collection 3506008
p_t_theta 1.2547752678083982e-05
p_t_Eps 1.2549885796039255e-05
terms_freq[term] 2
self.len_terms_collection 3506008
p_t_theta 5.703523944583629e-07
p_t_Eps 5.704493543654208e-07
terms_fre

In [64]:
import json

In [65]:
# Serialize the current `samples` with json.dumps and print the JSON string.
jsonStr = json.dumps(samples)
print(jsonStr)


[{"start_pos": 8, "end_pos": 11, "text": "gut", "entities": [["UBERON:0001007", 0.8700926733965282, "organs gastrointestinal tract devoted food residual digestion anatomical parts ingestion discharge assimilation digestive alimentary gut wastes system"], ["UBERON:0001555", 0.7340052681185107, "anus tract digestive mouth alimentary canal enteric extending gut tube"], ["UBERON:0004907", 0.36037230161509565, "gi gastrointestinal tract intestines definition region digestive lower beginning extending gut anus[go"], ["MA:0000917", 0.23101364089906892, "gut"], ["ZFA:0000112", 0.3708896865541044, "organs tract associated alimentary digestive canal enteric gut tube"]], "best_entity": ["UBERON:0001007", 0.8700926733965282]}, {"end_pos": 22, "text": "microbiome", "start_pos": 12, "entities": [["EFO:0004982", 3.4203408798620867, "microbiome microorganisms collectivity collection"]], "best_entity": ["EFO:0004982", 3.4203408798620867]}, {"end_pos": 34, "start_pos": 23, "text": "composition", "entiti

In [66]:
from dkoulinker.utils import create_html_entities

In [67]:
# Link the sample sentence again and pass the result to create_html_entities;
# the cell displays the value that call returns.
text = 'a quaternary ammonium salt that is the monoiodide'
samples = e_linker.link_entities(text)
create_html_entities(text, samples)


'<div class="entities" style="line-height: 2.5 direction: ltr">a <mark class="entity" style="background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">quaternary ammonium salt <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">CHEBI:35273</span> </mark>\n     that is the monoiodide</div>'

In [68]:
# Display the raw linking result that was just passed to create_html_entities.
samples

[{'end_pos': 26,
  'start_pos': 2,
  'text': 'quaternary ammonium salt',
  'entities': [['CHEBI:35273',
    117.02924314033649,
    'hydrogens nitrogen salt bonded quaternary ammonium salts nh4(+))y(- univalent organyl compounds replaced usually groups compound derivatives']],
  'best_entity': ('CHEBI:35273', 117.02924314033649)}]

In [28]:
# Call get_mentions_by_tokens_and_dict directly (not link_entities) on a
# two-word phrase and print the text with the returned spans highlighted.
text='sexual reproduction'
samples = e_linker.get_mentions_by_tokens_and_dict(text)
_print_colorful_text(text, samples)



sexual [41m[30mreproduction[0m



In [29]:
# Same direct mention lookup on a sentence starting with a capitalized word;
# prints the highlighted text and then displays the returned mention list.
text='Quaternary ammonium salt that is the monoiodide'
samples = e_linker.get_mentions_by_tokens_and_dict(text)
_print_colorful_text(text, samples)
samples


Quaternary [41m[30mammonium[0m [42m[30msalt[0m that is the monoiodide



[{'text': 'ammonium', 'start_pos': 11, 'end_pos': 19},
 {'text': 'salt', 'start_pos': 20, 'end_pos': 24}]

In [31]:
# Full linking on a two-sentence input: print the highlighted text, then
# display the raw result stored in `mentions`.
text = 'Quaternary ammonium salt that is the monoiodide. Sexual reproduction'
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions

mention salt
P(e|m) 0.375
P(q|e) 0.9999900001499975
mention salt
P(e|m) 158.0
P(q|e) 1.3237330406480714
mention ammonium salt
P(e|m) 1.0
P(q|e) 1.7524706693670942
mention ammonium
P(e|m) 1.0
P(q|e) 1.3237396584364176
mention reproduction
P(e|m) 1.0
P(q|e) 1.348232971990509

Quaternary [41m[30mammonium salt[0m that is the monoiodide. Sexual [42m[30mreproduction[0m



[{'end_pos': 24,
  'text': 'ammonium salt',
  'start_pos': 11,
  'entities': [['CHEBI:47704',
    1.7524706693670942,
    'ammoniumsalz ammoniumsalze ammonium salt salts']],
  'best_entity': ('CHEBI:47704', 1.7524706693670942)},
 {'start_pos': 56,
  'text': 'reproduction',
  'end_pos': 68,
  'entities': [['GO:0000003',
    1.348232971990509,
    'process inherited material genetic reproductive individuals contain production parent organisms reproduction new physiological portion']],
  'best_entity': ('GO:0000003', 1.348232971990509)}]

In [32]:
# Full linking on a longer single sentence that ends with the word 'reduced';
# print the highlighted text, then display the raw result.
text='quaternary ammonium salt that is the monoiodide that can be found in some species with sexual reproduction reduced'
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions


mention reduced
P(e|m) 0.6
P(q|e) 1.2600611746437556
mention reduced
P(e|m) 0.4
P(q|e) 1.2600611746437556
mention ammonium
P(e|m) 1.0
P(q|e) 1.2055609573301036
mention salt
P(e|m) 0.375
P(q|e) 0.9999888890617254
mention salt
P(e|m) 158.0
P(q|e) 1.2055542606918772
mention species
P(e|m) 0.998766954377312
P(q|e) 0.9999944444876541
mention species
P(e|m) 0.0012330456226880395
P(q|e) 1.0023733720704264
mention species
P(e|m) 0.0006165228113440197
P(q|e) 1.0023733720704264
mention reproduction
P(e|m) 1.0
P(q|e) 1.220383851165765
mention quaternary ammonium salt
P(e|m) 1.0
P(q|e) 1.4534579639500007

[41m[30mquaternary ammonium salt[0m that is the monoiodide that can be found in some [42m[30mspecies[0m with sexual [43m[30mreproduction[0m [44m[30mreduced[0m



[{'end_pos': 24,
  'start_pos': 0,
  'text': 'quaternary ammonium salt',
  'entities': [['CHEBI:35273',
    1.4534579639500007,
    'hydrogens nitrogen salt bonded quaternary ammonium salts nh4(+))y(- univalent organyl compounds replaced usually groups compound derivatives']],
  'best_entity': ('CHEBI:35273', 1.4534579639500007)},
 {'text': 'species',
  'end_pos': 81,
  'start_pos': 74,
  'entities': [['OBI:0100026', 0.9987614057151663, 'organism'],
   ['MCCV_000003', 0.0012359720987304887, 'species'],
   ['NCBITaxon:species', 0.0006179860493652444, 'species']],
  'best_entity': ('OBI:0100026', 0.9987614057151663)},
 {'start_pos': 94,
  'text': 'reproduction',
  'end_pos': 106,
  'entities': [['GO:0000003',
    1.220383851165765,
    'process inherited material genetic reproductive individuals contain production parent organisms reproduction new physiological portion']],
  'best_entity': ('GO:0000003', 1.220383851165765)},
 {'end_pos': 114,
  'start_pos': 107,
  'text': 'reduced',
  'e

In [34]:
# Variant of the long sentence with 'reduced' placed before 'sexual
# reproduction'; only the highlighted text is printed here.
text = 'quaternary ammonium salt that is the monoiodide that can be found in some species with reduced sexual reproduction'
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)


mention reproduction
score_e_q_m 1.220383851165765
mention ammonium
score_e_q_m 1.2055609573301036
mention salt
score_e_q_m 0.374995833398147
mention salt
score_e_q_m 190.4775731893166
mention species
score_e_q_m 0.9987614057151663
mention species
score_e_q_m 0.0012359720987304887
mention species
score_e_q_m 0.0006179860493652444
mention reduced
score_e_q_m 0.7560367047862533
mention reduced
score_e_q_m 0.5040244698575023
mention quaternary ammonium salt
score_e_q_m 1.4534579639500007

[41m[30mquaternary ammonium salt[0m that is the monoiodide that can be found in some [42m[30mspecies[0m with [43m[30mreduced[0m sexual [44m[30mreproduction[0m



In [89]:
# Full linking on a short sentence containing the word 'net'; print the
# highlighted text, then display the raw result.
text='A molecular entity having a net electric.'
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions


mention net
P(e|m) 1.0
P(q|e) 6.664883535521932
mention entity
P(e|m) 0.9901477832512315
P(q|e) 4.529518130742791
mention entity
P(e|m) 0.4187192118226601
P(q|e) 4.529518130742791
mention molecular entity
P(e|m) 1.0
P(q|e) 2.667553377723604

A [41m[30mmolecular entity[0m having a [42m[30mnet[0m electric.



[{'text': 'molecular entity',
  'end_pos': 18,
  'start_pos': 2,
  'entities': [['CHEBI:23367',
    2.667553377723604,
    'separately entite complex entities entidades pair entidad moleculaire radical entitaet molekulare distinct constitutionally distinguishable entity molecule isotopically molecular identifiable atom conformer ion etc moleculares']],
  'best_entity': ('CHEBI:23367', 2.667553377723604)},
 {'start_pos': 28,
  'end_pos': 31,
  'text': 'net',
  'entities': [['GO:0140644',
    6.664883535521932,
    'granular structure microorganisms dna microbicidal composed entrapped nuclear net granule antimicrobial framework damage chromatin extracellular neutrophil associated proteins trap histones']],
  'best_entity': ('GO:0140644', 6.664883535521932)}]

In [75]:
# Full linking on a sentence whose first word contains an apostrophe
# ("there's"); print the highlighted text, then display the raw result.
text="there's a molecular entity with electric charge"
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions



there's a [41m[30mmolecular entity[0m with electric charge



[{'text': 'molecular entity',
  'start_pos': 10,
  'end_pos': 26,
  'entities': [['CHEBI:23367',
    4.356820601798991,
    'separately entite complex entities entidades pair entidad moleculaire radical entitaet molekulare distinct constitutionally distinguishable entity molecule isotopically molecular identifiable atom conformer ion etc moleculares']],
  'best_entity': ('CHEBI:23367', 4.356820601798991)}]

In [76]:
# Same sentence as the apostrophe case but written "theres", so the reported
# character offsets can be compared between the two inputs.
text = "theres a molecular entity with electric charge"
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions



theres a [41m[30mmolecular entity[0m with electric charge



[{'text': 'molecular entity',
  'end_pos': 25,
  'start_pos': 9,
  'entities': [['CHEBI:23367',
    3.245904530608999,
    'separately entite complex entities entidades pair entidad moleculaire radical entitaet molekulare distinct constitutionally distinguishable entity molecule isotopically molecular identifiable atom conformer ion etc moleculares']],
  'best_entity': ('CHEBI:23367', 3.245904530608999)}]

In [77]:
# Full linking on a four-sentence paragraph; print the highlighted text, then
# display the raw result. The cells after this one link the sentences one by one.
text = 'The fluoride salt of beryllium (+2 oxidation state). In the solid state it exists as a glass, with four-coordinate Be(2+) tetrahedral centres and two-coordinate fluoride centres. As a gas it adopts a linear triatomic structure and in the liquid state a fluctuating tetrahedral structure. In protein crystallography it is used as a mimic of phosphate.'
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions



The [41m[30mfluoride salt[0m of [42m[30mberyllium[0m (+2 oxidation [43m[30mstate[0m). In the [44m[30msolid[0m [45m[30mstate[0m it exists as a [46m[30mglass[0m, with four-[41m[30mcoordinate[0m Be(2+) tetrahedral centres and two-[42m[30mcoordinate[0m [43m[30mfluoride[0m centres. As a [44m[30mgas[0m it adopts a [45m[30mlinear[0m triatomic [46m[30mstructure[0m and in the [41m[30mliquid[0m [42m[30mstate[0m a [43m[30mfluctuating[0m tetrahedral [44m[30mstructure[0m. In [45m[30mprotein[0m crystallography it is used as a mimic of [46m[30mphosphate[0m.



[{'start_pos': 4,
  'end_pos': 17,
  'text': 'fluoride salt',
  'entities': [['CHEBI:24060',
    2.216991408264466,
    'fluoride salt salts fluorides']],
  'best_entity': ('CHEBI:24060', 2.216991408264466)},
 {'start_pos': 21,
  'text': 'beryllium',
  'end_pos': 30,
  'entities': [['CHEBI:33783',
    0.49010475837672285,
    'ben metallic be(0 metal beryllium(0 beryllium'],
   ['CHEBI:30501',
    0.9624891694868001,
    'alkaline earth 4be atomic atom 4 metal number beryllium berilio']],
  'best_entity': ('CHEBI:30501', 0.9624891694868001)},
 {'end_pos': 50,
  'text': 'state',
  'start_pos': 45,
  'entities': [['SIO:000662', 1.671060798158725, 'state']],
  'best_entity': ('SIO:000662', 1.671060798158725)},
 {'start_pos': 60,
  'text': 'solid',
  'end_pos': 65,
  'entities': [['PATO:0001736',
    1.4661171813441671,
    'configuration having quality arrangement virtue parts characteristics exhibits inhering solid solids physical bearer']],
  'best_entity': ('PATO:0001736', 1.4661171813

In [35]:
# First sentence of the beryllium fluoride paragraph, linked on its own
# (without the trailing period).
text = 'The fluoride salt of beryllium (+2 oxidation state)'
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions


mention beryllium
score_e_q_m 0.4387205048735838
mention beryllium
score_e_q_m 0.8774209557861625
mention fluoride salt
score_e_q_m 1.7120736865784845
mention salt
score_e_q_m 0.3749957143530601
mention salt
score_e_q_m 200.93183586668067
mention state
score_e_q_m 1.6688984143375813
mention fluoride
score_e_q_m 1.0769079632337613
mention fluoride
score_e_q_m 0.2692100706158081

The [41m[30mfluoride salt[0m of [42m[30mberyllium[0m (+2 oxidation [43m[30mstate[0m)



[{'start_pos': 4,
  'text': 'fluoride salt',
  'end_pos': 17,
  'entities': [['CHEBI:24060',
    1.7120736865784845,
    'fluoride salt salts fluorides']],
  'best_entity': ('CHEBI:24060', 1.7120736865784845)},
 {'start_pos': 21,
  'text': 'beryllium',
  'end_pos': 30,
  'entities': [['CHEBI:33783',
    0.4387205048735838,
    'ben metallic be(0 metal beryllium(0 beryllium'],
   ['CHEBI:30501',
    0.8774209557861625,
    'alkaline earth 4be atomic atom 4 metal number beryllium berilio']],
  'best_entity': ('CHEBI:30501', 0.8774209557861625)},
 {'end_pos': 50,
  'start_pos': 45,
  'text': 'state',
  'entities': [['SIO:000662', 1.6688984143375813, 'state']],
  'best_entity': ('SIO:000662', 1.6688984143375813)}]

In [36]:
# Second sentence of the beryllium fluoride paragraph, linked on its own.
text='In the solid state it exists as a glass, with four-coordinate Be(2+) tetrahedral centres and two-coordinate fluoride centres.'
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions


mention state
score_e_q_m 1.3853066100438254
mention coordinate
score_e_q_m 1.177051555061504
mention coordinate
score_e_q_m 1.177051555061504
mention fluoride
score_e_q_m 0.9665726788116387
mention fluoride
score_e_q_m 0.24163108935646674
mention glass
score_e_q_m 1.3852310552144478
mention solid
score_e_q_m 1.2597336855442363

In the [41m[30msolid[0m [42m[30mstate[0m it exists as a [43m[30mglass[0m, with four-[44m[30mcoordinate[0m Be(2+) tetrahedral centres and two-[45m[30mcoordinate[0m [46m[30mfluoride[0m centres.



[{'start_pos': 7,
  'text': 'solid',
  'end_pos': 12,
  'entities': [['PATO:0001736',
    1.2597336855442363,
    'configuration having quality arrangement virtue parts characteristics exhibits inhering solid solids physical bearer']],
  'best_entity': ('PATO:0001736', 1.2597336855442363)},
 {'end_pos': 18,
  'text': 'state',
  'start_pos': 13,
  'entities': [['SIO:000662', 1.3853066100438254, 'state']],
  'best_entity': ('SIO:000662', 1.3853066100438254)},
 {'text': 'glass',
  'start_pos': 34,
  'end_pos': 39,
  'entities': [['CHEBI:131189',
    1.3852310552144478,
    'amorphous polymerous sodium transparent glass oxides brittle inorganic silicate basic usually hard potassium']],
  'best_entity': ('CHEBI:131189', 1.3852310552144478)},
 {'start_pos': 51,
  'end_pos': 61,
  'text': 'coordinate',
  'entities': [['SIO:000071', 1.177051555061504, 'coordinate']],
  'best_entity': ('SIO:000071', 1.177051555061504)},
 {'start_pos': 97,
  'text': 'coordinate',
  'end_pos': 107,
  'entities': 

In [37]:
# Third sentence of the beryllium fluoride paragraph, linked on its own.
# The string keeps its leading space, which shifts every reported offset by one.
text = ' As a gas it adopts a linear triatomic structure and in the liquid state a fluctuating tetrahedral structure.'
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions


mention structure
score_e_q_m 1.01956804705477
mention structure
score_e_q_m 0.7283343892883459
mention structure
score_e_q_m 1.01956804705477
mention structure
score_e_q_m 0.7283343892883459
mention linear
score_e_q_m 1.190994665048961
mention fluctuating
score_e_q_m 1.3852826828001976
mention gas
score_e_q_m 0.6519118198573883
mention gas
score_e_q_m 0.6519260412677221
mention liquid
score_e_q_m 1.3852977946039937
mention state
score_e_q_m 1.3853053506810487

 As a [41m[30mgas[0m it adopts a [42m[30mlinear[0m triatomic [43m[30mstructure[0m and in the [44m[30mliquid[0m [45m[30mstate[0m a [46m[30mfluctuating[0m tetrahedral [41m[30mstructure[0m.



[{'text': 'gas',
  'end_pos': 9,
  'start_pos': 6,
  'entities': [['GO:0034005',
    0.6519118198573883,
    'reaction -(r)-germacrene farnesyl germacrene forming -(10r)-germacrene = gas catalysis synthase -germacrene diphosphate activity trans 2 + trans,6 lyase'],
   ['PATO:0001737',
    0.6519260412677221,
    'configuration having quality arrangement virtue parts characteristics exhibits gas inhering gaseus physical gases bearer']],
  'best_entity': ('PATO:0001737', 0.6519260412677221)},
 {'text': 'linear',
  'start_pos': 22,
  'end_pos': 28,
  'entities': [['PATO:0001199',
    1.190994665048961,
    'opposite quality narrow virtue inhering margins linear parallel shape bearer']],
  'best_entity': ('PATO:0001199', 1.190994665048961)},
 {'end_pos': 48,
  'start_pos': 39,
  'text': 'structure',
  'entities': [['PATO:0000141',
    1.01956804705477,
    'structure morphology quality form arrangements connectivity virtue relative relational position organism parts inhering pattern confor

In [81]:
# Fourth sentence of the beryllium fluoride paragraph, linked on its own.
text = 'In protein crystallography it is used as a mimic of phosphate.'
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions



In [41m[30mprotein[0m crystallography it is used as a mimic of [42m[30mphosphate[0m.



[{'end_pos': 10,
  'text': 'protein',
  'start_pos': 3,
  'entities': [['CHEBI:16541',
    1.3167385107698184,
    'chains synthesized occurring naturally protein chain ribosome polypeptide'],
   ['CHEBI:36080',
    74.92643092660032,
    'biological synthesized proteins protein macromolecule consisting chain minimally ribosome polypeptide'],
   ['PR:000000001', 769.3784359599088, 'protein'],
   ['SIO:010043', 1.9957936081969099, 'protein']],
  'best_entity': ('PR:000000001', 769.3784359599088)},
 {'end_pos': 61,
  'text': 'phosphate',
  'start_pos': 52,
  'entities': [['CHEBI:35780',
    2.2874678944170217,
    'oxoanion base conjugate phosphorus ion pi phosphoric phosphate ions acid'],
   ['CHEBI:18367',
    1.2603223872408613,
    'hydrogenphosphate base conjugate tetraoxidophosphate(3- po4](3- tetraoxophosphate(v orthophosphate ion phosphate(3- po4(3- tetraoxophosphate(3- phosphate'],
   ['CHEBI:43474',
    0.594929892066507,
    'hydrogenphosphate base hydrogentetraoxophosphate(v 

In [7]:
# Full linking on a multi-line definition (the triple-quoted string contains
# embedded newlines); print the highlighted text, then display the raw result.
text = """The process in which an antigen-presenting cell expresses peptide antigen in 
association with an MHC protein complex on its cell surface, including proteolysis and 
transport steps for the peptide antigen both prior to and following assembly with the MHC protein complex.
 The peptide antigen is typically, but not always, processed from an endogenous or exogenous protein."""

mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions


mention transport
P(e|m) 1.0
P(q|e) 2.182631603528351
mention antigen
P(e|m) 0.8333333333333334
P(q|e) 2.7606446447774666
mention antigen
P(e|m) 0.16666666666666666
P(q|e) 1.6543420013607473
mention proteolysis
P(e|m) 1.0
P(q|e) 1.4705493589369527
mention complex
P(e|m) 1.0
P(q|e) 1.5906938289863541
mention complex
P(e|m) 1.0
P(q|e) 1.5906938289863541
mention antigen
P(e|m) 0.8333333333333334
P(q|e) 2.7606446447774666
mention antigen
P(e|m) 0.16666666666666666
P(q|e) 1.6543420013607473
mention protein
P(e|m) 0.18181818181818182
P(q|e) 1.485841367464235
mention protein
P(e|m) 10.818181818181818
P(q|e) 1.4726385122522343
mention protein
P(e|m) 70.0909090909091
P(q|e) 1.6147141957126674
mention protein
P(e|m) 0.18181818181818182
P(q|e) 1.6147141957126674
mention protein
P(e|m) 0.18181818181818182
P(q|e) 1.485841367464235
mention protein
P(e|m) 10.818181818181818
P(q|e) 1.4726385122522343
mention protein
P(e|m) 70.0909090909091
P(q|e) 1.6147141957126674
mention protein
P(e|m) 0.18181818181

[{'text': 'process',
  'start_pos': 4,
  'end_pos': 11,
  'entities': [['BFO:0000015',
    1.2460255469628874,
    'depends_on p t. material t occurrent parts proper temporal entity time s process'],
   ['UBERON:0004529',
    0.08467235649729467,
    'larger papilla processes anatomical lamella organ spine shelf flanges outgrowth processus flange body lamellae lamina tissue shelves ridges protrusion process projection projections ridge laminae'],
   ['ProcessualEntity', 0.046147671421455524, 'process'],
   ['SIO:000006', 0.5114700249211321, 'process'],
   ['ZFA:0001637',
    0.017610971339665847,
    'bone process extension tissue bony portion connective projection projections']],
  'best_entity': ('BFO:0000015', 1.2460255469628874)},
 {'end_pos': 31,
  'text': 'antigen',
  'start_pos': 24,
  'entities': [['CHEBI:59132',
    2.3005372039812224,
    'cell response antigen receptor histocompability antigens complex mhc antibody t stimulates presentation substance production major binding

In [38]:
# Full linking on a general-domain sentence; print the highlighted text, then
# display the raw result.
# NOTE(review): 'carnovire' is misspelled in the input -- confirm whether that is intentional.
text = 'A tiger is a carnovire of the mammal family.'
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions


mention mammal
score_e_q_m 0.9999940000479997
mention family
score_e_q_m 0.5540460053084098
mention family
score_e_q_m 0.5540460053084098
mention family
score_e_q_m 1.1080122364480558
mention tiger
score_e_q_m 1.4001970292471357

A [41m[30mtiger[0m is a carnovire of the [42m[30mmammal[0m [43m[30mfamily[0m.



[{'text': 'tiger',
  'end_pos': 7,
  'start_pos': 2,
  'entities': [['NCBITaxon:9694',
    1.4001970292471357,
    'panthera tiger tigris']],
  'best_entity': ('NCBITaxon:9694', 1.4001970292471357)},
 {'end_pos': 36,
  'text': 'mammal',
  'start_pos': 30,
  'entities': [['NCBITaxon:40674', 0.9999940000479997, 'mammalia']],
  'best_entity': ('NCBITaxon:40674', 0.9999940000479997)},
 {'start_pos': 37,
  'end_pos': 43,
  'text': 'family',
  'entities': [['NCBITaxon:family', 0.5540460053084098, 'family'],
   ['SIO:001063', 0.5540460053084098, 'family'],
   ['STATO:0000257',
    1.1080122364480558,
    'common linked ancestor number demonstrated group stipulated groups descent adoption family domestic marriage']],
  'best_entity': ('STATO:0000257', 1.1080122364480558)}]

In [84]:

# Full linking on a six-line abstract-style passage (embedded newlines, an
# inline CHEBI id and bracketed citation markers); print the highlighted text,
# then display the raw result.
text = """Natural gas consists primarily of methane that is clean, non-toxic, and has abundant natural reserves. 
However, methane is also a greenhouse gas whose greenhouse effect is more than 20 times than that of carbon dioxide. 
The conversion of methane into other value-added chemicals has been an important research area in the field of catalysis for many years. 
One of the most challenging processes of high industrial importance is the conversion of methane to methanol (CHEBI:17790), 
a simple alcohol that is liquid under ambient conditions and can be easily stored and transported compared to methane [1]. 
Methanol is used as an important chemical raw material to make products such as paints and plastics and as an additive to gasoline [2]."""

mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions



Natural [41m[30mgas[0m consists primarily of [42m[30mmethane[0m that is clean, non-[43m[30mtoxic[0m, and has abundant natural reserves. 
However, [44m[30mmethane[0m is also a greenhouse [45m[30mgas[0m whose greenhouse effect is more than 20 times than that of [46m[30mcarbon[0m dioxide. 
The conversion of [41m[30mmethane[0m into other value-added chemicals has been an important research [42m[30marea[0m in the [43m[30mfield[0m of catalysis for many years. 
One of the most challenging [44m[30mprocesses[0m of high industrial importance is the conversion of [45m[30mmethane[0m to [46m[30mmethanol[0m (CHEBI:17790), 
a [41m[30msimple[0m [42m[30malcohol[0m that is [43m[30mliquid[0m under ambient conditions and can be easily stored and transported compared to [44m[30mmethane[0m [1]. 
Methanol is used as an important chemical raw material to make products such as paints and plastics and as an additive to gasoline [2].



[{'end_pos': 11,
  'text': 'gas',
  'start_pos': 8,
  'entities': [['GO:0034005',
    0.6022665026762707,
    'reaction -(r)-germacrene farnesyl germacrene forming -(10r)-germacrene = gas catalysis synthase -germacrene diphosphate activity trans 2 + trans,6 lyase'],
   ['PATO:0001737',
    0.604881766020379,
    'configuration having quality arrangement virtue parts characteristics exhibits gas inhering gaseus physical gases bearer']],
  'best_entity': ('PATO:0001737', 0.604881766020379)},
 {'text': 'methane',
  'start_pos': 34,
  'end_pos': 41,
  'entities': [['CHEBI:16183',
    1.9526549278311631,
    'metano single ch4 methan odourless non toxic flammable marsh atoms colourless b.p bonds gas attached carbon methane -161degreec methyl compound tetrahydridocarbon hydrogen hydride']],
  'best_entity': ('CHEBI:16183', 1.9526549278311631)},
 {'start_pos': 61,
  'text': 'toxic',
  'end_pos': 66,
  'entities': [['SIO:001009', 1.2813017861774334, 'toxic']],
  'best_entity': ('SIO:001009', 1

In [21]:
# Link the gut-microbiome title again, print the highlighted text, and print
# (rather than display) the raw result.
text = "Altered gut microbiome composition in HIV infection: causes, effects and potential intervention."
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
print(mentions)


terms_freq[term] 14
self.len_terms_collection 3506008
p_t_theta 3.9929857611275005e-06
p_t_Eps 3.9931454805579454e-06
terms_freq[term] 44
self.len_terms_collection 3506008
p_t_theta 1.2549383820686427e-05
p_t_Eps 1.2549885796039255e-05
terms_freq[term] 2
self.len_terms_collection 3506008
p_t_theta 1.0570026553303288e-05
p_t_Eps 5.704493543654208e-07
terms_freq[term] 42
self.len_terms_collection 3506008
p_t_theta 1.1978957283382502e-05
p_t_Eps 1.1979436441673835e-05
terms_freq[term] 3
self.len_terms_collection 3506008
p_t_theta 8.556398059558929e-07
p_t_Eps 8.556740315481311e-07
mention microbiome
P(e|m) 1.0
P(q|e) 1.338994983561869
terms_freq[term] 14
self.len_terms_collection 3506008
p_t_theta 1.3990767050159417e-05
p_t_Eps 3.9931454805579454e-06
terms_freq[term] 44
self.len_terms_collection 3506008
p_t_theta 1.2547752678083982e-05
p_t_Eps 1.2549885796039255e-05
terms_freq[term] 2
self.len_terms_collection 3506008
p_t_theta 5.703523944583629e-07
p_t_Eps 5.704493543654208e-07
terms_fre

In [40]:
# Full linking on an everyday sentence where 'gut' is used idiomatically;
# print the highlighted text and the raw result.
text = "Batman advices to follow your gut when you eat food"
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
print(mentions)


mention gut
score_e_q_m 0.7159732384208091
mention gut
score_e_q_m 0.3521147213636813
mention gut
score_e_q_m 0.17605618709910878
mention gut
score_e_q_m 0.08803132109570196
mention gut
score_e_q_m 0.17605794748494133
mention food
score_e_q_m 1.6267142776939445

Batman advices to follow your [41m[30mgut[0m when you eat [42m[30mfood[0m

[{'start_pos': 30, 'end_pos': 33, 'text': 'gut', 'entities': [['UBERON:0001007', 0.7159732384208091, 'organs gastrointestinal tract devoted food residual digestion anatomical parts ingestion discharge assimilation digestive alimentary gut wastes system'], ['UBERON:0001555', 0.3521147213636813, 'anus tract digestive mouth alimentary canal enteric extending gut tube'], ['UBERON:0004907', 0.17605618709910878, 'gi gastrointestinal tract intestines definition region digestive lower beginning extending gut anus[go'], ['MA:0000917', 0.08803132109570196, 'gut'], ['ZFA:0000112', 0.17605794748494133, 'organs tract associated alimentary digestive canal enteri

In [41]:
# Full linking on a sentence with misspelled words; the recorded run returned
# an empty list and printed a "Failed to identify entity" message.
text = "There woulds nos evicence to support the claim"
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
print(mentions)


Failed to identify entity from text:

There woulds nos evicence to support the claim

[]
