In [1]:
%load_ext autoreload
%autoreload 2


In [2]:
"""Computing the score 
    score(e; q, m) = P(e | q, m) ∝ P(e | m) · P(q | e)
    where e is a candidate entity, m the mention and q the query text.
"""
from termcolor import colored
from tabulate import tabulate
import pickle5 as pickle

import time
from dkoulinker.entity_linker import EntityLinker, get_mentions_ner
from flair.models import SequenceTagger
from dkoulinker.entity_ranking import DictionaryRanking, QueryEntityRanking
from dkoulinker.utils import _print_colorful_text


In [3]:
# Load the lookup tables used by the linker.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that were produced by this project / come from a trusted source.
# Each file is opened in a `with` block so the handle is closed as soon as the
# table is in memory (the original left all three handles open).

# Commonness dictionary: mention -> {entity_id: prior}, e.g.
# mention2pem['reproduction'] == {'GO:0000003': 1.0}.
print('Loading mention2pem dictionary ...')
with open('../data/pem/pem.pickle', 'rb') as handle:
    mention2pem = pickle.load(handle)


# Entity descriptions: entity_id -> tuple of description terms, or '#UNK'
# when no description is available.
print('Loading entity description dictionary ...')
with open('../data/pem/entity2description.pickle', 'rb') as handle_desc:
    entity2description = pickle.load(handle_desc)
print('NUmber of entities: ', len(entity2description))

# Term/mention frequency table for the collection.
print('Loading dictionary of term frequency ...')
with open('../data/pem/mention_freq.pickle', 'rb') as handle_desc:
    mention2freq = pickle.load(handle_desc)
print('Number of term in the collection: ', len(mention2freq))

# given by create_term_req
# NOTE(review): this is the number of mentions in mention2pem, not
# len(mention2freq) printed above as the collection term count — confirm
# which size is intended.
collection_size_terms = len(mention2pem)


Loading mention2pem dictionary ...
Loading entity description dictionary ...
NUmber of entities:  2680002
Loading dictionary of term frequency ...
Number of term in the collection:  3506008


In [4]:
list(mention2pem['reproduction'].keys())

['GO:0000003']

In [5]:
list(mention2pem['reproduction'].values())


[1.0]

In [6]:
entity2description['GO:0005578']


'#UNK'

In [7]:
entity2description['GO:0031012']


('structure',
 'cells',
 'biochemical',
 'tissues',
 'biomechanical',
 'lying',
 'proteinaceous',
 'structural',
 'extracellular',
 'support',
 'matrisome',
 'cues',
 'matrix',
 'external',
 'provides')

In [4]:
# Load the Flair sequence tagger used for NER-based mention detection.
# The checkpoint path is relative to the notebook's directory; the tagger is
# later passed to EntityLinker as `ner_model`.
tagger = SequenceTagger.load(
    '../resources/taggers/sota-ner-flair/best-model.pt')


2021-12-14 16:40:33,002 loading file ../resources/taggers/sota-ner-flair/best-model.pt


In [6]:
# Run only the NER step on a two-sentence sample to inspect the raw mentions.
# The recorded result is a pair (surface strings, mention dicts with
# text/start_pos/end_pos); only 'quaternary ammonium salt' is detected,
# nothing in the second sentence.
text='quaternary ammonium salt that is the monoiodide. sexual reproduction'
get_mentions_ner(text,tagger)

(['quaternary ammonium salt'],
 [{'text': 'quaternary ammonium salt', 'start_pos': 0, 'end_pos': 24}])

In [5]:
# Ranking strategy built from the mention -> entity prior table only.
# NOTE(review): it is not passed to EntityLinker in this cell; presumably kept
# for comparison experiments — confirm it is still needed.
dictionarysearch_strategy = DictionaryRanking(mention2pem) 
# Ranking strategy handed to the linker below. It receives the entity
# descriptions, the mention frequency table and the mention -> entity priors.
queryranking_strategy = QueryEntityRanking(
    entity2description=entity2description,
    mention_freq=mention2freq,
    mention2pem=mention2pem,
    p_t_thetae_method = 'bayesian' #Smoothing method
    )
# Linker instance shared by the rest of the notebook (`e_linker` is a
# notebook-global used by every demo cell).
e_linker = EntityLinker(
    ranking_strategy=queryranking_strategy,
    entity2description=entity2description,
    ner_model=tagger,
    mention2pem=mention2pem,
    # presumably keeps the longest span when detected mentions overlap — confirm in dkoulinker
    prune_overlapping_method='large_text',
    # presumably combines NER-based and dictionary-based mention detection — confirm in dkoulinker
    use_ner_dict=True

    )


In [9]:
def search():
    """Call ``e_linker.get_mentions_by_tokens_and_dict`` once on a fixed sentence.

    The result is discarded and ``None`` is returned: the function only exists
    so that the call can be timed with ``%%time`` / ``%timeit``.
    Relies on the notebook-global ``e_linker``.
    """
    sample_sentence = 'a quaternary ammonium salt that is the monoiodide'
    e_linker.get_mentions_by_tokens_and_dict(sample_sentence)


In [10]:
%%time
search()

CPU times: user 67 µs, sys: 330 µs, total: 397 µs
Wall time: 401 µs


In [8]:
%timeit search()


23 µs ± 230 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [12]:
# Full pipeline on the timing sentence. In the recorded output each result
# carries text/start_pos/end_pos, an 'entities' list of
# [entity_id, score, description terms] and the top-scoring 'best_entity'.
text = 'a quaternary ammonium salt that is the monoiodide'
samples = e_linker.link_entities(text)


In [13]:
samples

[{'end_pos': 26,
  'start_pos': 2,
  'text': 'quaternary ammonium salt',
  'entities': [['CHEBI:35273',
    2.3198381271481447,
    'hydrogens nitrogen salt bonded quaternary ammonium salts nh4(+))y(- univalent organyl compounds replaced usually groups compound derivatives']],
  'best_entity': ('CHEBI:35273', 2.3198381271481447)}]

In [14]:

# Render the sentence with every linked mention highlighted (ANSI colour codes).
_print_colorful_text(text,samples)
# Uncomment to also display the raw result list:
# samples



a [41m[30mquaternary ammonium salt[0m that is the monoiodide



In [15]:
# Link a paper title. In the recorded output 'gut', 'microbiome',
# 'composition', 'infection' and 'intervention' are highlighted; 'HIV' is not.
text = 'Altered gut microbiome composition in HIV infection: causes, effects and potential intervention.'
samples = e_linker.link_entities(text)
_print_colorful_text(text, samples)



Altered [41m[30mgut[0m [42m[30mmicrobiome[0m [43m[30mcomposition[0m in HIV [44m[30minfection[0m: causes, effects and potential [45m[30mintervention[0m.



In [64]:
import json

In [65]:
# Check that the linker output is JSON-serialisable. Note that the
# 'best_entity' tuple comes out as a JSON array, so a round trip yields a list.
jsonStr = json.dumps(samples)
print(jsonStr)


[{"start_pos": 8, "end_pos": 11, "text": "gut", "entities": [["UBERON:0001007", 0.8700926733965282, "organs gastrointestinal tract devoted food residual digestion anatomical parts ingestion discharge assimilation digestive alimentary gut wastes system"], ["UBERON:0001555", 0.7340052681185107, "anus tract digestive mouth alimentary canal enteric extending gut tube"], ["UBERON:0004907", 0.36037230161509565, "gi gastrointestinal tract intestines definition region digestive lower beginning extending gut anus[go"], ["MA:0000917", 0.23101364089906892, "gut"], ["ZFA:0000112", 0.3708896865541044, "organs tract associated alimentary digestive canal enteric gut tube"]], "best_entity": ["UBERON:0001007", 0.8700926733965282]}, {"end_pos": 22, "text": "microbiome", "start_pos": 12, "entities": [["EFO:0004982", 3.4203408798620867, "microbiome microorganisms collectivity collection"]], "best_entity": ["EFO:0004982", 3.4203408798620867]}, {"end_pos": 34, "start_pos": 23, "text": "composition", "entiti

In [66]:
from dkoulinker.utils import create_html_entities

In [67]:
# Build HTML markup for the linked sentence; the cell output is the raw HTML
# string, with each mention wrapped in <mark> and labelled with its entity id.
# NOTE(review): the recorded markup has style="line-height: 2.5 direction: ltr"
# (no ';' between the two declarations) — looks like a bug in
# create_html_entities; confirm in dkoulinker.utils.
text = 'a quaternary ammonium salt that is the monoiodide'
samples = e_linker.link_entities(text)
create_html_entities(text, samples)


'<div class="entities" style="line-height: 2.5 direction: ltr">a <mark class="entity" style="background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">quaternary ammonium salt <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">CHEBI:35273</span> </mark>\n     that is the monoiodide</div>'

In [68]:
samples

[{'end_pos': 26,
  'start_pos': 2,
  'text': 'quaternary ammonium salt',
  'entities': [['CHEBI:35273',
    117.02924314033649,
    'hydrogens nitrogen salt bonded quaternary ammonium salts nh4(+))y(- univalent organyl compounds replaced usually groups compound derivatives']],
  'best_entity': ('CHEBI:35273', 117.02924314033649)}]

In [16]:
# Mention detection only (no entity ranking) on a two-word phrase.
# In the recorded output only 'reproduction' is highlighted, not the full
# phrase 'sexual reproduction'.
text='sexual reproduction'
samples = e_linker.get_mentions_by_tokens_and_dict(text)
_print_colorful_text(text, samples)



sexual [41m[30mreproduction[0m



In [17]:
# Mention detection only, with a capitalised first word. The recorded result
# is a list of dicts with text/start_pos/end_pos: 'ammonium' and 'salt' are
# returned as separate mentions and 'Quaternary' is not part of any mention.
# NOTE(review): this suggests the matching is case-sensitive — confirm.
text='Quaternary ammonium salt that is the monoiodide'
samples = e_linker.get_mentions_by_tokens_and_dict(text)
_print_colorful_text(text, samples)
samples


Quaternary [41m[30mammonium[0m [42m[30msalt[0m that is the monoiodide



[{'text': 'ammonium', 'start_pos': 11, 'end_pos': 19},
 {'text': 'salt', 'start_pos': 20, 'end_pos': 24}]

In [18]:
# Full pipeline on two capitalised sentences. Recorded output: 'ammonium',
# 'salt' and 'reproduction' are linked; 'Quaternary' and 'Sexual' are not.
# Scores are not bounded to [0, 1] (e.g. 'salt' -> CHEBI:24866 scores ~209).
text = 'Quaternary ammonium salt that is the monoiodide. Sexual reproduction'
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions


Quaternary [41m[30mammonium[0m [42m[30msalt[0m that is the monoiodide. Sexual [43m[30mreproduction[0m



[{'start_pos': 11,
  'text': 'ammonium',
  'end_pos': 19,
  'entities': [['CHEBI:28938',
    1.3237396584364176,
    'protonation ammonia + nh4(+ nh4 ammonium ammonium(1 onium nh4](+ ion azanium cation obtained']],
  'best_entity': ('CHEBI:28938', 1.3237396584364176)},
 {'text': 'salt',
  'start_pos': 20,
  'end_pos': 24,
  'entities': [['CHEBI:26710', 0.37499625005624904, 'sodium chloride'],
   ['CHEBI:24866',
    209.1498204223953,
    'sel sels salt anions ionic salze salts compounds cations assembly compound sal salz sales']],
  'best_entity': ('CHEBI:24866', 209.1498204223953)},
 {'start_pos': 56,
  'end_pos': 68,
  'text': 'reproduction',
  'entities': [['GO:0000003',
    1.348232971990509,
    'process inherited material genetic reproductive individuals contain production parent organisms reproduction new physiological portion']],
  'best_entity': ('GO:0000003', 1.348232971990509)}]

In [19]:
# Longer lower-case sentence with 'reduced' placed after 'reproduction'.
# Recorded output links 'quaternary ammonium salt' (as one mention),
# 'species', 'reproduction' and 'reduced'.
text='quaternary ammonium salt that is the monoiodide that can be found in some species with sexual reproduction reduced'
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions



[41m[30mquaternary ammonium salt[0m that is the monoiodide that can be found in some [42m[30mspecies[0m with sexual [43m[30mreproduction[0m [44m[30mreduced[0m



[{'start_pos': 0,
  'text': 'quaternary ammonium salt',
  'end_pos': 24,
  'entities': [['CHEBI:35273',
    1.4534579639500007,
    'hydrogens nitrogen salt bonded quaternary ammonium salts nh4(+))y(- univalent organyl compounds replaced usually groups compound derivatives']],
  'best_entity': ('CHEBI:35273', 1.4534579639500007)},
 {'start_pos': 74,
  'end_pos': 81,
  'text': 'species',
  'entities': [['OBI:0100026', 0.9987614057151663, 'organism'],
   ['MCCV_000003', 0.0012359720987304887, 'species'],
   ['NCBITaxon:species', 0.0006179860493652444, 'species']],
  'best_entity': ('OBI:0100026', 0.9987614057151663)},
 {'end_pos': 106,
  'text': 'reproduction',
  'start_pos': 94,
  'entities': [['GO:0000003',
    1.2203838511657648,
    'process inherited material genetic reproductive individuals contain production parent organisms reproduction new physiological portion']],
  'best_entity': ('GO:0000003', 1.2203838511657648)},
 {'start_pos': 107,
  'text': 'reduced',
  'end_pos': 114,
  

In [20]:
# Word-order variant of the previous sentence ('reduced' before 'sexual
# reproduction'); the same four mentions are highlighted in the recorded output.
text = 'quaternary ammonium salt that is the monoiodide that can be found in some species with reduced sexual reproduction'
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)



[41m[30mquaternary ammonium salt[0m that is the monoiodide that can be found in some [42m[30mspecies[0m with [43m[30mreduced[0m sexual [44m[30mreproduction[0m



In [89]:
# Truncated definition fragment. In the recorded output 'net' is linked to
# GO:0140644, whose description terms refer to neutrophil extracellular
# traps — likely a false positive for this sentence.
# NOTE(review): the recorded output also contains 'mention ... P(e|m) ...
# P(q|e)' debug lines and an out-of-sequence execution count; it appears to
# come from a different library revision — re-run before relying on it.
text='A molecular entity having a net electric.'
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions


mention net
P(e|m) 1.0
P(q|e) 6.664883535521932
mention entity
P(e|m) 0.9901477832512315
P(q|e) 4.529518130742791
mention entity
P(e|m) 0.4187192118226601
P(q|e) 4.529518130742791
mention molecular entity
P(e|m) 1.0
P(q|e) 2.667553377723604

A [41m[30mmolecular entity[0m having a [42m[30mnet[0m electric.



[{'text': 'molecular entity',
  'end_pos': 18,
  'start_pos': 2,
  'entities': [['CHEBI:23367',
    2.667553377723604,
    'separately entite complex entities entidades pair entidad moleculaire radical entitaet molekulare distinct constitutionally distinguishable entity molecule isotopically molecular identifiable atom conformer ion etc moleculares']],
  'best_entity': ('CHEBI:23367', 2.667553377723604)},
 {'start_pos': 28,
  'end_pos': 31,
  'text': 'net',
  'entities': [['GO:0140644',
    6.664883535521932,
    'granular structure microorganisms dna microbicidal composed entrapped nuclear net granule antimicrobial framework damage chromatin extracellular neutrophil associated proteins trap histones']],
  'best_entity': ('GO:0140644', 6.664883535521932)}]

In [21]:
# Probe handling of an apostrophe in the first word; 'molecular entity' is
# linked at offsets 10-26 in the recorded output.
text="there's a molecular entity with electric charge"
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions



there's a [41m[30mmolecular entity[0m with electric charge



[{'end_pos': 26,
  'start_pos': 10,
  'text': 'molecular entity',
  'entities': [['CHEBI:23367',
    1.0405706682529128,
    'separately entite complex entities entidades pair entidad moleculaire radical entitaet molekulare distinct constitutionally distinguishable entity molecule isotopically molecular identifiable atom conformer ion etc moleculares']],
  'best_entity': ('CHEBI:23367', 1.0405706682529128)}]

In [22]:
# Same sentence without the apostrophe: the mention offsets shift by one
# (9-25) and the recorded score differs slightly, so the score depends on the
# surrounding query text and not only on the mention.
text = "theres a molecular entity with electric charge"
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions



theres a [41m[30mmolecular entity[0m with electric charge



[{'end_pos': 25,
  'start_pos': 9,
  'text': 'molecular entity',
  'entities': [['CHEBI:23367',
    1.0322774021456191,
    'separately entite complex entities entidades pair entidad moleculaire radical entitaet molekulare distinct constitutionally distinguishable entity molecule isotopically molecular identifiable atom conformer ion etc moleculares']],
  'best_entity': ('CHEBI:23367', 1.0322774021456191)}]

In [23]:
text = 'The fluoride salt of beryllium (+2 oxidation state). In the solid state it exists as a glass, with four-coordinate Be(2+) tetrahedral centres and two-coordinate fluoride centres. As a gas it adopts a linear triatomic structure and in the liquid state a fluctuating tetrahedral structure. In protein crystallography it is used as a mimic of phosphate.'
# Link the whole four-sentence chemical definition in one call; the cells
# that follow link its sentences separately so the scores can be compared.
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions



The [41m[30mfluoride salt[0m of [42m[30mberyllium[0m (+2 oxidation [43m[30mstate[0m). In the [44m[30msolid[0m [45m[30mstate[0m it exists as a [46m[30mglass[0m, with four-[41m[30mcoordinate[0m Be(2+) tetrahedral centres and two-[42m[30mcoordinate[0m [43m[30mfluoride[0m centres. As a [44m[30mgas[0m it adopts a [45m[30mlinear[0m triatomic [46m[30mstructure[0m and in the [41m[30mliquid[0m [42m[30mstate[0m a [43m[30mfluctuating[0m tetrahedral [44m[30mstructure[0m. In [45m[30mprotein[0m crystallography it is used as a mimic of [46m[30mphosphate[0m.



[{'text': 'fluoride salt',
  'end_pos': 17,
  'start_pos': 4,
  'entities': [['CHEBI:24060',
    1.143864044958866,
    'fluoride salt salts fluorides']],
  'best_entity': ('CHEBI:24060', 1.143864044958866)},
 {'text': 'beryllium',
  'start_pos': 21,
  'end_pos': 30,
  'entities': [['CHEBI:33783',
    0.3570234448946624,
    'ben metallic be(0 metal beryllium(0 beryllium'],
   ['CHEBI:30501',
    0.7140326101366956,
    'alkaline earth 4be atomic atom 4 metal number beryllium berilio']],
  'best_entity': ('CHEBI:30501', 0.7140326101366956)},
 {'start_pos': 45,
  'text': 'state',
  'end_pos': 50,
  'entities': [['SIO:000662', 1.1365954796019202, 'state']],
  'best_entity': ('SIO:000662', 1.1365954796019202)},
 {'end_pos': 65,
  'text': 'solid',
  'start_pos': 60,
  'entities': [['PATO:0001736',
    1.0949066013404958,
    'configuration having quality arrangement virtue parts characteristics exhibits inhering solid solids physical bearer']],
  'best_entity': ('PATO:0001736', 1.094906601

In [24]:
# First sentence of the definition on its own. The same mentions are found as
# in the full-text run, but the recorded scores are higher (e.g. 'fluoride
# salt' ~1.71 here vs ~1.14 with the full text as the query).
text = 'The fluoride salt of beryllium (+2 oxidation state)'
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions



The [41m[30mfluoride salt[0m of [42m[30mberyllium[0m (+2 oxidation [43m[30mstate[0m)



[{'text': 'fluoride salt',
  'end_pos': 17,
  'start_pos': 4,
  'entities': [['CHEBI:24060',
    1.7120736865784845,
    'fluoride salt salts fluorides']],
  'best_entity': ('CHEBI:24060', 1.7120736865784845)},
 {'text': 'beryllium',
  'start_pos': 21,
  'end_pos': 30,
  'entities': [['CHEBI:33783',
    0.4387205048735839,
    'ben metallic be(0 metal beryllium(0 beryllium'],
   ['CHEBI:30501',
    0.8774209557861625,
    'alkaline earth 4be atomic atom 4 metal number beryllium berilio']],
  'best_entity': ('CHEBI:30501', 0.8774209557861625)},
 {'start_pos': 45,
  'text': 'state',
  'end_pos': 50,
  'entities': [['SIO:000662', 1.6688984143375813, 'state']],
  'best_entity': ('SIO:000662', 1.6688984143375813)}]

In [25]:
# Second sentence on its own. In the recorded output 'solid' and 'state' are
# linked as two separate mentions, and 'coordinate' is picked out of the
# hyphenated 'four-coordinate' / 'two-coordinate'.
text='In the solid state it exists as a glass, with four-coordinate Be(2+) tetrahedral centres and two-coordinate fluoride centres.'
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions



In the [41m[30msolid[0m [42m[30mstate[0m it exists as a [43m[30mglass[0m, with four-[44m[30mcoordinate[0m Be(2+) tetrahedral centres and two-[45m[30mcoordinate[0m [46m[30mfluoride[0m centres.



[{'text': 'solid',
  'end_pos': 12,
  'start_pos': 7,
  'entities': [['PATO:0001736',
    1.259733685544236,
    'configuration having quality arrangement virtue parts characteristics exhibits inhering solid solids physical bearer']],
  'best_entity': ('PATO:0001736', 1.259733685544236)},
 {'text': 'state',
  'start_pos': 13,
  'end_pos': 18,
  'entities': [['SIO:000662', 1.3853066100438254, 'state']],
  'best_entity': ('SIO:000662', 1.3853066100438254)},
 {'text': 'glass',
  'end_pos': 39,
  'start_pos': 34,
  'entities': [['CHEBI:131189',
    1.3852310552144478,
    'amorphous polymerous sodium transparent glass oxides brittle inorganic silicate basic usually hard potassium']],
  'best_entity': ('CHEBI:131189', 1.3852310552144478)},
 {'text': 'coordinate',
  'end_pos': 61,
  'start_pos': 51,
  'entities': [['SIO:000071', 1.1770515550615042, 'coordinate']],
  'best_entity': ('SIO:000071', 1.1770515550615042)},
 {'text': 'coordinate',
  'start_pos': 97,
  'end_pos': 107,
  'entities': 

In [26]:
# Third sentence on its own; the leading space is part of the input.
# In the recorded output the two candidates for 'gas' are almost tied
# (GO:0034005 ~0.651912 vs PATO:0001737 ~0.651926), so the chosen
# 'best_entity' is decided by a very small margin.
text = ' As a gas it adopts a linear triatomic structure and in the liquid state a fluctuating tetrahedral structure.'
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions



 As a [41m[30mgas[0m it adopts a [42m[30mlinear[0m triatomic [43m[30mstructure[0m and in the [44m[30mliquid[0m [45m[30mstate[0m a [46m[30mfluctuating[0m tetrahedral [41m[30mstructure[0m.



[{'start_pos': 6,
  'text': 'gas',
  'end_pos': 9,
  'entities': [['GO:0034005',
    0.6519118198573883,
    'reaction -(r)-germacrene farnesyl germacrene forming -(10r)-germacrene = gas catalysis synthase -germacrene diphosphate activity trans 2 + trans,6 lyase'],
   ['PATO:0001737',
    0.651926041267722,
    'configuration having quality arrangement virtue parts characteristics exhibits gas inhering gaseus physical gases bearer']],
  'best_entity': ('PATO:0001737', 0.651926041267722)},
 {'text': 'linear',
  'end_pos': 28,
  'start_pos': 22,
  'entities': [['PATO:0001199',
    1.1909946650489613,
    'opposite quality narrow virtue inhering margins linear parallel shape bearer']],
  'best_entity': ('PATO:0001199', 1.1909946650489613)},
 {'text': 'structure',
  'end_pos': 48,
  'start_pos': 39,
  'entities': [['PATO:0000141',
    1.01956804705477,
    'structure morphology quality form arrangements connectivity virtue relative relational position organism parts inhering pattern confor

In [27]:
# Fourth sentence on its own. The recorded candidate scores for 'protein'
# range from ~0.24 to ~93.3, i.e. scores are unnormalised and only meaningful
# for ranking candidates of the same mention.
text = 'In protein crystallography it is used as a mimic of phosphate.'
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions



In [41m[30mprotein[0m crystallography it is used as a mimic of [42m[30mphosphate[0m.



[{'end_pos': 10,
  'start_pos': 3,
  'text': 'protein',
  'entities': [['CHEBI:16541',
    0.24210837578107897,
    'chains synthesized occurring naturally protein chain ribosome polypeptide'],
   ['CHEBI:36080',
    14.405333126219205,
    'biological synthesized proteins protein macromolecule consisting chain minimally ribosome polypeptide'],
   ['PR:000000001', 93.33539210040469, 'protein'],
   ['SIO:010043', 0.24211515460545963, 'protein']],
  'best_entity': ('PR:000000001', 93.33539210040469)},
 {'text': 'phosphate',
  'start_pos': 52,
  'end_pos': 61,
  'entities': [['CHEBI:35780',
    0.4541001912081851,
    'oxoanion base conjugate phosphorus ion pi phosphoric phosphate ions acid'],
   ['CHEBI:18367',
    0.25948374789757,
    'hydrogenphosphate base conjugate tetraoxidophosphate(3- po4](3- tetraoxophosphate(v orthophosphate ion phosphate(3- po4(3- tetraoxophosphate(3- phosphate'],
   ['CHEBI:43474',
    0.12973979838598487,
    'hydrogenphosphate base hydrogentetraoxophosphate

In [28]:
text = """The process in which an antigen-presenting cell expresses peptide antigen in 
association with an MHC protein complex on its cell surface, including proteolysis and 
transport steps for the peptide antigen both prior to and following assembly with the MHC protein complex.
 The peptide antigen is typically, but not always, processed from an endogenous or exogenous protein."""

# Link a multi-line (newline-containing) definition.
# NOTE(review): in the recorded output the highlight on the last line covers
# ' peptide antige' instead of 'peptide antigen' (shifted one character left),
# while other mentions are aligned — presumably an offset bug for some
# mentions when the text contains newlines/leading whitespace; confirm in
# dkoulinker.
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions



The [41m[30mprocess[0m in which an [42m[30mantigen[0m-presenting [43m[30mcell[0m expresses [44m[30mpeptide[0m [45m[30mantigen[0m in 
[46m[30massociation[0m with an MHC [41m[30mprotein[0m [42m[30mcomplex[0m on its [43m[30mcell[0m [44m[30msurface[0m, including [45m[30mproteolysis[0m and 
[46m[30mtransport[0m steps for the [41m[30mpeptide[0m [42m[30mantigen[0m both prior to and following assembly with the MHC [43m[30mprotein[0m [44m[30mcomplex[0m.
 The[45m[30m peptide antige[0mn is typically, but not always, processed from an endogenous or exogenous [46m[30mprotein[0m.



[{'text': 'process',
  'end_pos': 11,
  'start_pos': 4,
  'entities': [['BFO:0000015',
    0.9871253304300318,
    'depends_on p t. material t occurrent parts proper temporal entity time s process'],
   ['UBERON:0004529',
    0.06873829428303153,
    'larger papilla processes anatomical lamella organ spine shelf flanges outgrowth processus flange body lamellae lamina tissue shelves ridges protrusion process projection projections ridge laminae'],
   ['ProcessualEntity', 0.03299741649915768, 'process'],
   ['SIO:000006', 0.3657213661989977, 'process'],
   ['ZFA:0001637',
    0.013748483604798002,
    'bone process extension tissue bony portion connective projection projections']],
  'best_entity': ('BFO:0000015', 0.9871253304300318)},
 {'end_pos': 31,
  'text': 'antigen',
  'start_pos': 24,
  'entities': [['CHEBI:59132',
    1.0132795614515877,
    'cell response antigen receptor histocompability antigens complex mhc antibody t stimulates presentation substance production major binding 

In [29]:
# Out-of-domain everyday sentence. Recorded output links 'tiger', 'mammal'
# and 'family'.
# NOTE(review): 'carnovire' is misspelled in the input and is not linked; the
# string is left unchanged so the recorded output stays valid.
text = 'A tiger is a carnovire of the mammal family.'
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions



A [41m[30mtiger[0m is a carnovire of the [42m[30mmammal[0m [43m[30mfamily[0m.



[{'text': 'tiger',
  'start_pos': 2,
  'end_pos': 7,
  'entities': [['NCBITaxon:9694',
    1.4001970292471355,
    'panthera tiger tigris']],
  'best_entity': ('NCBITaxon:9694', 1.4001970292471355)},
 {'start_pos': 30,
  'text': 'mammal',
  'end_pos': 36,
  'entities': [['NCBITaxon:40674', 0.9999940000479997, 'mammalia']],
  'best_entity': ('NCBITaxon:40674', 0.9999940000479997)},
 {'end_pos': 43,
  'start_pos': 37,
  'text': 'family',
  'entities': [['NCBITaxon:family', 0.5540460053084098, 'family'],
   ['SIO:001063', 0.5540460053084098, 'family'],
   ['STATO:0000257',
    1.1080122364480558,
    'common linked ancestor number demonstrated group stipulated groups descent adoption family domestic marriage']],
  'best_entity': ('STATO:0000257', 1.1080122364480558)}]

In [30]:

text = """Natural gas consists primarily of methane that is clean, non-toxic, and has abundant natural reserves. 
However, methane is also a greenhouse gas whose greenhouse effect is more than 20 times than that of carbon dioxide. 
The conversion of methane into other value-added chemicals has been an important research area in the field of catalysis for many years. 
One of the most challenging processes of high industrial importance is the conversion of methane to methanol (CHEBI:17790), 
a simple alcohol that is liquid under ambient conditions and can be easily stored and transported compared to methane [1]. 
Methanol is used as an important chemical raw material to make products such as paints and plastics and as an additive to gasoline [2]."""

# Link a six-line abstract-style paragraph. In the recorded output lower-case
# 'methanol' is linked but the capitalised 'Methanol' on the last line is not,
# and that last line has no highlighted mentions at all; 'carbon' is
# highlighted without 'dioxide'.
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions



Natural [41m[30mgas[0m consists primarily of [42m[30mmethane[0m that is clean, non-[43m[30mtoxic[0m, and has abundant natural reserves. 
However, [44m[30mmethane[0m is also a greenhouse [45m[30mgas[0m whose greenhouse effect is more than 20 times than that of [46m[30mcarbon[0m dioxide. 
The conversion of [41m[30mmethane[0m into other value-added chemicals has been an important research [42m[30marea[0m in the [43m[30mfield[0m of catalysis for many years. 
One of the most challenging [44m[30mprocesses[0m of high industrial importance is the conversion of [45m[30mmethane[0m to [46m[30mmethanol[0m (CHEBI:17790), 
a [41m[30msimple[0m [42m[30malcohol[0m that is [43m[30mliquid[0m under ambient conditions and can be easily stored and transported compared to [44m[30mmethane[0m [1]. 
Methanol is used as an important chemical raw material to make products such as paints and plastics and as an additive to gasoline [2].



[{'start_pos': 8,
  'text': 'gas',
  'end_pos': 11,
  'entities': [['GO:0034005',
    0.5257933837775433,
    'reaction -(r)-germacrene farnesyl germacrene forming -(10r)-germacrene = gas catalysis synthase -germacrene diphosphate activity trans 2 + trans,6 lyase'],
   ['PATO:0001737',
    0.5257973719319609,
    'configuration having quality arrangement virtue parts characteristics exhibits gas inhering gaseus physical gases bearer']],
  'best_entity': ('PATO:0001737', 0.5257973719319609)},
 {'start_pos': 34,
  'text': 'methane',
  'end_pos': 41,
  'entities': [['CHEBI:16183',
    1.170687657091637,
    'metano single ch4 methan odourless non toxic flammable marsh atoms colourless b.p bonds gas attached carbon methane -161degreec methyl compound tetrahydridocarbon hydrogen hydride']],
  'best_entity': ('CHEBI:16183', 1.170687657091637)},
 {'text': 'toxic',
  'end_pos': 66,
  'start_pos': 61,
  'entities': [['SIO:001009', 1.0637619798059499, 'toxic']],
  'best_entity': ('SIO:001009', 1

In [31]:
# Same paper title as linked earlier in the notebook; here the raw result
# list is printed as plain text instead of being shown as the cell value.
text = "Altered gut microbiome composition in HIV infection: causes, effects and potential intervention."
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
print(mentions)



Altered [41m[30mgut[0m [42m[30mmicrobiome[0m [43m[30mcomposition[0m in HIV [44m[30minfection[0m: causes, effects and potential [45m[30mintervention[0m.

[{'end_pos': 11, 'start_pos': 8, 'text': 'gut', 'entities': [['UBERON:0001007', 0.4048229062718224, 'organs gastrointestinal tract devoted food residual digestion anatomical parts ingestion discharge assimilation digestive alimentary gut wastes system'], ['UBERON:0001555', 0.3238696587271261, 'anus tract digestive mouth alimentary canal enteric extending gut tube'], ['UBERON:0004907', 0.161933210201473, 'gi gastrointestinal tract intestines definition region digestive lower beginning extending gut anus[go'], ['MA:0000917', 0.08097105809703307, 'gut'], ['ZFA:0000112', 0.16193563896282193, 'organs tract associated alimentary digestive canal enteric gut tube']], 'best_entity': ('UBERON:0001007', 0.4048229062718224)}, {'text': 'microbiome', 'start_pos': 12, 'end_pos': 22, 'entities': [['EFO:0004982', 1.338994983561869, 'mic

In [32]:
# Ambiguity probe: 'gut' is used figuratively here, yet the recorded output
# still links it to the anatomical entity UBERON:0001007 as best candidate.
text = "Batman advices to follow your gut when you eat food"
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
print(mentions)



Batman advices to follow your [41m[30mgut[0m when you eat [42m[30mfood[0m

[{'end_pos': 33, 'start_pos': 30, 'text': 'gut', 'entities': [['UBERON:0001007', 0.7159732384208091, 'organs gastrointestinal tract devoted food residual digestion anatomical parts ingestion discharge assimilation digestive alimentary gut wastes system'], ['UBERON:0001555', 0.3521147213636813, 'anus tract digestive mouth alimentary canal enteric extending gut tube'], ['UBERON:0004907', 0.17605618709910878, 'gi gastrointestinal tract intestines definition region digestive lower beginning extending gut anus[go'], ['MA:0000917', 0.08803132109570196, 'gut'], ['ZFA:0000112', 0.17605794748494133, 'organs tract associated alimentary digestive canal enteric gut tube']], 'best_entity': ('UBERON:0001007', 0.7159732384208091)}, {'text': 'food', 'start_pos': 47, 'end_pos': 51, 'entities': [['CHEBI:33290', 1.6267142776939445, 'food']], 'best_entity': ('CHEBI:33290', 1.6267142776939445)}]


In [33]:
text = """
Coronavirus disease (COVID-19) first presented in Wuhan, Hubei province, China in December 2019. Since then, it has rapidly spread across the world, and is now formally considered a pandemic. The disease does not discriminate but increasing age and the presence of comorbidities are associated with severe form of the disease and poor outcomes. Although the prevalence of COVID-19 in patients with cardiovascular disease is under-reported, there is evidence that pre-existing cardiac disease can render individuals vulnerable. It is thought that COVID-19 may have both a direct and indirect effect on the cardiovascular system; however, the primary mechanism of underlying cardiovascular involvement is still uncertain. Of particular interest is the role of angiotensin-converting enzyme 2, which is well known for its cardiovascular effects and is also considered to be important in the pathogenesis of COVID-19. With a range of different drug candidates being suggested, effective anti-virals and vaccines are an area of on-going research. While our knowledge of COVID-19 continues to rapidly expand, this review highlights recent advances in our understanding of the interaction between COVID-19 and the cardiovascular system. 
"""
# Link a full abstract whose string starts with a newline (Flair logs
# "Ignore 1 sentence(s) with no tokens" for it).
# NOTE(review): in the recorded output several highlights are shifted one
# character left (' diseas' instead of 'disease') while others are aligned —
# presumably some mention offsets do not account for the leading newline;
# confirm in dkoulinker.
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
print(mentions)


2021-12-14 16:51:51,609 Ignore 1 sentence(s) with no tokens.


Coronavirus[41m[30m diseas[0me (COVID-19) first presented in Wuhan, Hubei [42m[30mprovince[0m, China in December 2019. Since then, it has rapidly spread across the world, and is now formally considered a pandemic. The[43m[30m diseas[0me does not [44m[30mdiscriminate[0m but increasing [45m[30mage[0m and the [46m[30mpresence[0m of comorbidities are [41m[30massociated[0m with [42m[30msevere[0m form of the [43m[30mdisease[0m and poor outcomes. Although the [44m[30mprevalence[0m of COVID-19 in patients with cardiovascular [45m[30mdisease[0m is under-reported, there is [46m[30mevidence[0m that [41m[30mpre[0m-existing cardiac[42m[30m diseas[0me can render individuals vulnerable. It is thought that COVID-19 may have both a direct and indirect effect on the cardiovascular [43m[30msystem[0m; however, the primary mechanism of underlying cardiovascular involvement is still uncertain. Of part

In [34]:
# Sample abstracts (Zika virus, then HIV/AIDS) used as input for the entity linker.
# Every abstract must be its own list element: Python silently concatenates
# adjacent string literals, so each entry has to end with a comma.
docs = [
    "Zika virus (ZIKV) is an arthropod-borne virus (arbovirus) in the genus Flavivirus and the family Flaviviridae. ZIKV was first isolated from a nonhuman primate in 1947 and from mosquitoes in 1948 in Africa, and ZIKV infections in humans were sporadic for half a century before emerging in the Pacific and the Americas. ZIKV is usually transmitted by the bite of infected mosquitoes. The clinical presentation of Zika fever is nonspecific and can be misdiagnosed as other infectious diseases, especially those due to arboviruses such as dengue and chikungunya. ZIKV infection was associated with only mild illness prior to the large French Polynesian outbreak in 2013 and 2014, when severe neurological complications were reported, and the emergence in Brazil of a dramatic increase in severe congenital malformations (microcephaly) suspected to be associated with ZIKV. Laboratory diagnosis of Zika fever relies on virus isolation or detection of ZIKV-specific RNA. Serological diagnosis is complicated by cross-reactivity among members of the Flavivirus genus. The adaptation of ZIKV to an urban cycle involving humans and domestic mosquito vectors in tropical areas where dengue is endemic suggests that the incidence of ZIKV infections may be underestimated. There is a high potential for ZIKV emergence in urban centers in the tropics that are infested with competent mosquito vectors such as Aedes aegypti and Aedes albopictus. ",
    "Zika virus is a mosquitoborne flavivirus that is the focus of an ongoing pandemic and public health emergency. Previously limited to sporadic cases in Africa and Asia, the emergence of Zika virus in Brazil in 2015 heralded rapid spread throughout the Americas. Although most Zika virus infections are characterized by subclinical or mild influenza-like illness, severe manifestations have been described, including Guillain-Barre syndrome in adults and microcephaly in babies born to infected mothers. Neither an effective treatment nor a vaccine is available for Zika virus; therefore, the public health response primarily focuses on preventing infection, particularly in pregnant women. Despite growing knowledge about this virus, questions remain regarding the virus's vectors and reservoirs, pathogenesis, genetic diversity, and potential synergistic effects of co-infection with other circulating viruses. These questions highlight the need for research to optimize surveillance, patient management, and public health intervention in the current Zika virus epidemic. ",
    "The emergence of Zika virus (ZIKV) as a major public health threat has focused research on understanding virus biology and developing a suite of strategies for disease intervention. Recent advances in cryoelectron microscopy have accelerated structure-function studies of flaviviruses and of ZIKV in particular. Structures of the mature and immature ZIKV have demonstrated its similarity with other known flaviviruses such as dengue and West Nile viruses. However, ZIKV's unique pathobiology demands an explanation of how its structure, although similar to its flavivirus relatives, is sufficiently unique to address questions of receptor specificity, transmission, and antigenicity. Progress in defining the immunodominant epitopes and how neutralizing antibodies bind to them will provide great insight as vaccines progress through clinical trials. Identification of host receptors will substantially illuminate the interesting ZIKV tropism and provide insights into pathogenesis. Although the answers to all of these questions are not yet available, rapid progress in combining structural biology with other techniques is revealing the similarities and the differences in virion structure and function between ZIKV and related flaviviruses. ",
    "AIDS is a syndrome that represents the most severe form of infection with the retrovirus HIV. Opportunistic infections, uncommon malignant neoplasms, and intractable immunologic deficiency are hallmarks of AIDS. To date, the syndrome in the U.S. is seen mainly in epidemiologically restricted populations. It appears to be transmitted in a manner analogous to hepatitis B. There is no available means of reversing the immune deficit. Therapy is restricted to treating the complicating infections and tumors. Those having been diagnosed with P. carinii pneumonia may benefit from therapy with AZT. The spread of HIV and the syndrome can only be reduced by preventive measures until an effective and safe vaccine can be developed. ",
    "Women are the fastest growing segment of people with acquired immunodeficiency syndrome (AIDS), yet they often receive this diagnosis when the disease is in its advanced stages. New therapies have caused human immunodeficiency virus (HIV) to become a chronic and treatable disease for many of those afflicted. Primary care providers must be cognizant of the initial symptoms to facilitate early diagnosis and prompt treatment for women with HIV. Early signs of HIV in women are subtle. Providers must consider a diagnosis of HIV in women who present with vaginal infections, abnormal pap smears, or sexually transmitted diseases that are unusually severe, recurrent, and resistant to treatment efforts. These signs and symptoms, along with a corroborating history, may be early clues to HIV. Primary care providers, in conjunction with HIV specialists, must strive to decrease the incidence, morbidity, and mortality of the disease in women.",
    "1. Acquired immunodeficiency syndrome (AIDS) in the elderly accounts for 10% of all AIDS cases; although blood transfusion is thought to be the most likely mode of transmission in the elderly, AIDS acquired through sexual contact should not be overlooked. 2. Diagnosing AIDS in the elderly is a major problem; perhaps the biggest problem in diagnosing AIDS in elders is that many clinicians fail to recognize AIDS as a possibility. 3. The nursing implications concerning AIDS in the elderly are endless. Practitioners, educators, and researchers can have a major impact in combating this problem. ",
    " Another aspect of AIDS epidemic is the public reaction or treatment of the disease and its victims and the potential for discriminatory actions. The issues of discrimination and treatment are compounded by the fact that AIDS is a disease transmitted sexually and that the disease has hit fringe groups such as homosexuals and intravenous drug users the hardest. The HIV epidemic itself is divided sharply by geography, racial, and gender lines. In the U.S., minorities and intravenous drug users are disproportionately represented according to transmission. Blacks and Hispanics, while representing only 20% of the population, make up 40% of all AIDS cases in the U.S.. Distribution of AIDS cases among the states is similarly lopsided. There is also a great gap between awareness and action. Although 90% of the respondents in a national poll understood that transmission could come through intercourse or needle sharing with an infected individual, the same percentage believed that they were in a low or non-existent risk group. The prevention of the disease requires special attention to minorities; however, the misconception that AIDS is a minority disease is an incendiary one. The public and the medical community must recognize that anyone who engages in risky sexual or drug-related behavior is at risk. AIDS has placed economic, emotional and physical stress on our economy, our work and our lives; the question of how we will deal with this stress and how, as a society, our attitudes will change still remains. ",
    "Cancer is the leading cause of death for HIV-infected persons in economically developed countries, even in the era of antiretroviral therapy (ART). Lymphomas remain a leading cause of cancer morbidity and mortality for HIV-infected patients and have increased incidence even in patients optimally treated with ART. Even limited interruptions of ART can lead to CD4 cell nadirs and HIV viremia, and increase the risk of lymphoma. The treatment of lymphoma is now similar for HIV-infected patients and the general population: patients with good HIV control can withstand intensive therapies appropriate to the lymphoma, including autologous and even allogeneic hematopoietic stem cell transplantation. Nonetheless, HIV-related lymphomas have unique aspects, including differences in lymphoma pathogenesis, driven by the presence of HIV, in addition to coinfection with oncogenic viruses. These differences might be exploited in the future to inform therapies. The relative incidences of lymphoma subtypes also differ in the HIV-infected population, and the propensity to advanced stage, aggressive presentation, and extranodal disease is higher. Other unique aspects include the need to avoid potential interactions between ART and chemotherapeutic agents, and the need for HIV-specific supportive care, such as infection prophylaxis. Despite these specific challenges for cancer treatment in the setting of HIV infection, the care of these patients has progressed sufficiently that recent guidelines from the American Society of Clinical Oncology advocate the inclusion of HIV-infected patients alongside HIV- patients in cancer clinical trials when appropriate. ",
    "HIV is a devastating disease affecting millions of people worldwide despite the advent of successful antiretroviral therapy (ART). However, ART does not result in a cure and has to be taken for life. Accordingly, researchers are turning towards cure efforts, particularly in the light of two patients whose HIV has been seemingly eradicated. Numerous approaches and strategies have been considered for curing HIV, but no scalable and safe solution has yet been reached. With newly discovered difficulties in measuring the HIV reservoir, the main barrier to a cure, the only true test of cure is to stop ART and see whether the virus becomes detectable. However, it is possible that this treatment interruption may be associated with certain risks for patients. Here, we compare the current major approaches and recent advances for curing HIV, as well as discuss ways of evaluating HIV cure and the safety concerns involved. ",
    "Combination antiretroviral therapy (ART) has significantly reduced the morbidity and mortality resulting from HIV infection. ART is, however, unable to eradicate HIV, which persists latently in several cell types and tissues. Phylogenetic analyses suggested that the proliferation of cells infected before ART initiation is mainly responsible for residual viremia, although controversy still exists. Conversely, it is widely accepted that drug resistance mutations (DRMs) do not appear during ART in patients with suppressed viral loads. Studies based on sequence clustering have in fact indicated that, at least in developed countries, HIV-infected ART-naive patients are the major source of drug-resistant viruses. Analysis of longitudinally sampled sequences have also shown that DRMs have variable fitness costs, which are strongly influenced by the viral genetic background. ",
]


In [35]:
%%time
# Run the entity linker over each abstract; results keep the order of `docs`.
linked_docs = [e_linker.link_entities(doc) for doc in docs]



2021-12-14 16:52:54,879 Ignore 1 sentence(s) with no tokens.
2021-12-14 16:52:54,955 Ignore 1 sentence(s) with no tokens.
2021-12-14 16:52:55,015 Ignore 1 sentence(s) with no tokens.
2021-12-14 16:52:55,076 Ignore 1 sentence(s) with no tokens.
2021-12-14 16:52:55,112 Ignore 1 sentence(s) with no tokens.
2021-12-14 16:52:55,190 Ignore 1 sentence(s) with no tokens.
2021-12-14 16:52:55,255 Ignore 1 sentence(s) with no tokens.
2021-12-14 16:52:55,336 Ignore 1 sentence(s) with no tokens.
2021-12-14 16:52:55,380 Ignore 1 sentence(s) with no tokens.
CPU times: user 427 ms, sys: 121 ms, total: 548 ms
Wall time: 544 ms


In [9]:
# Show the linking results: one list per document, each holding mention dicts
# with 'start_pos', 'end_pos', 'text', candidate 'entities' and the 'best_entity'.
linked_docs

[[{'start_pos': 5,
   'end_pos': 10,
   'text': 'virus',
   'entities': [['NCBITaxon:10239', 0.9909885886035129, 'viruses'],
    ['SIO:010379', 0.009034000186803164, 'virus']],
   'best_entity': ('NCBITaxon:10239', 0.9909885886035129)},
  {'start_pos': 40,
   'end_pos': 45,
   'text': 'virus',
   'entities': [['NCBITaxon:10239', 0.9909885886035129, 'viruses'],
    ['SIO:010379', 0.009034000186803164, 'virus']],
   'best_entity': ('NCBITaxon:10239', 0.9909885886035129)},
  {'text': 'genus',
   'end_pos': 70,
   'start_pos': 65,
   'entities': [['NCBITaxon:genus', 1.036875205523338, 'genus']],
   'best_entity': ('NCBITaxon:genus', 1.036875205523338)},
  {'start_pos': 90,
   'text': 'family',
   'end_pos': 96,
   'entities': [['NCBITaxon:family', 0.3419972596866919, 'family'],
    ['SIO:001063', 0.3419972596866919, 'family'],
    ['STATO:0000257',
     0.6839746230331244,
     'common linked ancestor number demonstrated group stipulated groups descent adoption family domestic marriage']],