In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# re-imported before each cell executes, so edits to the local .py files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2


In [2]:
"""Computing the score
    score(e; q, m) = P(e | q, m) ∝ P(e | m) P(q | e)
"""
from termcolor import colored
from tabulate import tabulate
import pickle5 as pickle

import time
from entity_linker import EntityLinker, get_mentions_ner
from flair.models import SequenceTagger
from entity_ranking import DictionaryRanking, QueryEntityRanking
from utils import _print_colorful_text


In [3]:
# Load the three pre-computed lookup dictionaries from disk.
#
# Each file is opened inside a `with` block so its handle is closed as soon as
# unpickling finishes (previously the handles were left open for the lifetime
# of the kernel).
#
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.

# Dictionary of commonness: mention -> {entity id: score}.
print('Loading mention2pem dictionary ...')
with open('../data/pem/pem.pickle', 'rb') as handle:
    mention2pem = pickle.load(handle)


# Entity id -> description.
print('Loading entity description dictionary ...')
with open('../data/pem/entity2description.pickle', 'rb') as handle_desc:
    entity2description = pickle.load(handle_desc)
print('NUmber of entities: ', len(entity2description))

# Mention/term -> frequency in the collection.
print('Loading dictionary of term frequency ...')
with open('../data/pem/mention_freq.pickle', 'rb') as handle_desc:
    mention2freq = pickle.load(handle_desc)
print('Number of term in the collection: ', len(mention2freq))

# given by create_term_req
# NOTE(review): this counts the entries of mention2pem, not mention2freq --
# confirm that this is the intended collection size.
collection_size_terms = len(mention2pem)


Loading mention2pem dictionary ...
Loading entity description dictionary ...
NUmber of entities:  2680002
Loading dictionary of term frequency ...
Number of term in the collection:  3506008


In [4]:
# Candidate entity ids stored for the mention 'reproduction'.
list(mention2pem['reproduction'])

['GO:0000003']

In [5]:
# Values stored for each candidate entity of the mention 'reproduction'.
[*mention2pem['reproduction'].values()]


[1.0]

In [6]:
# Look up the stored description of entity GO:0005578.
entity2description['GO:0005578']


'#UNK'

In [7]:
# Look up the stored description of entity GO:0031012.
entity2description['GO:0031012']


('structure',
 'cells',
 'biochemical',
 'tissues',
 'biomechanical',
 'lying',
 'proteinaceous',
 'structural',
 'extracellular',
 'support',
 'matrisome',
 'cues',
 'matrix',
 'external',
 'provides')

In [4]:
# Load the trained Flair sequence tagger that is used for NER mention detection.
tagger = SequenceTagger.load('../resources/taggers/sota-ner-flair/best-model.pt')


2021-09-14 10:17:47,655 loading file ../resources/taggers/sota-ner-flair/best-model.pt


In [5]:
# Detect mentions with the NER tagger alone on a two-sentence sample.
text = "quaternary ammonium salt that is the monoiodide. sexual reproduction"
get_mentions_ner(text, tagger)

(['quaternary ammonium salt'],
 [{'text': 'quaternary ammonium salt', 'start_pos': 0, 'end_pos': 24}])

In [76]:
# Two ranking strategies are instantiated; only the query-based one is handed
# to the linker below.
dictionarysearch_strategy = DictionaryRanking(mention2pem)
queryranking_strategy = QueryEntityRanking(
    mention2pem=mention2pem,
    mention_freq=mention2freq,
    entity2description=entity2description,
)

# Entity linker combining the NER tagger, the mention dictionary and the
# query-based ranking strategy.
e_linker = EntityLinker(
    mention2pem=mention2pem,
    ner_model=tagger,
    ranking_strategy=queryranking_strategy,
)


In [57]:
# Token/dictionary-based mention detection on a lowercase sentence; the
# detected spans are printed in colour.
text = "a quaternary ammonium salt that is the monoiodide"
samples = e_linker.get_mentions_by_tokens_and_dict(text)
_print_colorful_text(text, samples)


a quaternary [41m[30mammonium[0m [42m[30msalt[0m that is the monoiodide



In [58]:
# Token/dictionary-based mention detection on a two-word phrase.
text = "sexual reproduction"
samples = e_linker.get_mentions_by_tokens_and_dict(text)
_print_colorful_text(text, samples)



sexual [41m[30mreproduction[0m



In [62]:
# Token/dictionary-based mention detection on a sentence starting with a
# capital letter; the raw mention dicts are displayed as well.
text = "Quaternary ammonium salt that is the monoiodide"
samples = e_linker.get_mentions_by_tokens_and_dict(text)
_print_colorful_text(text, samples)
samples


Quaternary [41m[30mammonium[0m [42m[30msalt[0m that is the monoiodide



[{'text': 'ammonium', 'start_pos': 11, 'end_pos': 19},
 {'text': 'salt', 'start_pos': 20, 'end_pos': 24}]

In [63]:
# Full entity linking on two sentences: highlight the linked mentions, then
# display the mention dicts returned by the linker.
text = "Quaternary ammonium salt that is the monoiodide. Sexual reproduction"
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions


Quaternary [41m[30mammonium[0m [42m[30msalt[0m that is the monoiodide. Sexual [43m[30mreproduction[0m



[{'text': 'reproduction',
  'start_pos': 56,
  'end_pos': 68,
  'entities': [('GO:0000003', 5.114009083705022)],
  'best_entity': ('GO:0000003', 5.114009083705022)},
 {'start_pos': 20,
  'text': 'salt',
  'end_pos': 24,
  'entities': [('CHEBI:26710', 0.375), ('CHEBI:24866', 790.2300610124876)],
  'best_entity': ('CHEBI:24866', 790.2300610124876)},
 {'end_pos': 19,
  'start_pos': 11,
  'text': 'ammonium',
  'entities': [('CHEBI:28938', 5.06361185964469)],
  'best_entity': ('CHEBI:28938', 5.06361185964469)}]

In [64]:
# Full entity linking on one longer, single-sentence input.
text = "quaternary ammonium salt that is the monoiodide that can be found in some species with sexual reproduction reduced"
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions



quaternary [41m[30mammonium[0m [42m[30msalt[0m that is the monoiodide that can be found in some [43m[30mspecies[0m with sexual [44m[30mreproduction[0m [45m[30mreduced[0m



[{'start_pos': 20,
  'text': 'salt',
  'end_pos': 24,
  'entities': [('CHEBI:26710', 0.375), ('CHEBI:24866', 462.08449194900214)],
  'best_entity': ('CHEBI:24866', 462.08449194900214)},
 {'start_pos': 107,
  'text': 'reduced',
  'end_pos': 114,
  'entities': [('PATO:0000587', 1.919248235352937),
   ('PATO:0001997', 1.2794988235686249)],
  'best_entity': ('PATO:0001997', 1.2794988235686249)},
 {'end_pos': 19,
  'start_pos': 11,
  'text': 'ammonium',
  'entities': [('CHEBI:28938', 2.948765741069345)],
  'best_entity': ('CHEBI:28938', 2.948765741069345)},
 {'start_pos': 94,
  'text': 'reproduction',
  'end_pos': 106,
  'entities': [('GO:0000003', 2.968299120520138)],
  'best_entity': ('GO:0000003', 2.968299120520138)},
 {'end_pos': 81,
  'text': 'species',
  'start_pos': 74,
  'entities': [('OBI:0100026', 0.998766954377312),
   ('MCCV_000003', 0.002679679196868865),
   ('NCBITaxon:species', 0.0013398395984344324)]}]

In [65]:
# Full entity linking on a sentence containing the multi-word mention
# 'molecular entity'.
text = "A molecular entity having a net electric."
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions



A [41m[30mmolecular entity[0m having a [42m[30mnet[0m electric.



[{'text': 'net',
  'end_pos': 31,
  'start_pos': 28,
  'entities': [('GO:0140644', 6.664883535521932)],
  'best_entity': ('GO:0140644', 6.664883535521932)},
 {'text': 'molecular entity',
  'start_pos': 2,
  'end_pos': 18,
  'entities': [('CHEBI:23367', 2.667553377723604)],
  'best_entity': ('CHEBI:23367', 2.667553377723604)}]

In [68]:
# Full entity linking on an input whose first word contains an apostrophe.
text = "there's a molecular entity with electric charge"
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions



there's a [41m[30mmolecular entity[0m with electric charge



[{'end_pos': 26,
  'text': 'molecular entity',
  'start_pos': 10,
  'entities': [('CHEBI:23367', 4.356820601798991)],
  'best_entity': ('CHEBI:23367', 4.356820601798991)}]

In [85]:
# Full entity linking on the same sentence with the apostrophe removed
# ('theres' instead of "there's").
text = 'theres a molecular entity with electric charge'
mentions = e_linker.link_entities(text)
_print_colorful_text(text, mentions)
mentions


('Analysing sentence:', 'theres a molecular entity with electric charge')
('NER mentions:', [{'text': 'molecular entity', 'start_pos': 9, 'end_pos': 25}])
('token mentions', [{'text': 's', 'start_pos': 5, 'end_pos': 6}, {'text': 'entity', 'start_pos': 19, 'end_pos': 25}])
('Analizing mentions:',)
({'start_pos': 9, 'end_pos': 25, 'text': 'molecular entity', 'entities': [('CHEBI:23367', 3.245904530608999)]},)
([('CHEBI:23367', 3.245904530608999)],)
([('CHEBI:23367', 3.245904530608999)],)
({'text': 's', 'start_pos': 5, 'end_pos': 6, 'entities': [('CHEBI:29999', 2.95759679036954), ('CHEBI:17115', 0.3924854915524766), ('CHEBI:17909', 0.5062741561321537), ('CHEBI:26833', 5.185600936147622), ('CHEBI:36368', 0.4814570585727065), ('UO:0000010', 0.4064118505055581)]},)
([('CHEBI:29999', 2.95759679036954), ('CHEBI:17115', 0.3924854915524766), ('CHEBI:17909', 0.5062741561321537), ('CHEBI:26833', 5.185600936147622), ('CHEBI:36368', 0.4814570585727065), ('UO:0000010', 0.4064118505055581)],)
([('CHEB

[{'end_pos': 25,
  'text': 'entity',
  'start_pos': 19,
  'entities': [('SIO:000000', 6.06683833587337),
   ('BFO:0000001', 2.56557840074247)],
  'best_entity': ('SIO:000000', 6.06683833587337)},
 {'text': 's',
  'start_pos': 5,
  'end_pos': 6,
  'entities': [('CHEBI:29999', 2.95759679036954),
   ('CHEBI:17115', 0.3924854915524766),
   ('CHEBI:17909', 0.5062741561321537),
   ('CHEBI:26833', 5.185600936147622),
   ('CHEBI:36368', 0.4814570585727065),
   ('UO:0000010', 0.4064118505055581)],
  'best_entity': ('CHEBI:26833', 5.185600936147622)}]

In [25]:
# Full entity linking on a four-sentence paragraph about beryllium fluoride.
text = 'The fluoride salt of beryllium (+2 oxidation state). In the solid state it exists as a glass, with four-coordinate Be(2+) tetrahedral centres and two-coordinate fluoride centres. As a gas it adopts a linear triatomic structure and in the liquid state a fluctuating tetrahedral structure. In protein crystallography it is used as a mimic of phosphate.'
e_linker.link_entities(text)

the fluoride salt of beryllium (+2 oxidation state). in the solid state it exists as a glass, with four-coordinate be(2+) tetrahedral centres and two-coordinate fluoride centres. as a gas it adopts a linear triatomic structure and in the liquid state a fluctuating tetrahedral structure. in protein crystallography it is used as a mimic of phosphate.
Analizing mentions: ['fluoride salt']


{'fluoride salt': ('CHEBI:24060', 2.4392465042187164)}

In [26]:
# Link only the first sentence of the beryllium fluoride paragraph (without
# its trailing period).
e_linker.link_entities('The fluoride salt of beryllium (+2 oxidation state)')


the fluoride salt of beryllium (+2 oxidation state)
Analizing mentions: ['fluoride salt']


{'fluoride salt': ('CHEBI:24060', 41.07384692550316)}

In [33]:
# Link only the second sentence of the beryllium fluoride paragraph.
e_linker.link_entities(
    'In the solid state it exists as a glass, with four-coordinate Be(2+) tetrahedral centres and two-coordinate fluoride centres.')


Anlysing sentence: in the solid state it exists as a glass, with four-coordinate be(2+) tetrahedral centres and two-coordinate fluoride centres.
Analizing mentions: []


{}

In [34]:
# Link only the third sentence of the beryllium fluoride paragraph; the input
# string keeps a leading space.
e_linker.link_entities(
    ' As a gas it adopts a linear triatomic structure and in the liquid state a fluctuating tetrahedral structure.')


Anlysing sentence: as a gas it adopts a linear triatomic structure and in the liquid state a fluctuating tetrahedral structure.
Analizing mentions: []


{}

In [32]:
# Link only the fourth sentence of the beryllium fluoride paragraph.
e_linker.link_entities('In protein crystallography it is used as a mimic of phosphate.')

Anlysing sentence: in protein crystallography it is used as a mimic of phosphate.
Analizing mentions: []


{}

In [51]:
# Full entity linking on a multi-line definition text; the triple-quoted
# string contains embedded newlines and trailing spaces that are part of the
# input passed to the linker.
text = """The process in which an antigen-presenting cell expresses peptide antigen in 
association with an MHC protein complex on its cell surface, including proteolysis and 
transport steps for the peptide antigen both prior to and following assembly with the MHC protein complex.
 The peptide antigen is typically, but not always, processed from an endogenous or exogenous protein."""

e_linker.link_entities(text)




[{'start_pos': 278,
  'end_pos': 293,
  'text': 'peptide antigen',
  'entities': [('CHEBI:166824', 1.5754689061193654)],
  'best_entity': ('CHEBI:166824', 1.5754689061193654)},
 {'end_pos': 205,
  'start_pos': 198,
  'text': 'antigen',
  'entities': [('CHEBI:59132', 2.3005372039812224),
   ('SIO:010419', 0.27572366689345784)],
  'best_entity': ('SIO:010419', 0.27572366689345784)},
 {'end_pos': 109,
  'text': 'protein',
  'start_pos': 102,
  'entities': [('CHEBI:16541', 0.2701529759025882),
   ('CHEBI:36080', 15.931271178001444),
   ('PR:000000001', 113.17678589949696),
   ('SIO:010043', 0.293584399220485)],
  'best_entity': ('SIO:010043', 0.293584399220485)},
 {'start_pos': 4,
  'text': 'process',
  'end_pos': 11,
  'entities': [('BFO:0000015', 1.2460255469628874),
   ('UBERON:0004529', 0.08467235649729467),
   ('ProcessualEntity', 0.046147671421455524),
   ('SIO:000006', 0.5114700249211321),
   ('ZFA:0001637', 0.017610971339665847)],
  'best_entity': ('ZFA:0001637', 0.0176109713396658

In [43]:
# Full entity linking on a short general-domain sentence.
# NOTE(review): 'carnovire' looks like a misspelling of 'carnivore' -- confirm
# whether the typo in the input is intentional.
e_linker.link_entities('A tiger is a carnovire of the mammal family.')

Anlysing sentence: a tiger is a carnovire of the mammal family.
Analizing mentions: ['tiger', 'mammal', 'family']


{'tiger': ('NCBITaxon:9694', 16.439691350989236),
 'mammal': ('NCBITaxon:40674', 1.0),
 'family': ('STATO:0000257', 9.707100448364667)}

In [46]:

# Full entity linking on a multi-sentence paragraph about methane and
# methanol; the triple-quoted string contains embedded newlines and trailing
# spaces that are part of the input passed to the linker.
text = """Natural gas consists primarily of methane that is clean, non-toxic, and has abundant natural reserves. 
However, methane is also a greenhouse gas whose greenhouse effect is more than 20 times than that of carbon dioxide. 
The conversion of methane into other value-added chemicals has been an important research area in the field of catalysis for many years. 
One of the most challenging processes of high industrial importance is the conversion of methane to methanol (CHEBI:17790), 
a simple alcohol that is liquid under ambient conditions and can be easily stored and transported compared to methane [1]. 
Methanol is used as an important chemical raw material to make products such as paints and plastics and as an additive to gasoline [2]."""
e_linker.link_entities(text)


Anlysing sentence: natural gas consists primarily of methane that is clean, non-toxic, and has abundant natural reserves.
Anlysing sentence: 
however, methane is also a greenhouse gas whose greenhouse effect is more than 20 times than that of carbon dioxide.
Anlysing sentence: 
the conversion of methane into other value-added chemicals has been an important research area in the field of catalysis for many years.
Anlysing sentence: 
one of the most challenging processes of high industrial importance is the conversion of methane to methanol (chebi:17790), 
a simple alcohol that is liquid under ambient conditions and can be easily stored and transported compared to methane [1].
Anlysing sentence: 
methanol is used as an important chemical raw material to make products such as paints and plastics and as an additive to gasoline [2].
Analizing mentions: ['field', 'methane', 'alcohol', 'methanol', 'simple', 'processes', 'area', 'gas', 'carbon', 'liquid', 'toxic']


{'field': ('UBERON:0007688', 1.1765244958215784),
 'methane': ('CHEBI:16183', 2.051886417753274),
 'alcohol': ('CHEBI:16236', 0.0005115270212291953),
 'methanol': ('CHEBI:17790', 1.2641306724996488),
 'simple': ('PATO:0001503', 1.2370289502089358),
 'processes': ('UBERON:0004529', 1.1592570353598035),
 'area': ('SIO:000045', 0.6359842547183276),
 'gas': ('PATO:0001737', 0.613474414998096),
 'carbon': ('CHEBI:27594', 0.3731910408796742),
 'liquid': ('PATO:0001735', 1.2884007958961678),
 'toxic': ('SIO:001009', 1.3050453744086419)}

In [43]:
# Hand-written mention spans, deliberately listed out of text order, used to
# exercise the coloured printer.
mentions = [
    {'text': 'reproduction', 'start_pos': 56, 'end_pos': 68},
    {'start_pos': 20, 'text': 'salt', 'end_pos': 24},
    {'end_pos': 19, 'start_pos': 11, 'text': 'ammonium'},
]

text = "Quaternary ammonium salt that is the monoiodide. Sexual reproduction"
_print_colorful_text(text, mentions)


Quaternary [41m[30mammonium[0m [42m[30msalt[0m that is the monoiodide. Sexual [43m[30mreproduction[0m



In [42]:
# Indices of `mentions`, ordered by where each mention starts in the text.
sort_idxs = [
    position
    for position, _mention in sorted(
        enumerate(mentions), key=lambda pair: pair[1]['start_pos']
    )
]
sort_idxs


[2, 1, 0]

[0, 1, 2]