In [1]:
%load_ext autoreload
%autoreload 2


In [8]:
"""Computing the score 
    score(e;q,m) = P (e|q,m) ∝ P (e|m)P (q|e) 
"""
from termcolor import colored
from tabulate import tabulate
import pickle5 as pickle

import time
from entity_linker import EntityLinker, get_mentions_ner
from flair.models import SequenceTagger
from entity_ranking import DictionaryRanking, QueryEntityRanking
from utils import _print_colorful_text


In [3]:
#loading dicitonary of commonness,
print('Loading mention2pem dictionary ...')
handle = open('../data/pem/pem.pickle', 'rb')
mention2pem = pickle.load(handle)


print('Loading entity description dictionary ...')
handle_desc = open('../data/pem/entity2description.pickle', 'rb')
entity2description = pickle.load(handle_desc)
print('NUmber of entities: ', len(entity2description))

print('Loading dictionary of term frequency ...')
handle_desc = open('../data/pem/mention_freq.pickle', 'rb')
mention2freq = pickle.load(handle_desc)
print('Number of term in the collection: ', len(mention2freq))

#given by create_term_req
collection_size_terms = len(mention2pem)


Loading mention2pem dictionary ...
Loading entity description dictionary ...
NUmber of entities:  2680002
Loading dictionary of term frequency ...
Number of term in the collection:  3506008


In [4]:
list(mention2pem['reproduction'].keys())

['GO:0000003']

In [5]:
list(mention2pem['reproduction'].values())


[1.0]

In [6]:
entity2description['GO:0005578']


'#UNK'

In [7]:
entity2description['GO:0031012']


('structure',
 'cells',
 'biochemical',
 'tissues',
 'biomechanical',
 'lying',
 'proteinaceous',
 'structural',
 'extracellular',
 'support',
 'matrisome',
 'cues',
 'matrix',
 'external',
 'provides')

In [4]:
# load the NER tagger
tagger = SequenceTagger.load(
    '../resources/taggers/sota-ner-flair/best-model.pt')


2021-09-13 11:16:58,088 loading file ../resources/taggers/sota-ner-flair/best-model.pt


In [5]:
text='quaternary ammonium salt that is the monoiodide. sexual reproduction'
get_mentions_ner(text,tagger)

(['quaternary ammonium salt'],
 [{'text': 'quaternary ammonium salt', 'start_pos': 0, 'end_pos': 24}])

In [9]:
dictionarysearch_strategy = DictionaryRanking(mention2pem) 
queryranking_strategy = QueryEntityRanking(
    entity2description=entity2description,
    mention_freq=mention2freq,
    mention2pem=mention2pem
    )
e_linker = EntityLinker(
    ranking_strategy=queryranking_strategy,
    ner_model=tagger,
    mention2pem=mention2pem
    )


In [10]:
text = 'a quaternary ammonium salt that is the monoiodide'
samples = e_linker.get_mentions_by_tokens_and_dict(text)
_print_colorful_text(text,samples)
samples


a quaternary [41m[30mammonium[0m [42m[30msalt[0m that is the monoiodide



[{'text': 'ammonium', 'start_pos': 13, 'end_pos': 21},
 {'text': 'salt', 'start_pos': 22, 'end_pos': 26}]

In [79]:
samples.append({'text': 'salt', 'start_pos': 22, 'end_pos': 26})
print(samples)
[dict(s) for s in set(frozenset(d.items()) for d in samples)]


[{'text': 'quaternary', 'start_pos': 2, 'end_pos': 12}, {'text': 'ammonium', 'start_pos': 13, 'end_pos': 21}, {'text': 'salt', 'start_pos': 22, 'end_pos': 26}, {'text': 'monoiodide', 'start_pos': 39, 'end_pos': 49}, {'text': 'salt', 'start_pos': 22, 'end_pos': 26}, {'text': 'salt', 'start_pos': 22, 'end_pos': 26}, {'text': 'salt', 'start_pos': 22, 'end_pos': 26}, {'text': 'salt', 'start_pos': 22, 'end_pos': 26}]


[{'end_pos': 12, 'start_pos': 2, 'text': 'quaternary'},
 {'start_pos': 13, 'end_pos': 21, 'text': 'ammonium'},
 {'start_pos': 39, 'text': 'monoiodide', 'end_pos': 49},
 {'start_pos': 22, 'end_pos': 26, 'text': 'salt'}]

In [11]:
e_linker.link_entities('a quaternary ammonium salt that is the monoiodide')

NER mentions: [{'text': 'quaternary ammonium salt', 'start_pos': 2, 'end_pos': 26}]
token mentions [{'text': 'ammonium', 'start_pos': 13, 'end_pos': 21}, {'text': 'salt', 'start_pos': 22, 'end_pos': 26}]
Analizing mentions:


[{'text': 'salt', 'start_pos': 22, 'end_pos': 26},
 {'end_pos': 26, 'text': 'quaternary ammonium salt', 'start_pos': 2},
 {'text': 'ammonium', 'end_pos': 21, 'start_pos': 13}]

In [83]:
e_linker.link_entities('sexual reproduction')

NER mentions: [{'text': 'reproduction', 'start_pos': 7, 'end_pos': 19}]
Analizing mentions:

sexual [41m[30mreproduction[0m



[{'text': 'reproduction', 'start_pos': 7, 'end_pos': 19}]

In [87]:
e_linker.link_entities('Quaternary ammonium salt that is the monoiodide')

NER mentions: [{'text': 'ammonium salt', 'start_pos': 11, 'end_pos': 24}]
token mentions [{'text': 'ammonium', 'start_pos': 11, 'end_pos': 19}, {'text': 'salt', 'start_pos': 20, 'end_pos': 24}]
Analizing mentions:


[{'start_pos': 11, 'end_pos': 24, 'text': 'ammonium salt'},
 {'end_pos': 24, 'start_pos': 20, 'text': 'salt'},
 {'start_pos': 11, 'end_pos': 19, 'text': 'ammonium'}]

In [46]:
e_linker.link_entities(' Quaternary ammonium salt that is the monoiodide. Sexual reproduction')


Analysing sentence:  Quaternary ammonium salt that is the monoiodide
NER mentions: [{'text': 'ammonium salt', 'start_pos': 12, 'end_pos': 25}]
Analysing sentence:  Sexual reproduction
NER mentions: [{'text': 'ammonium salt', 'start_pos': 12, 'end_pos': 25}, {'text': 'reproduction', 'start_pos': 57, 'end_pos': 69}]
token mentions [{'text': 'ammonium', 'start_pos': 12, 'end_pos': 20}, {'text': 'salt', 'start_pos': 21, 'end_pos': 25}, {'text': 'reproduction', 'start_pos': 57, 'end_pos': 69}]
Analizing mentions:


[{'text': 'ammonium salt',
  'end_pos': 25,
  'start_pos': 12,
  'entities': [('CHEBI:47704', 21.193757607892252)],
  'best_entity': ('CHEBI:47704', 21.193757607892252)},
 {'text': 'ammonium',
  'end_pos': 20,
  'start_pos': 12,
  'entities': [('CHEBI:28938', 4.016272829206154)],
  'best_entity': ('CHEBI:28938', 4.016272829206154)},
 {'end_pos': 69,
  'text': 'reproduction',
  'start_pos': 57,
  'entities': [('GO:0000003', 4.050511345259044)],
  'best_entity': ('GO:0000003', 4.050511345259044)},
 {'start_pos': 21,
  'text': 'salt',
  'end_pos': 25,
  'entities': [('CHEBI:26710', 0.375), ('CHEBI:24866', 627.8886372127118)],
  'best_entity': ('CHEBI:24866', 627.8886372127118)}]

In [41]:
e_linker.link_entities(
    'Quaternary ammonium salt that is the monoiodide that can be found in some species with sexual reproduction reduced')


Analysing sentence: Quaternary ammonium salt that is the monoiodide that can be found in some species with sexual reproduction reduced
NER mentions: [{'text': 'Quaternary ammonium salt', 'start_pos': 0, 'end_pos': 24}]
token mentions [{'text': 'ammonium', 'start_pos': 11, 'end_pos': 19}, {'text': 'salt', 'start_pos': 20, 'end_pos': 24}, {'text': 'species', 'start_pos': 74, 'end_pos': 81}, {'text': 'reproduction', 'start_pos': 94, 'end_pos': 106}, {'text': 'reduced', 'start_pos': 107, 'end_pos': 114}]
Analizing mentions:

[41m[30mQuaternary ammonium salt[0m that is the monoiodide that can be found in some species with sexual reproduction reduced



[{'end_pos': 24, 'text': 'Quaternary ammonium salt', 'start_pos': 0}]

In [47]:
e_linker.link_entities('A molecular entity having a net electric.')


Analysing sentence: A molecular entity having a net electric
NER mentions: [{'text': 'molecular entity', 'start_pos': 2, 'end_pos': 18}]
Analysing sentence: 
NER mentions: [{'text': 'molecular entity', 'start_pos': 2, 'end_pos': 18}]
token mentions [{'text': 'entity', 'start_pos': 12, 'end_pos': 18}, {'text': 'net', 'start_pos': 28, 'end_pos': 31}]
Analizing mentions:


[{'end_pos': 31,
  'start_pos': 28,
  'text': 'net',
  'entities': [('GO:0140644', 6.664883535521932)],
  'best_entity': ('GO:0140644', 6.664883535521932)},
 {'end_pos': 18,
  'text': 'molecular entity',
  'start_pos': 2,
  'entities': [('CHEBI:23367', 2.667553377723604)],
  'best_entity': ('CHEBI:23367', 2.667553377723604)},
 {'end_pos': 18,
  'text': 'entity',
  'start_pos': 12,
  'entities': [('SIO:000000', 4.484892336351237),
   ('BFO:0000001', 1.89659626164107)],
  'best_entity': ('BFO:0000001', 1.89659626164107)}]

In [24]:
e_linker.link_entities('theres molecular entity with electric charge')


theres molecular entity with electric charge
Analizing mentions: ['molecular entity']


{'molecular entity': ('CHEBI:23367', 3.245904530608999)}

In [25]:
text = 'The fluoride salt of beryllium (+2 oxidation state). In the solid state it exists as a glass, with four-coordinate Be(2+) tetrahedral centres and two-coordinate fluoride centres. As a gas it adopts a linear triatomic structure and in the liquid state a fluctuating tetrahedral structure. In protein crystallography it is used as a mimic of phosphate.'
e_linker.link_entities(text)

the fluoride salt of beryllium (+2 oxidation state). in the solid state it exists as a glass, with four-coordinate be(2+) tetrahedral centres and two-coordinate fluoride centres. as a gas it adopts a linear triatomic structure and in the liquid state a fluctuating tetrahedral structure. in protein crystallography it is used as a mimic of phosphate.
Analizing mentions: ['fluoride salt']


{'fluoride salt': ('CHEBI:24060', 2.4392465042187164)}

In [26]:
e_linker.link_entities('The fluoride salt of beryllium (+2 oxidation state)')


the fluoride salt of beryllium (+2 oxidation state)
Analizing mentions: ['fluoride salt']


{'fluoride salt': ('CHEBI:24060', 41.07384692550316)}

In [33]:
e_linker.link_entities(
    'In the solid state it exists as a glass, with four-coordinate Be(2+) tetrahedral centres and two-coordinate fluoride centres.')


Anlysing sentence: in the solid state it exists as a glass, with four-coordinate be(2+) tetrahedral centres and two-coordinate fluoride centres.
Analizing mentions: []


{}

In [34]:
e_linker.link_entities(
    ' As a gas it adopts a linear triatomic structure and in the liquid state a fluctuating tetrahedral structure.')


Anlysing sentence: as a gas it adopts a linear triatomic structure and in the liquid state a fluctuating tetrahedral structure.
Analizing mentions: []


{}

In [32]:
e_linker.link_entities('In protein crystallography it is used as a mimic of phosphate.')

Anlysing sentence: in protein crystallography it is used as a mimic of phosphate.
Analizing mentions: []


{}

In [42]:
text = """The process in which an antigen-presenting cell expresses peptide antigen in 
association with an MHC protein complex on its cell surface, including proteolysis and 
transport steps for the peptide antigen both prior to and following assembly with the MHC protein complex.
 The peptide antigen is typically, but not always, processed from an endogenous or exogenous protein."""

e_linker.link_entities(text)


Anlysing sentence: the process in which an antigen-presenting cell expresses peptide antigen in 
association with an mhc protein complex on its cell surface, including proteolysis and 
transport steps for the peptide antigen both prior to and following assembly with the mhc protein complex.
Anlysing sentence: 
 the peptide antigen is typically, but not always, processed from an endogenous or exogenous protein.
Analizing mentions: ['peptide', 'cell', 'complex', 'association', 'antigen', 'proteolysis', 'transport', 'process', 'protein', 'surface', 'process', 'peptide antigen']


{'peptide': ('SIO:001425', 0.00011945268624484184),
 'cell': ('ZFA:0009000', 0.6396418482908037),
 'complex': ('PATO:0001504', 1.5625476340619513),
 'association': ('SIO:000897', 1.5554851540554113),
 'antigen': ('SIO:010419', 0.2704365459170655),
 'proteolysis': ('GO:0006508', 1.4488989004822508),
 'transport': ('GO:0006810', 2.118081807878947),
 'process': ('ZFA:0001637', 0.022207348007965165),
 'protein': ('SIO:010043', 0.2882234381862722),
 'surface': ('BSPO:0000005', 3.3331331652821814),
 'peptide antigen': ('CHEBI:166824', 1.5481646623205014)}

In [43]:
e_linker.link_entities('A tiger is a carnovire of the mammal family.')

Anlysing sentence: a tiger is a carnovire of the mammal family.
Analizing mentions: ['tiger', 'mammal', 'family']


{'tiger': ('NCBITaxon:9694', 16.439691350989236),
 'mammal': ('NCBITaxon:40674', 1.0),
 'family': ('STATO:0000257', 9.707100448364667)}

In [46]:

text = """Natural gas consists primarily of methane that is clean, non-toxic, and has abundant natural reserves. 
However, methane is also a greenhouse gas whose greenhouse effect is more than 20 times than that of carbon dioxide. 
The conversion of methane into other value-added chemicals has been an important research area in the field of catalysis for many years. 
One of the most challenging processes of high industrial importance is the conversion of methane to methanol (CHEBI:17790), 
a simple alcohol that is liquid under ambient conditions and can be easily stored and transported compared to methane [1]. 
Methanol is used as an important chemical raw material to make products such as paints and plastics and as an additive to gasoline [2]."""
e_linker.link_entities(text)


Anlysing sentence: natural gas consists primarily of methane that is clean, non-toxic, and has abundant natural reserves.
Anlysing sentence: 
however, methane is also a greenhouse gas whose greenhouse effect is more than 20 times than that of carbon dioxide.
Anlysing sentence: 
the conversion of methane into other value-added chemicals has been an important research area in the field of catalysis for many years.
Anlysing sentence: 
one of the most challenging processes of high industrial importance is the conversion of methane to methanol (chebi:17790), 
a simple alcohol that is liquid under ambient conditions and can be easily stored and transported compared to methane [1].
Anlysing sentence: 
methanol is used as an important chemical raw material to make products such as paints and plastics and as an additive to gasoline [2].
Analizing mentions: ['field', 'methane', 'alcohol', 'methanol', 'simple', 'processes', 'area', 'gas', 'carbon', 'liquid', 'toxic']


{'field': ('UBERON:0007688', 1.1765244958215784),
 'methane': ('CHEBI:16183', 2.051886417753274),
 'alcohol': ('CHEBI:16236', 0.0005115270212291953),
 'methanol': ('CHEBI:17790', 1.2641306724996488),
 'simple': ('PATO:0001503', 1.2370289502089358),
 'processes': ('UBERON:0004529', 1.1592570353598035),
 'area': ('SIO:000045', 0.6359842547183276),
 'gas': ('PATO:0001737', 0.613474414998096),
 'carbon': ('CHEBI:27594', 0.3731910408796742),
 'liquid': ('PATO:0001735', 1.2884007958961678),
 'toxic': ('SIO:001009', 1.3050453744086419)}

In [47]:
from utils import _print_colorful_text

In [28]:
text = 'A tiger is a carnovire. of the mammal family.'
text.split('.')

['A tiger is a carnovire', ' of the mammal family', '']