In [1]:
# Clear the interactive namespace (-f: do not prompt for confirmation) so the
# cells below do not depend on names left over from an earlier session.
%reset -f

In [None]:
import pandas as pd
from ontorunner.pre.util import json2tsv, prepare_termlist
from ontorunner.oger_module import run_oger
from ontorunner.spacy_module import run_spacy, run_viz
from os.path import join, pardir
import numpy as np

In [3]:
# Notebook configuration: the settings file plus every path used below.
# All directories live under data_dir, so relocating the data only needs
# that one value changed.
settings = "settings.ini"
data_dir = "data"

# Sub-directories of data_dir: ontology term files, pipeline outputs, and
# the node/edge TSV tables.
terms_dir, output_dir, nodes_and_edges_dir = (
    join(data_dir, subdir) for subdir in ("terms", "output", "nodes_and_edges")
)

# Ontology JSON file that the conversion step reads.
onto_json = join(terms_dir, "envo.json")

## `*.json` => `*_nodes.tsv` + `*_edges.tsv` using KGX

In [4]:
%%time
# Path prefix for the converted tables; the node table is read back later
# from target_name + "_nodes.tsv".
target_name = join(nodes_and_edges_dir,"envo")
# Convert the ontology JSON to TSV under that prefix (the transform is logged
# by KGX in the cell output).
json2tsv(onto_json, target_name)

[KGX][cli_utils.py][    transform_source] INFO: Processing source 'envo.json'
CPU times: user 9.37 s, sys: 144 ms, total: 9.51 s
Wall time: 12.9 s


In [5]:
# Load the node table written by the conversion step and preview it.
nodes_filename = target_name + "_nodes.tsv"
# Missing cells become empty strings. fillna("") is used instead of
# replace(np.NAN, ...): the np.NAN alias was removed in NumPy 2.0 (only
# np.nan remains), and the non-inplace call keeps this cell safe to re-run.
nodes_df = pd.read_csv(nodes_filename, sep="\t").fillna("")
nodes_df.head()

Unnamed: 0,id,category,name,description,xref,provided_by,synonym,iri,knowledge_source,object,predicate,relation,same_as,subject,subsets
0,CHEBI:25213,biolink:ChemicalSubstance,metal cation,,,envo.json,,http://purl.obolibrary.org/obo/CHEBI_25213,,,,,,,
1,CHEBI:25212,biolink:ChemicalSubstance,metabolite,,,envo.json,,http://purl.obolibrary.org/obo/CHEBI_25212,,,,,,,
2,CHEBI:25216,biolink:ChemicalSubstance,metalloporphyrin,,,envo.json,,http://purl.obolibrary.org/obo/CHEBI_25216,,,,,,,
3,UBERON:0011595,biolink:AnatomicalEntity,jaw region,A subdivision of the head that corresponds to ...,,envo.json,,http://purl.obolibrary.org/obo/UBERON_0011595,,,,,,,
4,CHEBI:76807,biolink:ChemicalSubstance,EC 3.5.1.* (non-peptide linear amide C-N hydro...,,,envo.json,,http://purl.obolibrary.org/obo/CHEBI_76807,,,,,,,


## Prepare termlist

In [6]:
# Destination of the term list derived from the node table.
terms_fn = join(terms_dir, "envo_termlist.tsv")
# Build the term list file from the nodes TSV (project helper).
# NOTE(review): presumably this is the term list the OGER step reads — confirm.
prepare_termlist(nodes_filename, terms_fn)
# Load with header=None so pandas numbers the columns instead of treating the
# first row as column names.
terms_df = pd.read_csv(terms_fn, sep="\t", header=None)
terms_df.head()

Unnamed: 0,0,1,2,3,4,5
0,CUI-less,envo.json,CHEBI:25213,metal cation,metal cation,biolink:ChemicalSubstance
1,CUI-less,envo.json,CHEBI:25212,metabolite,metabolite,biolink:ChemicalSubstance
2,CUI-less,envo.json,CHEBI:25216,metalloporphyrin,metalloporphyrin,biolink:ChemicalSubstance
3,CUI-less,envo.json,UBERON:0011595,jaw region,jaw region,biolink:AnatomicalEntity
4,CUI-less,envo.json,CHEBI:76807,EC 3.5.1.* (non-peptide linear amide C-N hydro...,EC 3.5.1.* (non-peptide linear amide C-N hydro...,biolink:ChemicalSubstance


## Run OGER

In [None]:
%%time
# Run the OGER step with the settings file and the node/edge table directory.
# NOTE(review): need_ancestors=True presumably produces the `ancestors` column
# seen in the result table — confirm against run_oger.
run_oger(settings=settings, nodes_and_edges=nodes_and_edges_dir,need_ancestors=True)

In [8]:
# Preview the tab-separated annotation table from the output directory.
# read_table uses a tab delimiter by default.
orun_df = pd.read_table(join(output_dir, "test_ontoRunNER.tsv"))
orun_df.head()

Unnamed: 0,document_id,object_category,start_position,end_position,matched_term,preferred_form,object_label,object_label_doc_ratio,matched_term_doc_ratio,match_type,...,jaccard_index,monge_elkan,object_id,pos_and_ne_chunk,sentence_id,umls_cui,origin,sentence,object_sentence_%,ancestors
0,gold:Gb0101224,biolink:ChemicalSubstance,32,41,nutrients,nutrient,nutrient,0.166667,0.166667,lemmatic_match,...,0.111111,1.901235,CHEBI:33284,(S nutrients/NNS),S1,CUI-less,envo.json,Lithgow State Coal Mine Calcium nutrients ear...,0.18,"['CHEBI:78295', 'CHEBI:52211', 'CHEBI:24432', ..."
1,gold:Gb0101224,biolink:ChemicalSubstance,83,92,nutrients,nutrient,nutrient,0.166667,0.166667,lemmatic_match,...,0.111111,1.901235,CHEBI:33284,(S nutrients/NNS),S2,CUI-less,envo.json,Lithgow State Coal Mine Calcium nutrients ear...,0.18,"['CHEBI:78295', 'CHEBI:52211', 'CHEBI:24432', ..."
2,gold:Gb0101224,biolink:ChemicalSubstance,133,138,water,water,water,0.166667,0.166667,exact_match,...,0.0,0.0,CHEBI:15377,(S water/NN),S4,CUI-less,envo.json,Coalbed water,0.384615,"['CHEBI:33693', 'CHEBI:36902', 'CHEBI:33304', ..."
3,gold:Gb0101224,biolink:OntologyClass,14,18,coal,coal,coal,0.166667,0.166667,exact_match,...,0.0,0.0,ENVO:02000091,(S (GPE Coal/NN)),S1,CUI-less,envo.json,Lithgow State Coal Mine Calcium nutrients ear...,0.08,"['ENVO:00002016', 'ENVO:00001995', 'ENVO:01000..."
4,gold:Gb0101224,biolink:OntologyClass,14,23,coal mine,coal mine,coal mine,0.166667,0.166667,exact_match,...,0.0,0.0,ENVO:00002169,(S (PERSON Coal/NNP) (ORGANIZATION Mine/NNP)),S1,CUI-less,envo.json,Lithgow State Coal Mine Calcium nutrients ear...,0.18,"['ENVO:00000076', 'ENVO:00000070', 'ENVO:01001..."


## spaCy

In [9]:
# Directory for serialized artifacts.
# NOTE(review): not referenced by the cells visible here; presumably where
# run_spacy(to_pickle=True) writes — confirm.
serialized_dir = join(data_dir, "serialized")
# Linker name handed to run_spacy. Options: go / mesh / rxnorm / hpo / umls.
linker = "go"

In [None]:
%%time
# Run the spaCy step with the configured linker and keep the returned object;
# it is passed to run_viz further down.
# NOTE(review): to_pickle=True presumably serializes the result to disk —
# confirm against run_spacy.
onto_obj = run_spacy(data_dir=data_dir, settings_file=settings, linker=linker, to_pickle=True)

In [11]:
# Load the ontology annotation table from the output directory and preview it.
# Missing cells become empty strings. fillna("") is used instead of
# replace(np.NAN, ...): the np.NAN alias was removed in NumPy 2.0 (only
# np.nan remains), and the non-inplace call keeps this cell safe to re-run.
orun_spacy_df = pd.read_csv(
    join(output_dir, "ontology_ontoRunNER.tsv"), sep="\t"
).fillna("")
orun_spacy_df.head()

Unnamed: 0,document_id,matched_term,POS,tag,scispacy_object_category,object_id,object_category,object_label,object_match_field,sentence,start,end,origin,object_label_doc_count,object_label_doc_ratio,matched_term_doc_count,matched_term_doc_ratio,ancestors
0,gold:Gb0101224,coal,PROPN,NNP,CHEBI,ENVO:02000091,biolink:OntologyClass,coal,,Lithgow State Coal Mine Calcium nutrients (ear...,14,18,envo.json,1,0.166667,1,0.166667,"['ENVO:00002016', 'ENVO:00001995', 'ENVO:01000..."
1,gold:Gb0101224,coal,PROPN,NNP,CHEBI,ENVO:02000091,biolink:OntologyClass,coal,,Lithgow State Coal Mine Calcium nutrients (ear...,65,69,envo.json,1,0.166667,1,0.166667,"['ENVO:00002016', 'ENVO:00001995', 'ENVO:01000..."
2,gold:Gb0101253,field,NOUN,NN,ENVO:00000114 [ agricultural field ],UBERON:0007688_SYNONYM,biolink:AnatomicalEntity,anlage,hasRelatedSynonym,Bass Strait oil field metagenomics - A7A-2.,16,21,envo.json,1,0.166667,1,0.166667,
3,gold:Gb0101253,field,NOUN,NN,UBERON:0007688 [ anlage ],UBERON:0007688_SYNONYM,biolink:AnatomicalEntity,anlage,hasRelatedSynonym,Bass Strait oil field metagenomics - A7A-2. mi...,60,65,envo.json,1,0.166667,1,0.166667,
4,gold:Gb0101253,oil,NOUN,NN,FOODON:03310387 [ oil ],ENVO:00002985,biolink:OntologyClass,oil,,Bass Strait oil field metagenomics - A7A-2.,12,15,envo.json,1,0.166667,1,0.166667,"['ENVO:2000045', 'ENVO:00010483', 'BFO:0000024..."


In [12]:
# Load the linker-specific result table and preview it.
# NOTE(review): the filename hard-codes "go", matching linker = "go" above;
# presumably it must change together with `linker` — confirm the naming
# scheme used by run_spacy.
orun_scispacy_go_df = pd.read_csv(join(output_dir, "sciSpacy_go_ontoRunNER.tsv"), sep="\t")
orun_scispacy_go_df.head()

Unnamed: 0,document_id,cui,matched_term,aliases,definition,tui
0,gold:Gb0101224,C1160632,response to nutrient,['response to nutrients'],Any process that results in a change in state ...,['T043']
1,gold:Gb0101224,C1154615,detection of nutrient,"['perception of nutrients', 'nutrient sensing'...",The series of events in which a nutrient stimu...,['T043']
2,gold:Gb0101253,C1166870,oilbody,"['oleosome', 'monolayer-surrounded lipid stora...",A subcellular organelle of plant cells surroun...,['T026']
3,gold:Gb0101317,C1150785,carbon-carbon lyase activity,[],Catalysis of the cleavage of C-C bonds by othe...,['T044']
4,gold:Gb0101317,C1150848,other carbon-carbon lyase activity,[],OBSOLETE. A grouping term for carbon-carbon ly...,['T044']


In [13]:
%%time
text = "Wetland microbial communities from the San Francisco Bay, California, USA,\
    that impact long-term carbon sequestration.\
    Natural and restored wetlands. Cold seep microbial communities from the Ulleung Basin, East Sea, Korea\
    - Hemire mound."

run_viz(text, obj = onto_obj)

  extended_neighbors[empty_vectors_boolean_flags] = numpy.array(neighbors)[:-1]
  extended_distances[empty_vectors_boolean_flags] = numpy.array(distances)[:-1]


CPU times: user 381 ms, sys: 16.9 ms, total: 398 ms
Wall time: 386 ms



    PERSON:      People, including fictional.
    NORP:        Nationalities or religious or political groups.
    FAC:         Buildings, airports, highways, bridges, etc.
    ORG:         Companies, agencies, institutions, etc.
    GPE:         Countries, cities, states.
    LOC:         Non-GPE locations, mountain ranges, bodies of water.
    PRODUCT:     Objects, vehicles, foods, etc. (Not services.)
    EVENT:       Named hurricanes, battles, wars, sports events, etc.
    WORK_OF_ART: Titles of books, songs, etc.
    LAW:         Named documents made into laws.
    LANGUAGE:    Any named language.
    DATE:        Absolute or relative dates or periods.
    TIME:        Times smaller than a day.
    PERCENT:     Percentage, including ”%“.
    MONEY:       Monetary values, including unit.
    QUANTITY:    Measurements, as of weight or distance.
    ORDINAL:     “first”, “second”, etc.
    CARDINAL:    Numerals that do not fall under another type.
