In [1]:
import json
import os

import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON

In [2]:
# Read the ontology identifiers, one per line, from the shared list file.
with open('../ontos.txt') as fp:
    # Skip blank lines: an empty identifier would otherwise become a key in
    # `endpoints` and point at the non-existent namespace 'obo-'.
    ontologies = [line.strip() for line in fp if line.strip()]

# One pre-configured SPARQL client per ontology, keyed by ontology identifier.
# Every ontology lives in its own Blazegraph namespace named 'obo-<ontology>'.
endpoints = dict()
for ontology in ontologies:
    e = SPARQLWrapper(f'http://127.0.0.1:9999/blazegraph/namespace/obo-{ontology}/sparql')
    # Use POST with the 'postdirectly' request method and ask for JSON results,
    # which the later cells read via ['results']['bindings'].
    e.setRequestMethod('postdirectly')
    e.setMethod('POST')
    e.setReturnFormat(JSON)
    endpoints[ontology] = e

In [3]:
# Text of the query that is run per ontology to count blank-node results.
with open('xrefs_bnodes.rq') as bnodes_file:
    bnodes_query = bnodes_file.read()

# Text of the query whose results are exported to one JSON file per ontology.
with open('xrefs_oboInOwl.rq') as obo_file:
    obo_query = obo_file.read()

In [4]:
# Run the blank-node query against every endpoint and record how many result
# rows (bindings) each ontology returns.
bnode_count = {}

for name, sparql in endpoints.items():
    sparql.setQuery(bnodes_query)
    response = sparql.query().convert()
    bnode_count[name] = len(response['results']['bindings'])

In [5]:
# Display the per-ontology blank-node result counts as a Series.
bnode_count_series = pd.Series(bnode_count)
bnode_count_series

aeo            0
agro           0
apollo-sv    214
bfo            0
bto            0
caro           0
chebi          0
cl             0
doid           0
dron           0
ehdaa2         0
envo           0
fobi           0
foodon         0
gaz            0
go             0
hp             0
iao            0
mp             0
ncbitaxon      0
obi            0
pato           0
pco            0
peco           0
po             0
ro             0
symp           0
uberon         0
uo             0
xco            0
dtype: int64

In [6]:
# Run the oboInOwl xref query against every ontology namespace and store the
# flattened result rows as one JSON file per ontology.
output_dir = 'results/xrefs_per_ont'
# Create the output directory up front so a fresh checkout does not fail with
# FileNotFoundError only after the first query has already been executed.
os.makedirs(output_dir, exist_ok=True)

for ontology, e in endpoints.items():
    e.setQuery(obo_query)
    bindings = e.query().convert()['results']['bindings']
    # Flatten each binding to {variable: value}, keeping only the 'value'
    # field of every bound term.
    rows = [
        {var: term['value'] for var, term in binding.items()}
        for binding in bindings
    ]
    with open(f'{output_dir}/xrefs_{ontology}.json', 'w') as fp:
        json.dump(rows, fp)
    # Progress marker: one line per exported ontology.
    print(ontology)

aeo
agro
apollo-sv
bfo
bto
caro
chebi
cl
doid
dron
ehdaa2
envo
fobi
foodon
gaz
go
hp
iao
mp
ncbitaxon
obi
pato
pco
peco
po
ro
symp
uberon
uo
xco


In [7]:
def make_po_set(df: pd.DataFrame) -> set:
    """Return the set of distinct ``'<p>||<o>'`` keys found in ``df``.

    Args:
        df: Frame with string columns ``p`` and ``o``. Other columns are
            ignored. May be empty, with or without columns.

    Returns:
        A set with one ``p + '||' + o`` string per distinct (p, o) pair;
        an empty set when ``df`` has no rows.
    """
    # A row-wise apply() on an empty frame hands back a DataFrame, and set()
    # over that would yield the column names instead of keys.
    if df.empty:
        return set()
    # Column-wise concatenation builds the same keys as a row-wise apply()
    # without calling a Python function once per row.
    return set(df['p'] + '||' + df['o'])

def make_po_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` reduced to ``s`` plus a combined ``po`` key column.

    Args:
        df: Frame with column ``s`` and string columns ``p`` and ``o``.
            May be empty, with or without columns.

    Returns:
        A new frame with the columns ``s`` (copied from ``df``) and ``po``
        (``p + '||' + o`` per row). ``df`` itself is not modified. For an
        empty input an empty frame with those two columns is returned.
    """
    # An empty frame may carry no columns at all (e.g. one parsed from an
    # empty JSON list), in which case selecting 's' would raise KeyError.
    if df.empty:
        return pd.DataFrame(columns=['s', 'po'])
    df1 = df[['s']].copy()
    # Column-wise concatenation builds the same keys as a row-wise apply()
    # without calling a Python function once per row.
    df1['po'] = df['p'] + '||' + df['o']
    return df1

In [8]:
# Reference key sets ('<p>||<o>') for the two issue categories counted per
# ontology further down.
# dtype=False stops read_json from inferring dtypes: with inference on, a
# column whose strings all look numeric is converted to numbers, which breaks
# the string concatenation in make_po_set and the later key matching.
po_xrefs_uri = make_po_set(pd.read_json('results/xrefs_uri.json', dtype=False))
po_xrefs_unknown = make_po_set(pd.read_json('results/xrefs_unknown_id.json', dtype=False))

In [9]:
# For every ontology, count how many exported xref rows carry a '<p>||<o>' key
# that appears in each of the two reference issue sets.
ont_issues = dict()
for ontology in ontologies:
    # dtype=False keeps every value as the string stored in the JSON file.
    # With dtype inference on, a column whose strings all look numeric is
    # converted to numbers and can no longer be concatenated or matched.
    xrefs_ont = pd.read_json(
        f'results/xrefs_per_ont/xrefs_{ontology}.json',
        orient='records',
        dtype=False,
    )
    if len(xrefs_ont) == 0:
        # No exported rows: record zero for both categories. These ontologies
        # are not echoed by the progress print below.
        ont_issues[ontology] = {
            'uri': 0,
            'unknown': 0,
        }
        continue

    df_ont = make_po_df(xrefs_ont)
    # Summing the boolean membership mask counts the matching rows.
    ont_issues[ontology] = {
        'uri': int(df_ont['po'].isin(po_xrefs_uri).sum()),
        'unknown': int(df_ont['po'].isin(po_xrefs_unknown).sum()),
    }
    print(ontology)

aeo
agro
apollo-sv
bto
caro
chebi
cl
doid
dron
ehdaa2
envo
foodon
gaz
go
hp
mp
ncbitaxon
pato
pco
peco
po
ro
symp
uberon
xco


In [10]:
# Display the issue counts as a table: one row per ontology, one column per
# issue category.
ont_issues_table = pd.DataFrame.from_dict(ont_issues, orient='index')
ont_issues_table


Unnamed: 0,uri,unknown
aeo,10,136
agro,1266,6710
apollo-sv,2,21
bfo,0,0
bto,0,3479
caro,380,1800
chebi,0,313736
cl,2297,34296
doid,1,12824
dron,0,35148
