In [1]:
import json
import yaml
import pandas as pd

from SPARQLWrapper import SPARQLWrapper, JSON

In [2]:
# Raw property records, read from JSON (later cells read each record's 'c' field).
with open('results/props.json') as props_file:
    raw_data = json.load(props_file)

# Manual review of the properties, read from YAML (a mapping iterated by error id later on).
with open('results/props_review.yaml') as review_file:
    review_data = yaml.safe_load(review_file)

In [3]:
# Count the records whose 'c' value is below 11, without materialising a list.
all_to_inspect = sum(1 for record in raw_data if int(record['c']) < 11)
print(f'All props under 11 uses: {all_to_inspect}')

All props under 11 uses: 140


In [4]:
# Flatten the review mapping into one record per (error id, property) pair.
errors_arr = [
    {
        'prop': reviewed_prop['p'],
        'error_id': error_id,
        'comment': review_entry['comment'],
        'uses': int(reviewed_prop['c']),
    }
    for error_id, review_entry in review_data.items()
    for reviewed_prop in review_entry['props']
]

df_err = pd.DataFrame.from_records(errors_arr)

In [5]:
# Number of rows in the error table (one row per reviewed property).
n_erroneous_props = len(df_err)
print(f'Erroneous props in total: {n_erroneous_props}')

# Sum of the 'uses' column over all erroneous properties.
n_error_uses = df_err["uses"].sum()
print(f'Unique errors: {n_error_uses}')

Erroneous props in total: 74
Unique errors: 278


In [6]:
# Per error id: number of affected properties ('count') and total uses ('sum').
(
    df_err[['error_id', 'uses']]
    .groupby('error_id')
    .agg(['count', 'sum'])
)

Unnamed: 0_level_0,uses,uses
Unnamed: 0_level_1,count,sum
error_id,Unnamed: 1_level_2,Unnamed: 2_level_2
badPrefixFobi,4,5
brokenProtocol,1,1
hasDate,1,8
hasVersion,1,10
notAnUri,1,9
oboInOwlUndefined,51,212
uriAppendedToPrefix,4,15
wrongPrefixOio,3,9
wrongPrefixOther,6,7
xmlSchema,2,2


Query each ontology in turn to find which ontologies the erroneous properties come from

In [7]:
# Read the ontology identifiers, one per line. Blank lines are skipped so that
# a spacer or stray empty line never turns into an empty ontology name (which
# would otherwise produce a bogus 'obo-' endpoint and an empty-named column).
with open('../ontos.txt') as fp:
    ontologies = [name for name in (line.strip() for line in fp) if name]

# Map each ontology identifier to a SPARQL client for its local Blazegraph namespace.
endpoints = dict()

for ontology in ontologies:
    e = SPARQLWrapper(f'http://127.0.0.1:9999/blazegraph/namespace/obo-{ontology}/sparql')
    # Send queries via POST with the 'postdirectly' request method and parse responses as JSON.
    e.setRequestMethod('postdirectly')
    e.setMethod('POST')
    e.setReturnFormat(JSON)
    endpoints[ontology] = e

In [8]:
# Work on a copy so the original error table stays untouched, then add one
# zero-initialised counter column for the total and one per ontology.
df_err_ont = df_err.copy()

for counter_column in ('found_errors', *ontologies):
    df_err_ont[counter_column] = 0

In [9]:
# Query template; the '<p>' placeholder is swapped for the property wrapped in angle brackets.
base_query = 'SELECT (COUNT(*) as ?count) WHERE { ?s <p> ?o }'

for row_label, err_row in df_err_ont.iterrows():
    prop_query = base_query.replace('<p>', f'<{err_row["prop"]}>')
    total_for_prop = 0

    # Ask every endpoint how many triples use this property.
    for ont_name, endpoint in endpoints.items():
        endpoint.setQuery(prop_query)
        response = endpoint.query().convert()
        hits = int(response['results']['bindings'][0]['count']['value'])
        df_err_ont.loc[row_label, ont_name] = hits
        total_for_prop += hits

    # Total across all ontologies for this property.
    df_err_ont.loc[row_label, 'found_errors'] = total_for_prop

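This should yield an empty result if we did everything right: every property should be found in the ontologies at least as many times as it was originally counted.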

In [10]:
# Rows where the ontologies together account for fewer uses than were reported.
df_err_ont.loc[lambda frame: frame['uses'] > frame['found_errors']]
# it does!

Unnamed: 0,prop,error_id,comment,uses,found_errors,aeo,agro,apollo-sv,bfo,bto,...,obi,pato,pco,peco,po,ro,symp,uberon,uo,xco


Errors per ontology – the total will be higher than the number of unique errors due to partial imports

In [11]:
# Column-wise totals of the per-ontology counters.
df_err_ont.loc[:, ontologies].sum(axis=0)

aeo           4
agro         18
apollo-sv     4
bfo           0
bto           3
caro          1
chebi        12
cl           38
doid          2
dron          9
ehdaa2        3
envo          3
fobi          5
foodon        0
gaz           0
go            1
hp           45
iao           0
mp           47
ncbitaxon     0
obi           0
pato         13
pco           3
peco          2
po            3
ro            2
symp          2
uberon       87
uo            9
xco           3
dtype: int64