In [1]:
import json
import yaml
import re

import pandas as pd
import numpy as np

In [2]:
# Load the xref records stored in results/xrefs_skos.json. The 'o' column of
# this frame is what analyze_mappings() classifies further down.
df_skos = pd.read_json('results/xrefs_skos.json')

In [3]:
def classify_mapping(value: str):
    """Classify one xref value and split it into namespace, separator and id.

    Classification of ``value``:

    * starts with ``http://purl.obolibrary.org/obo/`` -> ``'uri_obo'``; the
      compact form is the value with that base removed.
    * any other ``http://`` / ``https://`` value -> ``'uri'``; the compact
      form is whatever follows the last ``/`` or ``#``.
    * ``t`` followed only by digits -> ``'bnode'``; the compact form is empty.
    * anything else -> ``'id'``; the compact form is the value itself.

    The compact form is then split into ``namespace``, ``ns_sep`` (``:`` or
    ``_``) and ``foreign_id``. All three are empty strings when it cannot be
    split.

    NOTE(review): the first split pattern is not anchored at the end, so a
    value such as ``'Wikipedia:Some_page'`` yields ``foreign_id == 'S'``.
    Kept as is because the namespace counts in this notebook depend on the
    current matching; confirm whether the truncation is intended.

    :param value: raw xref value (string).
    :return: tuple ``(ref_type, foreign_ns_id, namespace, ns_sep, foreign_id)``.
    """
    obo_base = 'http://purl.obolibrary.org/obo/'

    if value.startswith(('http://', 'https://')):
        if value.startswith(obo_base):
            foreign_ns_id = value.replace(obo_base, '')
            ref_type = 'uri_obo'
        else:
            # Greedy match: drops everything up to the last '/' or '#'.
            foreign_ns_id = re.sub(r'.*[/#]', '', value)
            ref_type = 'uri'
    elif re.match(r'^t\d+$', value):
        ref_type = 'bnode'
        foreign_ns_id = ''
    else:
        foreign_ns_id = value
        ref_type = 'id'

    namespace = ''
    ns_sep = ''
    foreign_id = ''
    # First try "<namespace>(:|_)<id made of dots, capitals and digits>".
    # The greedy namespace group backtracks to the last usable separator.
    match = re.match(r'([a-zA-Z_\d.]+)([:_])([.A-Z\d]+)', foreign_ns_id)
    if not match:
        # Fallback: "<namespace>:<anything>".
        match = re.match(r'([a-zA-Z_\d.]+)([:])(.+)', foreign_ns_id)

    if match:
        namespace = match.group(1)
        ns_sep = match.group(2)
        foreign_id = match.group(3)

    return ref_type, foreign_ns_id, namespace, ns_sep, foreign_id

def analyze_mappings(df: pd.DataFrame):
    """Classify every value in ``df['o']`` and count the results.

    :param df: frame with an ``'o'`` column of xref values; it is not modified.
    :return: tuple ``(df1, agg1, agg2)`` where ``df1`` is a copy of ``df``
        with the columns ``ref_type``, ``foreign_ns_id``, ``namespace``,
        ``ns_sep`` and ``foreign_id`` added, ``agg1`` counts ``o`` per
        ``(ref_type, namespace)`` and ``agg2`` counts ``o`` per ``ref_type``.
    """
    parts = ['ref_type', 'foreign_ns_id', 'namespace', 'ns_sep', 'foreign_id']

    df1 = df.copy()
    classified = df1['o'].apply(classify_mapping).tolist()
    # Assign plain lists (positional, no index alignment). Unlike unpacking
    # zip(*...), this also works when the frame has no rows.
    for position, column in enumerate(parts):
        df1[column] = [row[position] for row in classified]

    agg1 = df1[['o', 'ref_type', 'namespace']].groupby(['ref_type', 'namespace']).agg('count')
    agg2 = df1[['o', 'ref_type']].groupby(['ref_type']).agg('count')
    return df1, agg1, agg2

In [4]:
# Classify the xrefs loaded from xrefs_skos.json and display the counts per
# (ref_type, namespace).
df1_skos, agg1_skos, agg2_skos = analyze_mappings(df_skos)
agg1_skos

Unnamed: 0_level_0,Unnamed: 1_level_0,o
ref_type,namespace,Unnamed: 2_level_1
id,,1
id,ICD10CM,1
id,MESH,246
id,NCI,5
id,ORDO,2
id,UMLS_CUI,1
uri_obo,MI,3


In [5]:
agg2_skos

Unnamed: 0_level_0,o
ref_type,Unnamed: 1_level_1
id,256
uri_obo,3


In [6]:
df1_skos['ns_sep'].value_counts()

:    255
_      3
       1
Name: ns_sep, dtype: int64

In [7]:
# Load the xref records stored in results/xrefs_altId.json.
df_alt_id = pd.read_json('results/xrefs_altId.json')

In [8]:
# Classify the xrefs loaded from xrefs_altId.json and display the counts per
# (ref_type, namespace).
df1_alt, agg1_alt, agg2_alt = analyze_mappings(df_alt_id)
agg1_alt

Unnamed: 0_level_0,Unnamed: 1_level_0,o
ref_type,namespace,Unnamed: 2_level_1
id,,17
id,BSPOTEMP,1
id,BTO,3
id,CHEBI,18502
id,CL,73
id,CMECS,26
id,DOID,1696
id,EHDAA2,157
id,EMAPA,1
id,ENVO,7


In [9]:
agg2_alt

Unnamed: 0_level_0,o
ref_type,Unnamed: 1_level_1
id,99320
uri,17


In [10]:
# Stack the alt-id file and the four dbxref files into one frame.
# pd.concat replaces the chained DataFrame.append calls (append was deprecated
# and then removed in pandas 2.0). As with append, each file's original index
# is kept, so index labels may repeat across files.
# NOTE(review): xrefs_altId.json is read a second time here although it is
# already loaded as df_alt_id in an earlier cell.
df_obo = pd.concat([
    pd.read_json('results/xrefs_altId.json'),
    pd.read_json('results/xrefs_dbxref_1.json'),
    pd.read_json('results/xrefs_dbxref_2.json'),
    pd.read_json('results/xrefs_dbxref_3.json'),
    pd.read_json('results/xrefs_dbxref_4.json'),
])

In [11]:
# Classify the combined xrefs. df1_obo and agg1_obo are the inputs of the
# remaining cells.
df1_obo, agg1_obo, agg2_obo = analyze_mappings(df_obo)

In [12]:
agg1_obo

Unnamed: 0_level_0,Unnamed: 1_level_0,o
ref_type,namespace,Unnamed: 2_level_1
bnode,,214
id,,374092
id,.answers.com,2
id,.thefreedictionary.com,2
id,AAO,2065
...,...,...
uri_obo,NCBITaxon,98
uri_obo,NCIT,6
uri_obo,OMIT,1
uri_obo,TAXRANK,1


In [13]:
agg2_obo

Unnamed: 0_level_0,o
ref_type,Unnamed: 1_level_1
bnode,214
id,3856416
uri,51953
uri_obo,169


In [14]:
# Number of xrefs for each combination of reference type and namespace separator.
df_obo_sep_agg = (
    df1_obo
    .groupby(['ref_type', 'ns_sep'])[['o']]
    .count()
)
df_obo_sep_agg

Unnamed: 0_level_0,Unnamed: 1_level_0,o
ref_type,ns_sep,Unnamed: 2_level_1
bnode,,214
id,,374092
id,:,3439229
id,_,43095
uri,,51379
uri,:,49
uri,_,525
uri_obo,,57
uri_obo,_,112


In [15]:
# Rows whose namespace separator is '_'.
# NOTE(review): `test` is not referenced again in the visible cells; it looks
# like leftover exploration. Confirm before removing or renaming.
test = df1_obo[df1_obo.ns_sep == '_']

Attempt to resolve OBO Foundry prefixes

In [16]:
# Read the list under the 'ontologies' key of data/ontologies.yml
# (presumably the OBO Foundry registry, going by the section title; confirm).
with open('data/ontologies.yml') as fp:
    ontologies = yaml.safe_load(fp)['ontologies']

In [17]:
# Tabulate the ontology records, keeping only the id and prefix fields.
# NOTE(review): 'alternatePrefix' is selected here but not read by the visible cells.
df_ontos = pd.DataFrame.from_records(ontologies, columns=['id', 'preferredPrefix', 'alternatePrefix'])

In [18]:
# Collect the known prefixes in lower case. Every later lookup compares
# `namespace.lower()` against this set, so both the ontology id and the
# preferred prefix are normalised the same way (the id was previously added
# without lower-casing). Non-string cells (e.g. missing values) are skipped.
# NOTE(review): the 'alternatePrefix' column of df_ontos is not included here;
# confirm whether it should be.
prefixes = set()
for column in ('id', 'preferredPrefix'):
    prefixes.update(
        value.lower() for value in df_ontos[column] if isinstance(value, str)
    )

In [19]:
# Per-(ref_type, namespace) counts, restricted to namespaces that appear in
# `prefixes` once lower-cased.
agg_valid_prefixes = agg1_obo.reset_index().loc[
    lambda counts: counts['namespace'].map(lambda ns: ns.lower() in prefixes)
]

In [20]:
agg_valid_prefixes[['ref_type', 'o']].groupby('ref_type').agg(['sum', 'count'])

Unnamed: 0_level_0,o,o
Unnamed: 0_level_1,sum,count
ref_type,Unnamed: 1_level_2,Unnamed: 2_level_2
id,187167,75
uri_obo,112,8


In [21]:
# NOTE(review): same aggregation as the preceding cell (only the groupby key is
# written as a list); one of the two cells can probably be dropped.
agg_valid_prefixes[['ref_type', 'o']].groupby(['ref_type']).agg(['sum', 'count'])

Unnamed: 0_level_0,o,o
Unnamed: 0_level_1,sum,count
ref_type,Unnamed: 1_level_2,Unnamed: 2_level_2
id,187167,75
uri_obo,112,8


In [22]:
# Row-level view: every classified xref whose lower-cased namespace is in `prefixes`.
all_valid_pref = df1_obo[df1_obo.namespace.apply(lambda x: x.lower() in prefixes)]

In [23]:
all_valid_pref[['ref_type', 'ns_sep', 'o']].groupby(['ref_type', 'ns_sep']).agg('count')

Unnamed: 0_level_0,Unnamed: 1_level_0,o
ref_type,ns_sep,Unnamed: 2_level_1
id,:,187148
id,_,19
uri_obo,_,112


In [24]:
all_valid_pref[(all_valid_pref.ref_type == 'id') & (all_valid_pref.ns_sep == '_')]

Unnamed: 0,p,s,o,ref_type,foreign_ns_id,namespace,ns_sep,foreign_id
397952,http://www.geneontology.org/formats/oboInOwl#h...,http://purl.obolibrary.org/obo/UBERON_0001572,MFMO_0000064,id,MFMO_0000064,MFMO,_,64
684507,http://www.geneontology.org/formats/oboInOwl#h...,t8572278,UBERON_0003707,id,UBERON_0003707,UBERON,_,3707
693346,http://www.geneontology.org/formats/oboInOwl#h...,t8857485,CL_0000598,id,CL_0000598,CL,_,598
693347,http://www.geneontology.org/formats/oboInOwl#h...,t8900135,CL_0000740,id,CL_0000740,CL,_,740
693348,http://www.geneontology.org/formats/oboInOwl#h...,t8919740,CL_0001070,id,CL_0001070,CL,_,1070
693349,http://www.geneontology.org/formats/oboInOwl#h...,t8939918,CL_0001070,id,CL_0001070,CL,_,1070
693350,http://www.geneontology.org/formats/oboInOwl#h...,t8854625,CL_0002060,id,CL_0002060,CL,_,2060
693351,http://www.geneontology.org/formats/oboInOwl#h...,t8936546,CL_1001571,id,CL_1001571,CL,_,1001571
704736,http://www.geneontology.org/formats/oboInOwl#h...,t8806170,UBERON_0002313,id,UBERON_0002313,UBERON,_,2313
704737,http://www.geneontology.org/formats/oboInOwl#h...,t8804620,UBERON_0005368,id,UBERON_0005368,UBERON,_,5368


In [25]:
# Keep only the URI-style references. This is an explicit copy because the next
# cell adds columns to it.
df_uris = df1_obo[df1_obo.ref_type.isin(['uri', 'uri_obo'])].copy()

In [26]:
# 'domain': the text between the scheme and the first following '/' (or the end
# of the value). 'domain2': the same with a leading 'www.'-style label removed
# ('ww.' / 'www.', optionally with one digit), so host variants are counted
# together.
# The patterns are raw strings so that '\d' and '\.' reach the regex engine
# without relying on Python's deprecated invalid-escape fallback.
df_uris['domain'] = df_uris.o.apply(lambda x: re.match(r'https?://?(.*?)(/|$)', x).group(1))
df_uris['domain2'] = df_uris.domain.apply(lambda x: re.sub(r'^www?\d?\.', '', x))

In [27]:
df_uris[['o', 'domain2']].groupby('domain2').agg('count').sort_values('o', ascending=False)

Unnamed: 0_level_0,o
domain2,Unnamed: 1_level_1
en.wikipedia.org,17365
orcid.org,8085
langual.org,5592
snomedbrowser.com,4140
linkedlifedata.com,3092
...,...
howmed.net,1
hindawi.com,1
helda.helsinki.fi,1
heartandmetabolism.org,1


In [28]:
# Plain-id xrefs whose lower-cased namespace is NOT in `prefixes`. This is an
# explicit copy because a later cell adds an 'ont_id' column to it.
df_ids_unknown = df1_obo[(df1_obo.ref_type == 'id') & df1_obo.namespace.apply(lambda x: x.lower() not in prefixes)].copy()
# Number of such xrefs per namespace.
agg_ids_unknown = df_ids_unknown[['o', 'namespace']].groupby('namespace').agg('count')

In [29]:
agg_ids_unknown.o.sum()

3669249

Try the Bioregistry for the identifiers whose prefix is still unresolved

In [30]:
# Load data/bioregistry.yml; the next cell iterates it as a mapping of
# registry key -> entry.
with open('data/bioregistry.yml') as fp:
    bioregistry = yaml.safe_load(fp)

In [31]:
# Lookup table: lower-cased preferred prefix or synonym -> registry key.
# Entries are written in file order, so a later entry overrides an earlier one
# that uses the same lower-cased name.
bioregistry_prefixes = {}
for key, ontology in bioregistry.items():
    names = [ontology['preferred_prefix'], *ontology.get('synonyms', [])]
    bioregistry_prefixes.update((name.lower(), key) for name in names)

In [32]:
# Resolve each namespace (case-insensitively) to a registry key; '' marks
# namespaces that are not in the lookup table.
df_ids_unknown['ont_id'] = df_ids_unknown['namespace'].apply(lambda x: bioregistry_prefixes.get(x.lower(), ''))

In [33]:
# Number of xrefs per (resolved registry key, original namespace) as a flat table.
# NOTE(review): agg_ids_unknown_found is not used by the visible cells that follow.
agg_ids_unknown_found = df_ids_unknown[['o', 'ont_id', 'namespace']]\
    .groupby(['ont_id', 'namespace']).agg('count')\
    .reset_index()

In [34]:
df_ids_unknown[['o', 'ont_id']].groupby(['ont_id']).agg('count')

Unnamed: 0_level_0,o
ont_id,Unnamed: 1_level_1
,655934
aeo,3
agricola,1112
arxiv,1
bams,2234
...,...
wormbase,13
wwf.ecoregion,36
ymdb,102
zfa,1


In [35]:
bioregistry['gc']

{'comment': 'see comment here: https://github.com/obophenotype/ncbitaxon/issues/47',
 'contributor': {'email': 'cthoyt@gmail.com',
  'github': None,
  'name': 'Charles Tapley Hoyt',
  'orcid': '0000-0003-4423-4370'},
 'description': 'Genetic code, mitochontrial genetic code, and other linked information to NCBI taxonomy entries.',
 'example': '11',
 'homepage': 'https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html',
 'name': 'Genetic Code',
 'pattern': '^\\d+$',
 'preferred_prefix': 'gc',
 'references': ['https://github.com/obophenotype/ncbitaxon/issues/47'],
 'synonyms': ['gc_id'],
 'uri_format': 'https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=cgencodes#SG$1'}

In [36]:
df_ids_unknown[df_ids_unknown.ont_id != ''][['o', 'ns_sep']].groupby('ns_sep').agg('count')

Unnamed: 0_level_0,o
ns_sep,Unnamed: 1_level_1
:,2986743
_,26572


Save the still-unmatched identifiers and the URI references for further classification

In [37]:
# Export the (s, p, o) triples of ids that no registry could resolve. The
# original row label is kept as an 'index' column by reset_index().
(
    df_ids_unknown
    .loc[df_ids_unknown['ont_id'] == '', ['s', 'p', 'o']]
    .reset_index()
    .to_json('results/xrefs_unknown_id.json')
)

In [38]:
# Export the (s, p, o) triples of all URI-style references. The original row
# label is kept as an 'index' column by reset_index().
(
    df_uris
    .loc[:, ['s', 'p', 'o']]
    .reset_index()
    .to_json('results/xrefs_uri.json')
)
