# Libraries and helper functions

In [133]:
from wikidataintegrator import wdi_core, wdi_login
import os
from rdflib import Graph, URIRef
import pandas as pd
import copy
from datetime import datetime

In [134]:
def createDOReference(doid):
    """Build the reference block used for statements taken from the Disease Ontology.

    The block contains, in this order:
      * P248 pointing at item Q5282129,
      * P813 set to today's date (time part fixed to 00:00:00),
      * P699 carrying the given ``doid``.

    :param doid: Disease Ontology identifier placed in the P699 reference snak.
    :return: list with the three WDI reference objects.
    """
    # Only the date part of "now" is kept; the clock time is zeroed in the format string.
    today = datetime.now().strftime("+%Y-%m-%dT00:00:00Z")
    return [
        wdi_core.WDItemID("Q5282129", prop_nr="P248", is_reference=True),
        wdi_core.WDTime(today, prop_nr="P813", is_reference=True),
        wdi_core.WDExternalID(doid, prop_nr="P699", is_reference=True),
    ]

def createIORef():
    """Build the reference block used for the identifiers.org statement.

    :return: list with a P248 snak pointing at item Q16335166 followed by a
        P854 snak holding the identifiers.org registry URL for ``doid``.
    """
    registry_url = "https://registry.identifiers.org/registry/doid"
    return [
        wdi_core.WDItemID("Q16335166", prop_nr="P248", is_reference=True),
        wdi_core.WDUrl(registry_url, prop_nr="P854", is_reference=True),
    ]

# Loading the Disease Ontology from the source

In [135]:
# Parse a pinned release (2020-11-11) of the Disease Ontology straight from
# GitHub into an in-memory rdflib graph. Later cells query `doGraph` with SPARQL.
print("\nDownloading the Disease Ontology...")
url = "https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/main/src/ontology/releases/2020-11-11/doid.owl"

doGraph = Graph()
# The OWL file is RDF/XML, hence format="xml".
doGraph.parse(url, format="xml")


Downloading the Disease Ontology...


<Graph identifier=N43a309b2e9314aa6ae08567c111d5e9b (<class 'rdflib.graph.Graph'>)>

In [136]:
# Pull every subject of the ontology graph that has both an oboInOwl:id and an
# rdfs:label. Note that the `obo:` prefix below is bound to the oboInOwl
# namespace, so `obo:id` is the same predicate as `oboInOwl:id`.
qres = doGraph.query(
    """
       PREFIX obo: <http://www.geneontology.org/formats/oboInOwl#>
       PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
       PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>

       SELECT DISTINCT ?do_uri ?doid ?label 
       WHERE {
          ?do_uri obo:id ?doid ;
                  rdfs:label ?label .

       } """)

# Collect the rows first and build the frame in one go. Appending row by row
# copies the whole frame on every iteration, and DataFrame.append no longer
# exists in pandas >= 2.0.
df_doNative = pd.DataFrame(
    [(str(do_uri), str(doid), str(label)) for do_uri, doid, label in qres],
    columns=["do_uri", "doid", "label"],
)

In [137]:
# Show the ontology terms as a table with columns do_uri, doid and label.
df_doNative

Unnamed: 0,do_uri,doid,label
0,http://purl.obolibrary.org/obo/DOID_0110070,DOID:0110070,arrhythmogenic right ventricular dysplasia 1
1,http://purl.obolibrary.org/obo/DOID_3667,DOID:3667,obsolete Cardiovirus infectious disease
2,http://purl.obolibrary.org/obo/DOID_0080478,DOID:0080478,peroxisome biogenesis disorder 3A
3,http://purl.obolibrary.org/obo/DOID_0050826,DOID:0050826,tricuspid valve disease
4,http://purl.obolibrary.org/obo/DOID_933,DOID:933,obsolete Cestoda infectious disease
...,...,...,...
12989,http://purl.obolibrary.org/obo/DOID_0050654,DOID:0050654,Baller-Gerold syndrome
12990,http://purl.obolibrary.org/obo/DOID_0050577,DOID:0050577,cranioectodermal dysplasia
12991,http://purl.obolibrary.org/obo/DOID_0060281,DOID:0060281,photosensitive epilepsy
12992,http://purl.obolibrary.org/obo/DOID_0070020,DOID:0070020,autosomal dominant dyskeratosis congenita 4


# Loading Disease Ontology identifiers from Wikidata

In [None]:
# Fetch every Wikidata item that has a P699 (Disease Ontology ID) statement.
query = """
  SELECT DISTINCT ?disease ?doid WHERE {?disease  wdt:P699 ?doid .}
"""
# One request is enough: the same frame serves as `df_wd` (used for the set
# difference later) and as the source for the lookup dictionaries.
df_wd = wdi_core.WDFunctionsEngine.execute_sparql_query(query, as_dataframe=True)
inwikidata = df_wd

# DOID -> Wikidata entity URI. When a DOID occurs on several items the last
# row wins, exactly as with a plain assignment loop.
doQids = {record["doid"]: record["disease"] for _, record in inwikidata.iterrows()}

# Reverse lookup: Wikidata entity URI -> DOID.
QidsDo = {entity: doid for doid, entity in doQids.items()}

doQids

In [None]:
# DOIDs that occur in the ontology release but on no Wikidata item, and the
# ontology rows that belong to them.
newItems = set(df_doNative["doid"].tolist()).difference(df_wd["doid"].tolist())
missing = df_doNative.loc[df_doNative["doid"].isin(newItems)]

In [None]:
# For every ontology term that has no Wikidata item yet, look for existing
# Wikidata items whose English label is identical to the term's label. The
# labels are sent in batches so that each SPARQL request stays small.
chunks = df_doNative[df_doNative["doid"].isin(newItems)]["label"].to_list()

pages = 50  # number of labels per SPARQL request
loops = [chunks[i:i+pages] for i in range(0, len(chunks), pages)]


def _sparql_en_literal(text):
    """Return ``text`` as an English-tagged SPARQL string literal.

    Backslashes, double quotes and line breaks are escaped so that a label
    cannot terminate the literal early and corrupt the query.
    """
    escaped = (
        text.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    return '"' + escaped + '"@en'


frames = []
sparql = None
for t, run in enumerate(loops, start=1):
    # Every label gets its own @en tag; previously the last label of each
    # batch was sent as a plain literal and could never match an English label.
    # The former `VALUES ?altLabel {...}` clause is gone: ?altLabel was not
    # used in any triple pattern, so it only multiplied the intermediate rows.
    values = "VALUES ?label {" + " ".join(_sparql_en_literal(label) for label in run) + "}"
    sparql = "SELECT DISTINCT ?disease ?label (GROUP_CONCAT(DISTINCT ?typeLabel) as ?types) (GROUP_CONCAT(DISTINCT ?sclassLabel) as ?subclasses) WHERE {"
    sparql += values 
    sparql += """  

    ?disease rdfs:label ?label .
    OPTIONAL {?disease wdt:P31 ?type.  ?type rdfs:label ?typeLabel . FILTER (LANG(?typeLabel) = "en")}    
    OPTIONAL {?disease wdt:P279 ?sclass .  ?sclass rdfs:label ?sclassLabel . FILTER (LANG(?sclassLabel) = "en")}                                                             
  } GROUP BY ?disease ?label"""
    frames.append(wdi_core.WDFunctionsEngine.execute_sparql_query(sparql, as_dataframe=True))
    print(str(t)+"/"+str(len(loops)))

# Print the last query for inspection; there is none when nothing is missing.
if sparql is not None:
    print(sparql)

# pd.concat raises on an empty list, so fall back to an empty frame that has
# the columns the query would have produced.
if frames:
    results = pd.concat(frames)
else:
    results = pd.DataFrame(columns=["disease", "label", "types", "subclasses"])
results

In [None]:
pd.set_option('display.max_rows', 10)

# NOTE(review): this overwrites the `results` frame produced by the label
# lookup, and it joins `missing` with the frame it was filtered from. The
# label-lookup frame may have been the intended right-hand side - confirm.
results = pd.merge(left=missing, right=df_doNative, how='left', left_on='label', right_on='label')

# Map symptom class URIs (http://purl.obolibrary.org/obo/SYMP_<soid>) to the
# Wikidata items that carry that value in P8656.
query = """
    SELECT ?symptom ?soid WHERE {
       ?symptom wdt:P8656 ?soid .
    }
    """
soQids = {}
inwikidata = wdi_core.WDFunctionsEngine.execute_sparql_query(query, as_dataframe=True)
for index, row in inwikidata.iterrows():
    soQids["http://purl.obolibrary.org/obo/SYMP_"+row["soid"]] = row["symptom"]

results

In [None]:
# Update every ontology term that already has a Wikidata item.
#
# NOTE(review): this cell calls `create(...)` and uses `login`, both of which
# are defined in cells further down the notebook. On a fresh kernel those
# cells have to be run first.

# Map symptom class URIs (http://purl.obolibrary.org/obo/SYMP_<soid>) to the
# Wikidata items that carry that value in P8656.
query = """
    SELECT ?symptom ?soid WHERE {
       ?symptom wdt:P8656 ?soid .
    }
    """
soQids = {}
inwikidata = wdi_core.WDFunctionsEngine.execute_sparql_query(query, as_dataframe=True)
for index, row in inwikidata.iterrows():
    soQids["http://purl.obolibrary.org/obo/SYMP_"+row["soid"]] = row["symptom"]

WD_ENTITY_PREFIX = "http://www.wikidata.org/entity/"
DOID_URI_PREFIX = "http://purl.obolibrary.org/obo/DOID_"

# Cross-reference prefix in skos:exactMatch -> Wikidata property that receives
# the identifier.
XREF_PROPERTIES = {
    "MESH:": "P486",
    "NCI:": "P1748",
    "ICD10CM:": "P4229",
    "UMLS_CUI:": "P2892",
}

# Shared header of the SPARQL queries run against the local ontology graph.
# `owl:` and `doid:` are not declared here; presumably they resolve through the
# namespace bindings rdflib took over from the parsed file - confirm.
GRAPH_QUERY_PREFIXES = """PREFIX obo: <http://www.geneontology.org/formats/oboInOwl#>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>

        """

for index, dorow in df_doNative.iterrows():
    doid = dorow["doid"]
    if doid not in doQids:
        # No Wikidata item yet: such terms are handled by create(), not here.
        print("skipping " + doid + ": not in Wikidata yet")
        continue

    do_reference = createDOReference(doid)
    identorg_reference = createIORef()
    uri = str(dorow["do_uri"])

    statements = []
    # Disease Ontology ID (P699); external-id datatype, as in createDOReference
    statements.append(wdi_core.WDExternalID(value=dorow["doid"], prop_nr="P699", references=[copy.deepcopy(do_reference)]))
    # exact match (P2888)
    statements.append(wdi_core.WDUrl(value=dorow["do_uri"], prop_nr="P2888", references=[copy.deepcopy(do_reference)]))
    # identifiers.org URI
    statements.append(wdi_core.WDUrl("http://identifiers.org/doid/"+dorow["doid"], prop_nr="P2888", references=[copy.deepcopy(identorg_reference)]))

    # Symptoms (P780): has_symptom restrictions on the class.
    query = GRAPH_QUERY_PREFIXES + "SELECT * WHERE {"
    query += "<"+uri+"> rdfs:subClassOf [ owl:onProperty doid:has_symptom ; owl:someValuesFrom ?symptom ] .} "
    for symptom_row in doGraph.query(query):
        symptom_uri = str(symptom_row[0])
        if symptom_uri not in soQids:
            # The symptom has no Wikidata item; report it instead of aborting the run.
            print(doid + ": symptom " + symptom_uri + " not in Wikidata, statement skipped")
            continue
        statements.append(wdi_core.WDItemID(value=soQids[symptom_uri].replace(WD_ENTITY_PREFIX, ""),
                                            prop_nr="P780", references=[copy.deepcopy(do_reference)]))

    # Parents (P279): named superclasses only (the FILTER drops blank nodes).
    query = GRAPH_QUERY_PREFIXES + "SELECT ?subClassOf WHERE {"
    query += "<"+uri+"> rdfs:subClassOf ?subClassOf .  FILTER (REGEX(str(?subClassOf), 'http', 'i'))} "
    for parent_row in doGraph.query(query):
        parent_doid = parent_row[0].replace(DOID_URI_PREFIX, "DOID:")
        if parent_doid not in doQids:
            # Parent is not in Wikidata yet: create it and remember the result.
            doQids[parent_doid] = create(parent_doid)
        statements.append(wdi_core.WDItemID(value=doQids[parent_doid].replace(WD_ENTITY_PREFIX, ""),
                                            prop_nr="P279", references=[copy.deepcopy(do_reference)]))

    # External identifiers taken from skos:exactMatch.
    query = """PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

        SELECT ?exactMatch WHERE {"""
    query += "<"+uri+"> skos:exactMatch ?exactMatch .}"
    for match_row in doGraph.query(query):
        extID = str(match_row[0])
        for xref_prefix, xref_prop in XREF_PROPERTIES.items():
            if extID.startswith(xref_prefix):
                # Store the bare identifier. The earlier code removed only
                # "MESH" (leaving ":D..." behind) and kept the prefix on the
                # other three.
                # NOTE(review): confirm the target properties expect bare ids.
                statements.append(wdi_core.WDExternalID(extID[len(xref_prefix):], prop_nr=xref_prop,
                                                        references=[copy.deepcopy(do_reference)]))

    item = wdi_core.WDItemEngine(wd_item_id=doQids[doid].replace(WD_ENTITY_PREFIX, ""), data=statements, keep_good_ref_statements=True)

    # Use the ontology label as English label when the item has none;
    # otherwise keep the existing label and add the ontology label as alias.
    if item.get_label() == "":
        item.set_label(dorow["label"], lang="en")
        if item.get_description() == "":
            item.set_description("human disease", lang="en")
    elif item.get_label() != dorow["label"]:
        aliases = item.get_aliases()
        if dorow["label"] not in aliases:
            aliases.append(dorow["label"])
            item.set_aliases(aliases)

    print(item.write(login))



In [None]:

# Read the Wikidata credentials from the environment so that they never end
# up in the saved notebook or its history.
WDUSER = os.environ.get("WDUSER")
WDPASS = os.environ.get("WDPASS")
if not WDUSER or not WDPASS:
    raise RuntimeError("Set the WDUSER and WDPASS environment variables before logging in to Wikidata")

# `login` is passed to every item.write(...) call in this notebook.
login = wdi_login.WDLogin(WDUSER, WDPASS)

In [None]:
# Dump the parsed ontology as Turtle to a fixed path under /tmp.
# NOTE(review): the path is hard-coded; nothing else in this notebook reads it.
doGraph.serialize(destination="/tmp/leesdit.ttl", format="turtle")

In [None]:
# Scratch cell: write a brand-new item from the module-level `statements` and
# `dorow`. Neither is assigned here; they hold whatever the update loop left
# behind on its last iteration.
item = wdi_core.WDItemEngine(new_item=True, data=statements, keep_good_ref_statements=True)

wanted_label = dorow["label"]
current_label = item.get_label()
if current_label == "":
    # No English label yet: use the ontology label, and add a default
    # description if that is missing as well.
    item.set_label(wanted_label, lang="en")
    if item.get_description() == "":
        item.set_description("human disease", lang="en")
elif current_label != wanted_label:
    # A different label exists: keep it and record the ontology label as alias.
    aliases = item.get_aliases()
    if wanted_label not in aliases:
        aliases.append(wanted_label)
        item.set_aliases(aliases)

item.get_wd_json_representation()

item.write(login)

In [None]:
# Scratch cell: write the module-level `statements` to the fixed item Q293533.
# `statements` and `dorow` are not assigned here; they hold whatever the
# update loop left behind on its last iteration.
item = wdi_core.WDItemEngine(wd_item_id="Q293533", data=statements, keep_good_ref_statements=True)

wanted_label = dorow["label"]
current_label = item.get_label()
if current_label == "":
    # No English label yet: use the ontology label, and add a default
    # description if that is missing as well.
    item.set_label(wanted_label, lang="en")
    if item.get_description() == "":
        item.set_description("human disease", lang="en")
elif current_label != wanted_label:
    # A different label exists: keep it and record the ontology label as alias.
    aliases = item.get_aliases()
    if wanted_label not in aliases:
        aliases.append(wanted_label)
        item.set_aliases(aliases)

item.get_wd_json_representation()

item.write(login)

In [None]:
def create(doid):
    """Build the statements for one Disease Ontology term and write them to Wikidata.

    The term is looked up in ``df_doNative``; its symptoms, named superclasses
    and external identifiers are read from ``doGraph``. Superclasses that are
    not yet in ``doQids`` are created first through a recursive call and then
    stored in ``doQids``. The statements are written through a
    ``WDItemEngine`` that is constructed without an explicit item id.

    :param doid: identifier of the term, in the form used in the ``doid``
        column of ``df_doNative`` (e.g. ``"DOID:3667"``).
    :return: whatever ``item.write(login)`` returns; callers store it in
        ``doQids`` as the identifier of the written item.
    :raises IndexError: if ``doid`` does not occur in ``df_doNative``.
    """
    wd_entity_prefix = "http://www.wikidata.org/entity/"
    doid_uri_prefix = "http://purl.obolibrary.org/obo/DOID_"
    # Cross-reference prefix in skos:exactMatch -> Wikidata property that
    # receives the identifier.
    xref_properties = {
        "MESH:": "P486",
        "NCI:": "P1748",
        "ICD10CM:": "P4229",
        "UMLS_CUI:": "P2892",
    }
    # `owl:` and `doid:` are not declared here; presumably they resolve through
    # the namespace bindings rdflib took over from the parsed file - confirm.
    graph_query_prefixes = """PREFIX obo: <http://www.geneontology.org/formats/oboInOwl#>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>

        """

    do_reference = createDOReference(doid)
    identorg_reference = createIORef()
    dorow = df_doNative[df_doNative["doid"] == doid].iloc[0]
    uri = str(dorow["do_uri"])

    statements = []
    # Disease Ontology ID (P699); external-id datatype, as in createDOReference
    statements.append(wdi_core.WDExternalID(value=dorow["doid"], prop_nr="P699", references=[copy.deepcopy(do_reference)]))
    # exact match (P2888)
    statements.append(wdi_core.WDUrl(value=dorow["do_uri"], prop_nr="P2888", references=[copy.deepcopy(do_reference)]))
    # identifiers.org URI
    statements.append(wdi_core.WDUrl("http://identifiers.org/doid/"+dorow["doid"], prop_nr="P2888", references=[copy.deepcopy(identorg_reference)]))

    # Symptoms (P780): has_symptom restrictions on the class.
    query = graph_query_prefixes + "SELECT * WHERE {"
    query += "<"+uri+"> rdfs:subClassOf [ owl:onProperty doid:has_symptom ; owl:someValuesFrom ?symptom ] .} "
    symptom_rows = list(doGraph.query(query))
    if symptom_rows:
        # The symptom map costs a full Wikidata query, so it is only fetched
        # for terms that actually have symptoms (this function also recurses).
        symptom_query = """
    SELECT ?symptom ?soid WHERE {
       ?symptom wdt:P8656 ?soid .
    }
    """
        inwikidata = wdi_core.WDFunctionsEngine.execute_sparql_query(symptom_query, as_dataframe=True)
        soQids = {
            "http://purl.obolibrary.org/obo/SYMP_" + record["soid"]: record["symptom"]
            for _, record in inwikidata.iterrows()
        }
        for symptom_row in symptom_rows:
            symptom_uri = str(symptom_row[0])
            if symptom_uri not in soQids:
                # The symptom has no Wikidata item; report it instead of aborting.
                print(doid + ": symptom " + symptom_uri + " not in Wikidata, statement skipped")
                continue
            statements.append(wdi_core.WDItemID(value=soQids[symptom_uri].replace(wd_entity_prefix, ""),
                                                prop_nr="P780", references=[copy.deepcopy(do_reference)]))

    # Parents (P279): named superclasses only (the FILTER drops blank nodes).
    query = graph_query_prefixes + "SELECT ?subClassOf WHERE {"
    query += "<"+uri+"> rdfs:subClassOf ?subClassOf .  FILTER (REGEX(str(?subClassOf), 'http', 'i'))} "
    for parent_row in doGraph.query(query):
        parent_doid = parent_row[0].replace(doid_uri_prefix, "DOID:")
        if parent_doid not in doQids:
            # Parent is not in Wikidata yet: create it first and remember it.
            doQids[parent_doid] = create(parent_doid)
        statements.append(wdi_core.WDItemID(value=doQids[parent_doid].replace(wd_entity_prefix, ""),
                                            prop_nr="P279", references=[copy.deepcopy(do_reference)]))

    # External identifiers taken from skos:exactMatch.
    query = """PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

        SELECT ?exactMatch WHERE {"""
    query += "<"+uri+"> skos:exactMatch ?exactMatch .}"
    for match_row in doGraph.query(query):
        extID = str(match_row[0])
        for xref_prefix, xref_prop in xref_properties.items():
            if extID.startswith(xref_prefix):
                # Store the bare identifier. The earlier code removed only
                # "MESH" (leaving ":D..." behind) and kept the prefix on the
                # other three.
                # NOTE(review): confirm the target properties expect bare ids.
                statements.append(wdi_core.WDExternalID(extID[len(xref_prefix):], prop_nr=xref_prop,
                                                        references=[copy.deepcopy(do_reference)]))

    item = wdi_core.WDItemEngine(data=statements, keep_good_ref_statements=True)

    # Use the ontology label as English label when the item has none;
    # otherwise keep the existing label and add the ontology label as alias.
    if item.get_label() == "":
        item.set_label(dorow["label"], lang="en")
        if item.get_description() == "":
            item.set_description("human disease", lang="en")
    elif item.get_label() != dorow["label"]:
        aliases = item.get_aliases()
        if dorow["label"] not in aliases:
            aliases.append(dorow["label"])
            item.set_aliases(aliases)

    return item.write(login)

In [None]:
# Create a Wikidata item for every ontology term that has none yet.
for doid in missing["doid"]:
    print(doid)
    if doid not in doQids:
        # Record the result so that later terms find this one as a parent
        # instead of creating it a second time.
        doQids[doid] = create(doid)
    # Terms already in doQids at this point were created earlier in this run
    # as the parent of another term; they are only reported, not re-submitted.
    print(doQids[doid])

In [None]:
# Number of ontology terms without a Wikidata item.
len(missing)

In [None]:
# The DOIDs of the terms without a Wikidata item.
missing["doid"]

In [None]:
# NOTE(review): bare module reference; it only displays the module repr and
# looks like a leftover from interactive exploration.
wdi_core