In [1]:
import shutil
import wget
import bz2
import sys
import json
import os
import xmltodict
import pickle
import gzip
import rdflib
import requests
from clint.textui import progress
from collections import defaultdict
import zipfile
from rdflib import Namespace
from rdflib.term import URIRef
from os.path import exists
from rdflib.namespace import DC, DCTERMS, DOAP, FOAF, SKOS, OWL, RDF, RDFS, VOID, XMLNS, XSD
from urllib.parse import quote
from tqdm import tqdm

#create this bar_progress method which is invoked automatically from wget
def download(from_, to_, headers_=None, data_=None):
    r = requests.post(from_, stream=True, headers=headers_, data=data_)
    with open(to_, "wb") as out:
        if "content-length" not in r.headers:
            for ch in r.iter_content(chunk_size = 2391975):
                if ch:
                    out.write(ch)
        else:
            total_length = int(r.headers.get('content-length'))
            for ch in progress.bar(r.iter_content(chunk_size = 2391975), expected_size=(total_length/1024) + 1):
                if ch:
                    out.write(ch)

  # Methods
def read_nodes(lst):
    nodes = set()
    for path in lst:
        content = None
        with open(path, encoding="utf8") as f:
            content = f.readlines()
        content = [x.strip() for x in content]

        for line in content:
            head,rel,tail = line.split("\t")
            nodes.add(head)
            nodes.add(tail)
    return nodes

def read_line(path, skip_first):
    with open(path, encoding="utf8") as infile:
        c = 0
        while True:
            line = infile.readline()
            if not line:
                break
            if c % 100000 == 0:
                print(c)
            c += 1
            if skip_first and c == 0:
                continue
            yield line

def divide_chunks(l, n):
    l = list(l)
    for i in range(0, len(l), n): 
        yield l[i:i + n]

# Downloads and loading dataset

In [32]:
# Path setup
train_path = r"cache\train.txt"
test_path = r"cache\test.txt"
valid_path = r"cache\valid.txt"

if not exists('cache'):
    os.makedirs('cache')

if not exists('cache/biokg.zip'):
    url = "http://snap.stanford.edu/ogb/data/linkproppred/biokg.zip"
    download(url, 'cache/biokg.zip')
    import zipfile
    with zipfile.ZipFile('cache/biokg.zip', 'r') as zip_ref:
        zip_ref.extractall('cache')

if not exists('cache/data.zip'):
    url = "https://github.com/OpenBioLink/Utilities/raw/main/data/ogbl-biokg/data.zip"
    download(url, 'cache/data.zip')
    import zipfile
    with zipfile.ZipFile('cache/data.zip', 'r') as zip_ref:
        zip_ref.extractall('cache')

nodes = read_nodes([ train_path, test_path, valid_path ])
typednodes = defaultdict(set)
for node in nodes:
    typ, idx = node.split(":")
    typednodes[typ].add(idx)

if not exists('cache/gene.json'):
    results = []
    for x in tqdm(divide_chunks(typednodes["protein"], 1000), total=int(len(typednodes["protein"])/1000)):
        ids_str = ",".join(x)
        response = requests.post(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&rettype=docsum", data={'id':f'{ids_str}'}, headers={'content-type': 'application/x-www-form-urlencoded'})
        data_dict = xmltodict.parse(response.text)
        results.extend(data_dict["eSummaryResult"]["DocumentSummarySet"]["DocumentSummary"])


    with open(f"cache/gene.json", "w") as out:
        json.dump(results, out, indent = 4)

if not exists('cache/drug.json'):
    results = []
    for x in tqdm(divide_chunks(typednodes["drug"], 10), total=int(len(typednodes["drug"])/10)):
        ids_str = ",".join(x)
        response = requests.get(f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{ids_str}/description/JSON")
        results.extend(response.json().get("InformationList", {}).get("Information", []))

    with open(f"cache/drug.json", "w") as out:
        json.dump(results, out, indent = 4)

In [34]:

results = []
for x in tqdm(divide_chunks(typednodes["drug"], 100), total=int(len(typednodes["drug"])/100)):
    ids_str = ",".join(x)
    print(ids_str)
    response = requests.get(f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{ids_str}/description/JSON")
    results.extend(response.json().get("InformationList", {}).get("Information", []))

with open(f"cache/drugs.json", "w") as out:
    json.dump(results, out, indent = 4)

  0%|          | 0/105 [00:00<?, ?it/s]

7527,8781,7936,3784,2962,3518,8078,3833,3130,6612,2227,9706,1975,3347,3041,1345,1148,675,8280,7230,1773,7798,2946,10012,9142,1809,7492,6736,5814,988,6501,7085,8366,3766,1867,8660,3214,6785,3995,3405,2315,6397,1047,4371,3753,1482,5608,10331,9440,3008,8414,2441,2126,8603,4490,1992,7177,9173,6884,8379,779,2209,2150,6538,7219,6921,6649,7066,3187,544,8011,7995,1131,10296,3891,6414,7888,9153,1375,5367,8634,8447,1280,10105,8710,7862,8399,3408,1182,5896,391,479,7130,4938,9556,9780,9652,5242,5091,7779


  1%|          | 1/105 [00:11<20:23, 11.76s/it]

6462,452,2131,8715,7642,5123,4066,6653,7809,3727,7489,1374,4551,9598,6261,9180,7727,9538,6157,8119,8304,2567,9189,3208,9397,1228,8866,4646,10481,6747,9114,5768,2089,4893,9446,3521,6494,695,446,2979,5064,8927,5572,7518,4385,7982,10277,7684,5061,5752,6780,2716,9131,7691,1363,9763,2388,7851,5463,8680,2808,1904,4189,6411,5095,2062,2894,1216,6172,6396,2611,671,2498,777,3155,3283,5021,190,4563,161,9827,8585,8923,359,276,7508,4077,10153,8001,9386,2776,9023,5128,4693,3895,1430,1468,8713,2752,7499


KeyboardInterrupt: 

# Preprocessing

After preprocessing the dictonary `values` contains 5 keys for each type `"disease", "drug", "function", "protein", "sideeffect"`. Each contains another dictionary that maps Ids to Tuple (Label, Description, wwwresource).

In [3]:
values = {}

## Gene

In [4]:
with open('cache/gene.json', 'r') as infile:
    data=infile.read()
genes_ = json.loads(data)

values["protein"] = {}
for gene in genes_:
    try:
        if "error" not in gene:
            uid = gene["@uid"]
            label = gene["Name"]
            if gene["Description"] is not None:
                label = ": " + gene["Description"]
            description = gene["Summary"]
            wwwresource = "http://identifiers.org/NCBIGENE:" + uid
            values["protein"][uid] = (label, description, wwwresource)
    except Exception as e:
        print(e)
        print(gene)
        raise

## Disease

In [5]:

from owlready2 import *

doid_world = World()
try:
       onto = doid_world.get_ontology("http://purl.obolibrary.org/obo/doid.owl").load()
except OwlReadyOntologyParsingError:
       # owlready2 throws weird error on import in owl file, ignoring works fine for this usecase :)
       pass

r = list(doid_world.sparql("""
prefix oboInOwl: <http://www.geneontology.org/formats/oboInOwl#> 
SELECT *
       WHERE {
       ?x rdfs:label ?label .
       ?x oboInOwl:hasDbXref ?ref .
       OPTIONAL {  ?x obo:IAO_0000115 ?descr }
       }
"""))

values["disease"] = {}
for line in r:
       if line[2].startswith("UMLS_CUI:"):
              uid = line[2].replace("UMLS_CUI:", "")
              values["disease"][uid] = (line[1], line[3], "http://identifiers.org/" + str(line[0]).replace("_",":"))



## Function

In [6]:
from owlready2 import *

go_world = World()
try:
       onto = go_world.get_ontology("http://purl.obolibrary.org/obo/go.owl").load()
except OwlReadyOntologyParsingError:
       # owlready2 throws weird error on import in owl file, ignoring works fine for this usecase :)
       pass

r = list(go_world.sparql("""
       SELECT *
       WHERE {
       ?x rdfs:label ?label .
       OPTIONAL {  ?x obo:IAO_0000115 ?descr }
       }
"""))

values["function"] = {}
for line in r:
       if str(line[0]).startswith("obo.GO_"):
              uid = str(line[0]).replace("obo.GO_","GO:")
              values["function"][uid] = (line[1], line[2], "http://identifier.org/" + uid)


## Drug

In [24]:
with open('cache/drug.json', 'r') as infile:
    data=infile.read()
drugs_ = json.loads(data)

title_ = {}
descr_ = {}
for drug in drugs_:
    if "Title" in drug:
        title_[drug["CID"]] = drug["Title"]
    elif "Description" in drug:
        descr_[drug["CID"]] = drug["Description"]
    else:
        print("ALARM ..., ALARM!")
        print(drug)


values["drug"] = {}
for cid in title_.keys():
    uid = str(cid)
    values["drug"]["CID" + uid.rjust(9, '0')] = (title_[cid], descr_.get(cid, None), "http://identifiers.org/PUBCHEM.COMPOUND:" + uid)

## Sideeffects

In [8]:
from owlready2 import *

hp_world = World()
try:
       onto = hp_world.get_ontology("https://raw.githubusercontent.com/obophenotype/human-phenotype-ontology/master/hp.owl").load()
except OwlReadyOntologyParsingError:
       # owlready2 throws weird error on import in owl file, ignoring works fine for this usecase :)
       pass

r = list(hp_world.sparql("""
prefix oboInOwl: <http://www.geneontology.org/formats/oboInOwl#> 
SELECT *
       WHERE {
       ?x rdfs:label ?label .
       ?x oboInOwl:hasDbXref ?ref .
       OPTIONAL {  ?x obo:IAO_0000115 ?descr }
       }
"""))

values["sideeffect"] = {}
for line in r:
       if line[2].startswith("UMLS:"):
              uid = line[2].replace("UMLS:","")
              values["sideeffect"][uid] = (line[1], line[3], "http://identifiers.org/" + str(line[0]).replace("_",":"))

# Graph generation

In [9]:
ai = rdflib.Namespace("http://ai-strategies.org/ns/")

g = rdflib.Graph()

g.bind("ai", ai)
g.bind("rdf", RDF)
g.bind("rdfs", RDFS)

In [30]:

for type_ in ["disease", "drug", "function", "protein", "sideeffect"]:
    print(type_)
    c = 0
    g = 0
    with open(f"cache/missing_{type_}.txt", "w") as outf:
        with gzip.open(f'cache/biokg/mapping/{type_}_entidx2name.csv.gz', 'rt') as f:
            content = f.readlines()
        content = [x.strip() for x in content[1:]]
        for line in content:
            idx, identifier = line.split(",")
            id = f"{type_}:{idx}"
            if identifier not in values[type_]:
                #print("ALARM ..., ALARM!!!", identifier)
                c = c + 1
                outf.write(identifier + "\n")
            else:
                g = g + 1
        print(g)
        print(c)

disease
2366
8321
drug
1842
8691
function
41768
3317
protein
7655
9844
sideeffect
1062
8907


In [22]:
for type_ in ["disease", "drug", "function", "protein", "sideeffect"]:
    print(type_)
    with gzip.open(f'cache/biokg/mapping/disease_entidx2name.csv.gz', 'rt') as f:
        content = f.readlines()
    content = [x.strip() for x in content[1:]]
    for line in content:
        idx, identifier = line.split(",")
        id = f"{type_}:{idx}"
        
        if identifier not in values[type_]:
            print("ALARM ..., ALARM!!!", identifier)
            continue

        name, description, url = values[type_][identifier]

        g.add((
            ai.term(quote(id)),
            RDFS.label,
            rdflib.Literal(name, datatype=XSD.string)

        ))
        g.add((
            ai.term(quote(id)),
            RDF.type,
            rdflib.Literal(type_, datatype=XSD.string)
        ))
        g.add((
            ai.term(quote(id)),
            ai.wwwresource,
            rdflib.Literal(url, datatype=XSD.string)
        ))
        if description is not None:
            g.add((
                ai.term(quote(id)),
                RDFS.comment,
                rdflib.Literal(description, datatype=XSD.string)
            ))


# Relations
with gzip.open('cache/mapping/relidx2relname.csv.gz', 'rb') as f:
    content = f.read()
content = [x.strip() for x in content[1:]]
for line in content:
    id, rel_label = line.split(",")
    g.add((
        ai.term(quote(id)),
        RDFS.label,
        rdflib.Literal(rel_label, datatype=XSD.string)
    ))


disease
ALARM ..., ALARM!!! C0000737
ALARM ..., ALARM!!! C0000768
ALARM ..., ALARM!!! C0000771
ALARM ..., ALARM!!! C0000772
ALARM ..., ALARM!!! C0000786
ALARM ..., ALARM!!! C0000809
ALARM ..., ALARM!!! C0000810
ALARM ..., ALARM!!! C0000814
ALARM ..., ALARM!!! C0000822
ALARM ..., ALARM!!! C0000832
ALARM ..., ALARM!!! C0001122
ALARM ..., ALARM!!! C0001127
ALARM ..., ALARM!!! C0001144
ALARM ..., ALARM!!! C0001193
ALARM ..., ALARM!!! C0001231
ALARM ..., ALARM!!! C0001263
ALARM ..., ALARM!!! C0001264
ALARM ..., ALARM!!! C0001338
ALARM ..., ALARM!!! C0001349
ALARM ..., ALARM!!! C0001364
ALARM ..., ALARM!!! C0001546
ALARM ..., ALARM!!! C0001623
ALARM ..., ALARM!!! C0001627
ALARM ..., ALARM!!! C0001723
ALARM ..., ALARM!!! C0001787
ALARM ..., ALARM!!! C0001849
ALARM ..., ALARM!!! C0001883
ALARM ..., ALARM!!! C0001890
ALARM ..., ALARM!!! C0001925
ALARM ..., ALARM!!! C0001956
ALARM ..., ALARM!!! C0001957
ALARM ..., ALARM!!! C0001969
ALARM ..., ALARM!!! C0001973
ALARM ..., ALARM!!! C0002063
ALARM 

In [None]:

g.serialize(os.path.abspath(r"ogbl-biokg.ttl"),format="turtle")

## Add dataset edges to graph

In [None]:
outfile = open(r"hetionet.ttl", 'a')

def read_set(path, typ):
    content = None
    with open(path) as f:
        content = f.readlines()
    content = [x.strip() for x in content]
    for line in content:
        head,rel,tail = line.split("\t")
        outfile.write(f"<<<http://ai-strategies.org/ns/{quote(head, safe='@~():/')}> <http://ai-strategies.org/ns/{quote(rel, safe='@~():/'))}> <http://ai-strategies.org/ns/{quote(tail, safe='@~():/'))}>>> ai:split ai:{typ} . " + "\n")


read_set(train_path, 'train')
read_set(test_path, 'test')
read_set(valid_path, 'valid')

outfile.close()

In [None]:
zipObj = zipfile.ZipFile('data.zip', 'w')
zipObj.write('ogbl-biokg.ttl', 'ogbl-biokg.ttl', zipfile.ZIP_DEFLATED)
zipObj.close()