In [73]:
import os, sys
import pandas as pd
from rdflib.graph import Graph

# Notebook goal

This notebook explores the content of the KEGG datasets obtained during BH17, for data integration into Hetionet:
    - disease_drug.n3 (disease-drug)
    - disease_hsa.n3 (disease-gene)
    - drug_hsa.n3 (drug-gene)
    

In [74]:
# Input and output directories, located under the current working directory.
indir = os.path.join(os.getcwd(), 'data')
outdir = os.path.join(os.getcwd(), 'out')

# exist_ok=True creates a directory only when it is missing, so no separate
# isdir() check is needed and re-running the cell is harmless.
os.makedirs(indir, exist_ok=True)
os.makedirs(outdir, exist_ok=True)

# Put the current directory first on the module search path.
sys.path.insert(0, '.')

## Disease-Drug exploration

ID Schemes:
    - Rephetio: disease [doid DOID:1234]; drug [drugbank DB12345]
    - KEGG: disease [ds:H12345]; drug [dr:D12345]

In [75]:
# Load the disease-drug triples from the N3 file into an RDF graph.
g = Graph()
g.parse(os.path.join(indir, 'disease_drug.n3'), format="n3")

# Save the (disease, drug) pairs in tab-separated format.
# Only the part after the last '/' of the subject and object is kept; [-1]
# keeps the whole term when it contains no '/', instead of raising IndexError.
# The file is written as UTF-8 explicitly because pd.read_csv reads UTF-8 by
# default, whereas open() would otherwise use the platform locale encoding.
dsdr_path = os.path.join(outdir, 'disease_drug.tsv')
with open(dsdr_path, 'w', encoding='utf-8') as f:
    f.write('disease_keggId\tdrug_keggId\n')
    for subj, pred, obj in g:  # the predicate is not exported
        f.write('{}\t{}\n'.format(subj.rsplit('/', 1)[-1], obj.rsplit('/', 1)[-1]))

# Read the table back as a dataframe for exploration.
dsdr_df = pd.read_csv(dsdr_path, sep='\t')
print('dimensions: {}'.format(dsdr_df.shape))
dsdr_df.head(2)

dimensions: (1584, 2)


Unnamed: 0,disease_keggId,drug_keggId
0,ds:H00284,dr:D00423
1,ds:H00835,dr:D00535


## Disease-Gene exploration

ID Schemes:
    - Rephetio: disease [doid DOID:1234]; gene [entrez id 12345]
    - KEGG: disease [ds:H12345]; gene [hsa:123..n]

In [76]:
# Load the disease-gene triples from the N3 file into an RDF graph.
g = Graph()
g.parse(os.path.join(indir, 'disease_hsa.n3'), format="n3")

# Save the (disease, gene) pairs in tab-separated format.
# Only the part after the last '/' of the subject and object is kept; [-1]
# keeps the whole term when it contains no '/', instead of raising IndexError.
# The file is written as UTF-8 explicitly because pd.read_csv reads UTF-8 by
# default, whereas open() would otherwise use the platform locale encoding.
dshsa_path = os.path.join(outdir, 'disease_hsa.tsv')
with open(dshsa_path, 'w', encoding='utf-8') as f:
    f.write('disease_keggId\tgene_keggId\n')
    for subj, pred, obj in g:  # the predicate is not exported
        f.write('{}\t{}\n'.format(subj.rsplit('/', 1)[-1], obj.rsplit('/', 1)[-1]))

# Read the table back as a dataframe for exploration.
dshsa_df = pd.read_csv(dshsa_path, sep='\t')
print('dimensions: {}'.format(dshsa_df.shape))
dshsa_df.head(2)

dimensions: (5171, 2)


Unnamed: 0,disease_keggId,gene_keggId
0,ds:H00238,hsa:2187
1,ds:H01367,hsa:55687


## Drug-Gene exploration

ID Schemes:
    - Rephetio: drug [drugbank DB12345]; gene [entrez id 12345]
    - KEGG: drug [dr:D12345]; gene [hsa:123..n]

In [77]:
# Load the drug-gene triples from the N3 file into an RDF graph.
g = Graph()
g.parse(os.path.join(indir, 'drug_hsa.n3'), format="n3")

# Save the (drug, gene) pairs in tab-separated format.
# Only the part after the last '/' of the subject and object is kept; [-1]
# keeps the whole term when it contains no '/', instead of raising IndexError.
# The file is written as UTF-8 explicitly because pd.read_csv reads UTF-8 by
# default, whereas open() would otherwise use the platform locale encoding.
drhsa_path = os.path.join(outdir, 'drug_hsa.tsv')
with open(drhsa_path, 'w', encoding='utf-8') as f:
    f.write('drug_keggId\tgene_keggId\n')
    for subj, pred, obj in g:  # the predicate is not exported
        f.write('{}\t{}\n'.format(subj.rsplit('/', 1)[-1], obj.rsplit('/', 1)[-1]))

# Read the table back as a dataframe for exploration.
drhsa_df = pd.read_csv(drhsa_path, sep='\t')
print('dimensions: {}'.format(drhsa_df.shape))
drhsa_df.head(2)

dimensions: (9951, 2)


Unnamed: 0,drug_keggId,gene_keggId
0,dr:D03781,hsa:1588
1,dr:D00720,hsa:1131


## Mapping KEGG IDs to Rephetio ID scheme
KEGG Medicus: http://www.genome.jp/kegg/medicus.html
    
       - drug: KEGG ID (KEGG MEDICUS - DRUG)
       - disease: KEGG ID (KEGG MEDICUS - DISEASE)
       - gene: not necessary (KEGG usually uses the NCBI GeneID or INSDC Locus_tag [http://www.kegg.jp/kegg/docs/weblink.html])

## Comparing KEGG and Hetionet v1.0 datasets
| Edges | KEGG | HETIONET | HETIONET SOURCES |
|:------|-----|----------|------------------:|
| Disease-Drug | 1584 | 755 | [MEDI-HPS, LabeledIn, ehrlink, PREDICT] |
| Disease-Gene | 5171 | 12600 | [DISEASES, DOAF, DisGeNET, GWAS] |
| Drug-Gene | 9951 | 24600 | [DrugCentral, BindingDB, DrugBank] |
