# DRKG TigerGraph Schema and Import

## Connecting

Make sure you install pyTigerGraph (`pip install pyTigerGraph`) as well as Pandas and Numpy (`conda install pandas numpy`).

In [None]:
import pyTigerGraph as tg
import numpy as np
import pandas as pd

In [128]:
host = 'https://your-organization.i.tgcloud.io'
secret = "your-secret"
graph_name = "drkg"
user_name = "tigergraph"
password = "your-password"
token = tg.TigerGraphConnection(host=host, graphname=graph_name, username=user_name, password=password).getToken(secret, "1000000")[0]
conn = tg.TigerGraphConnection(host=host, graphname=graph_name, username=user_name, password=password, apiToken=token)

In [129]:
conn.echo()

'Hello GSQL'

## Schema

In [17]:
import pandas as pd
drkg_file = './drkg.tsv'
df = pd.read_csv(drkg_file, sep="\t",header = None)
triplets = df.values.tolist()

In [84]:
len(triplets)

5874260

In [48]:
rtypes = dict() # edge types per entity-couple
entity_dic = {} # entities organized per type
for triplet in triplets:
    [h,r,t] = triplet
    h_type = h.split("::")[0].replace(" " ,"")
    h_id = str(h.split("::")[1])
    t_type = t.split("::")[0].replace(" " ,"")
    t_id = str(t.split("::")[1])
    
    # add the type if not present
    if not h_type in entity_dic:
        entity_dic[h_type]={}
    if not t_type in entity_dic:
        entity_dic[t_type] ={}
    
    # add the edge type per type couple
    type_edge = f"{h_type}::{t_type}"
    if not type_edge in rtypes:
        rtypes[type_edge]=[]
    r = r.replace(" ","").replace(":","").replace("+","").replace(">","").replace("-","")
    if not r in rtypes[type_edge]:
        rtypes[type_edge].append(r)
    
    # spread entities
    if not h_id in entity_dic[h_type]:
        entity_dic[h_type][h_id] = h
    if not t in entity_dic[t_type]:
        entity_dic[t_type][t_id] = t
    
schema = ""
for entity_type in entity_dic.keys():
    schema += f"CREATE VERTEX {entity_type} (PRIMARY_ID Id STRING) With primary_id_as_attribute=\"true\"\n"
for endpoints in rtypes:
    [source_name, target_name] = endpoints.split("::")
    for edge_name in rtypes[endpoints]:
        schema += f"CREATE DIRECTED EDGE {edge_name} (FROM {source_name}, TO {target_name})\n"
print(schema)

CREATE VERTEX Gene (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE VERTEX Compound (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE VERTEX Disease (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE VERTEX Atc (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE VERTEX Tax (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE VERTEX BiologicalProcess (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE VERTEX Symptom (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE VERTEX Anatomy (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE VERTEX MolecularFunction (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE VERTEX PharmacologicClass (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE VERTEX CellularComponent (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE VERTEX Pathway (PRIMARY_ID Id STRING) With primary_id_as_attribute="tru

In [49]:
print(conn.gsql(
"""
use global
CREATE VERTEX Gene (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE VERTEX Compound (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE VERTEX Disease (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE VERTEX Atc (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE VERTEX Tax (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE VERTEX BiologicalProcess (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE VERTEX Symptom (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE VERTEX Anatomy (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE VERTEX MolecularFunction (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE VERTEX PharmacologicClass (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE VERTEX CellularComponent (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE VERTEX Pathway (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE VERTEX SideEffect (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE DIRECTED EDGE bioarxHumGenHumGenGeneGene (FROM Gene, TO Gene)
CREATE DIRECTED EDGE bioarxVirGenHumGenGeneGene (FROM Gene, TO Gene)
CREATE DIRECTED EDGE GNBRVGeneGene (FROM Gene, TO Gene)
CREATE DIRECTED EDGE GNBRQGeneGene (FROM Gene, TO Gene)
CREATE DIRECTED EDGE GNBRRgGeneGene (FROM Gene, TO Gene)
CREATE DIRECTED EDGE GNBRBGeneGene (FROM Gene, TO Gene)
CREATE DIRECTED EDGE GNBRIGeneGene (FROM Gene, TO Gene)
CREATE DIRECTED EDGE GNBREGeneGene (FROM Gene, TO Gene)
CREATE DIRECTED EDGE GNBRHGeneGene (FROM Gene, TO Gene)
CREATE DIRECTED EDGE GNBRWGeneGene (FROM Gene, TO Gene)
CREATE DIRECTED EDGE HetionetGiGGeneGene (FROM Gene, TO Gene)
CREATE DIRECTED EDGE HetionetGcGGeneGene (FROM Gene, TO Gene)
CREATE DIRECTED EDGE HetionetGrGGeneGene (FROM Gene, TO Gene)
CREATE DIRECTED EDGE INTACTASSOCIATIONGeneGene (FROM Gene, TO Gene)
CREATE DIRECTED EDGE INTACTPHYSICALASSOCIATIONGeneGene (FROM Gene, TO Gene)
CREATE DIRECTED EDGE INTACTCOLOCALIZATIONGeneGene (FROM Gene, TO Gene)
CREATE DIRECTED EDGE INTACTDEPHOSPHORYLATIONREACTIONGeneGene (FROM Gene, TO Gene)
CREATE DIRECTED EDGE INTACTCLEAVAGEREACTIONGeneGene (FROM Gene, TO Gene)
CREATE DIRECTED EDGE INTACTDIRECTINTERACTIONGeneGene (FROM Gene, TO Gene)
CREATE DIRECTED EDGE INTACTPHOSPHORYLATIONREACTIONGeneGene (FROM Gene, TO Gene)
CREATE DIRECTED EDGE INTACTADPRIBOSYLATIONREACTIONGeneGene (FROM Gene, TO Gene)
CREATE DIRECTED EDGE INTACTUBIQUITINATIONREACTIONGeneGene (FROM Gene, TO Gene)
CREATE DIRECTED EDGE INTACTPROTEINCLEAVAGEGeneGene (FROM Gene, TO Gene)
CREATE DIRECTED EDGE STRINGREACTIONGeneGene (FROM Gene, TO Gene)
CREATE DIRECTED EDGE STRINGCATALYSISGeneGene (FROM Gene, TO Gene)
CREATE DIRECTED EDGE STRINGACTIVATIONGeneGene (FROM Gene, TO Gene)
CREATE DIRECTED EDGE STRINGINHIBITIONGeneGene (FROM Gene, TO Gene)
CREATE DIRECTED EDGE STRINGOTHERGeneGene (FROM Gene, TO Gene)
CREATE DIRECTED EDGE STRINGBINDINGGeneGene (FROM Gene, TO Gene)
CREATE DIRECTED EDGE STRINGPTMODGeneGene (FROM Gene, TO Gene)
CREATE DIRECTED EDGE STRINGEXPRESSIONGeneGene (FROM Gene, TO Gene)
CREATE DIRECTED EDGE bioarxDrugVirGenCompoundGene (FROM Compound, TO Gene)
CREATE DIRECTED EDGE bioarxDrugHumGenCompoundGene (FROM Compound, TO Gene)
CREATE DIRECTED EDGE DRUGBANKtargetCompoundGene (FROM Compound, TO Gene)
CREATE DIRECTED EDGE DRUGBANKenzymeCompoundGene (FROM Compound, TO Gene)
CREATE DIRECTED EDGE DRUGBANKcarrierCompoundGene (FROM Compound, TO Gene)
CREATE DIRECTED EDGE GNBRECompoundGene (FROM Compound, TO Gene)
CREATE DIRECTED EDGE GNBRACompoundGene (FROM Compound, TO Gene)
CREATE DIRECTED EDGE GNBRNCompoundGene (FROM Compound, TO Gene)
CREATE DIRECTED EDGE GNBRKCompoundGene (FROM Compound, TO Gene)
CREATE DIRECTED EDGE GNBRBCompoundGene (FROM Compound, TO Gene)
CREATE DIRECTED EDGE GNBROCompoundGene (FROM Compound, TO Gene)
CREATE DIRECTED EDGE GNBRZCompoundGene (FROM Compound, TO Gene)
CREATE DIRECTED EDGE HetionetCbGCompoundGene (FROM Compound, TO Gene)
CREATE DIRECTED EDGE HetionetCuGCompoundGene (FROM Compound, TO Gene)
CREATE DIRECTED EDGE HetionetCdGCompoundGene (FROM Compound, TO Gene)
CREATE DIRECTED EDGE INTACTPHYSICALASSOCIATIONCompoundGene (FROM Compound, TO Gene)
CREATE DIRECTED EDGE INTACTDIRECTINTERACTIONCompoundGene (FROM Compound, TO Gene)
CREATE DIRECTED EDGE INTACTASSOCIATIONCompoundGene (FROM Compound, TO Gene)
CREATE DIRECTED EDGE bioarxCovid2_acc_host_geneDiseaseGene (FROM Disease, TO Gene)
CREATE DIRECTED EDGE bioarxCoronavirus_ass_host_geneDiseaseGene (FROM Disease, TO Gene)
CREATE DIRECTED EDGE HetionetDdGDiseaseGene (FROM Disease, TO Gene)
CREATE DIRECTED EDGE HetionetDaGDiseaseGene (FROM Disease, TO Gene)
CREATE DIRECTED EDGE HetionetDuGDiseaseGene (FROM Disease, TO Gene)
CREATE DIRECTED EDGE DGIDBINHIBITORGeneCompound (FROM Gene, TO Compound)
CREATE DIRECTED EDGE DGIDBANTAGONISTGeneCompound (FROM Gene, TO Compound)
CREATE DIRECTED EDGE DGIDBOTHERGeneCompound (FROM Gene, TO Compound)
CREATE DIRECTED EDGE DGIDBAGONISTGeneCompound (FROM Gene, TO Compound)
CREATE DIRECTED EDGE DGIDBBINDERGeneCompound (FROM Gene, TO Compound)
CREATE DIRECTED EDGE DGIDBMODULATORGeneCompound (FROM Gene, TO Compound)
CREATE DIRECTED EDGE DGIDBBLOCKERGeneCompound (FROM Gene, TO Compound)
CREATE DIRECTED EDGE DGIDBCHANNELBLOCKERGeneCompound (FROM Gene, TO Compound)
CREATE DIRECTED EDGE DGIDBANTIBODYGeneCompound (FROM Gene, TO Compound)
CREATE DIRECTED EDGE DGIDBPOSITIVEALLOSTERICMODULATORGeneCompound (FROM Gene, TO Compound)
CREATE DIRECTED EDGE DGIDBALLOSTERICMODULATORGeneCompound (FROM Gene, TO Compound)
CREATE DIRECTED EDGE DGIDBACTIVATORGeneCompound (FROM Gene, TO Compound)
CREATE DIRECTED EDGE DGIDBPARTIALAGONISTGeneCompound (FROM Gene, TO Compound)
CREATE DIRECTED EDGE DRUGBANKxatcCompoundAtc (FROM Compound, TO Atc)
CREATE DIRECTED EDGE DRUGBANKddiinteractorinCompoundCompound (FROM Compound, TO Compound)
CREATE DIRECTED EDGE HetionetCrCCompoundCompound (FROM Compound, TO Compound)
CREATE DIRECTED EDGE DRUGBANKtreatsCompoundDisease (FROM Compound, TO Disease)
CREATE DIRECTED EDGE GNBRTCompoundDisease (FROM Compound, TO Disease)
CREATE DIRECTED EDGE GNBRCCompoundDisease (FROM Compound, TO Disease)
CREATE DIRECTED EDGE GNBRSaCompoundDisease (FROM Compound, TO Disease)
CREATE DIRECTED EDGE GNBRPaCompoundDisease (FROM Compound, TO Disease)
CREATE DIRECTED EDGE GNBRMpCompoundDisease (FROM Compound, TO Disease)
CREATE DIRECTED EDGE GNBRPrCompoundDisease (FROM Compound, TO Disease)
CREATE DIRECTED EDGE GNBRJCompoundDisease (FROM Compound, TO Disease)
CREATE DIRECTED EDGE HetionetCtDCompoundDisease (FROM Compound, TO Disease)
CREATE DIRECTED EDGE HetionetCpDCompoundDisease (FROM Compound, TO Disease)
CREATE DIRECTED EDGE GNBRLGeneDisease (FROM Gene, TO Disease)
CREATE DIRECTED EDGE GNBRUGeneDisease (FROM Gene, TO Disease)
CREATE DIRECTED EDGE GNBRYGeneDisease (FROM Gene, TO Disease)
CREATE DIRECTED EDGE GNBRJGeneDisease (FROM Gene, TO Disease)
CREATE DIRECTED EDGE GNBRTeGeneDisease (FROM Gene, TO Disease)
CREATE DIRECTED EDGE GNBRMdGeneDisease (FROM Gene, TO Disease)
CREATE DIRECTED EDGE GNBRGGeneDisease (FROM Gene, TO Disease)
CREATE DIRECTED EDGE GNBRDGeneDisease (FROM Gene, TO Disease)
CREATE DIRECTED EDGE GNBRXGeneDisease (FROM Gene, TO Disease)
CREATE DIRECTED EDGE GNBRUdGeneDisease (FROM Gene, TO Disease)
CREATE DIRECTED EDGE GNBRin_taxGeneTax (FROM Gene, TO Tax)
CREATE DIRECTED EDGE HetionetGpBPGeneBiologicalProcess (FROM Gene, TO BiologicalProcess)
CREATE DIRECTED EDGE HetionetDpSDiseaseSymptom (FROM Disease, TO Symptom)
CREATE DIRECTED EDGE HetionetDlADiseaseAnatomy (FROM Disease, TO Anatomy)
CREATE DIRECTED EDGE HetionetDrDDiseaseDisease (FROM Disease, TO Disease)
CREATE DIRECTED EDGE HetionetAdGAnatomyGene (FROM Anatomy, TO Gene)
CREATE DIRECTED EDGE HetionetAuGAnatomyGene (FROM Anatomy, TO Gene)
CREATE DIRECTED EDGE HetionetAeGAnatomyGene (FROM Anatomy, TO Gene)
CREATE DIRECTED EDGE HetionetGpMFGeneMolecularFunction (FROM Gene, TO MolecularFunction)
CREATE DIRECTED EDGE HetionetPCiCPharmacologicClassCompound (FROM PharmacologicClass, TO Compound)
CREATE DIRECTED EDGE HetionetGpCCGeneCellularComponent (FROM Gene, TO CellularComponent)
CREATE DIRECTED EDGE HetionetGpPWGenePathway (FROM Gene, TO Pathway)
CREATE DIRECTED EDGE HetionetCcSECompoundSideEffect (FROM Compound, TO SideEffect)
"""

))


Successfully created vertex types: [Gene].
Successfully created vertex types: [Compound].
Successfully created vertex types: [Disease].
Successfully created vertex types: [Atc].
Successfully created vertex types: [Tax].
Successfully created vertex types: [BiologicalProcess].
Successfully created vertex types: [Symptom].
Successfully created vertex types: [Anatomy].
Successfully created vertex types: [MolecularFunction].
Successfully created vertex types: [PharmacologicClass].
Successfully created vertex types: [CellularComponent].
Successfully created vertex types: [Pathway].
Successfully created vertex types: [SideEffect].
Successfully created edge types: [bioarxHumGenHumGenGeneGene].
Successfully created edge types: [bioarxVirGenHumGenGeneGene].
Successfully created edge types: [GNBRVGeneGene].
Successfully created edge types: [GNBRQGeneGene].
Successfully created edge types: [GNBRRgGeneGene].
Successfully created edge types: [GNBRBGeneGene].
Successfully created edge types: [GNBRIGe

## Data

Going full-scale leads to timeout and things, of course.
So, for demo purposes we sample from each and add some in this way. Since we use an upsert this will not duplicate any entity (the id is a primary key).

In [72]:
for entity_type in entity_dic:
    sample = np.random.choice(np.array(list(entity_dic[entity_type].values())),2000)
    for id in sample:        
        conn.upsertVertex(entity_type, id ,{})

KeyboardInterrupt: 

In [77]:
triple_count = len(triplets)
sample = np.random.choice(np.arange(triple_count),5000)
for i in sample:
    [h,r,t] = triplets[i]
    h_type = h.split("::")[0].replace(" " ,"")
    h_id = str(h.split("::")[1])
    t_type = t.split("::")[0].replace(" " ,"")
    t_id = str(t.split("::")[1])
    r = r.replace(" ","").replace(":","").replace("+","").replace(">","").replace("-","")
    
    conn.upsertEdge(h_type, h_id, r, t_type, t_id)

In [62]:
for entity_type in entity_dic:
    print(entity_type, len(entity_dic[entity_type]))

Gene 39220
Compound 24313
Disease 5103
Atc 4048
Tax 215
BiologicalProcess 11381
Symptom 415
Anatomy 400
MolecularFunction 2884
PharmacologicClass 345
CellularComponent 1391
Pathway 1822
SideEffect 5701


### Pushing specific diseases


In [134]:
disease_triples = []
for triple in triplets:
    [h,r,t] = triple
    if h=="Disease::SARS-CoV2 M" or t == "Disease::SARS-CoV2 M":
        disease_triples.append(triple)
print(len(disease_triples))        

30


In [137]:
for triple in disease_triples:
    [h,r,t] = triple
    h_type = h.split("::")[0].replace(" " ,"")
    h_id = str(h.split("::")[1])
    t_type = t.split("::")[0].replace(" " ,"")
    t_id = str(t.split("::")[1])
    r = r.replace(" ","").replace(":","").replace("+","").replace(">","").replace("-","")
    
    conn.upsertEdge(h_type, h_id, r, t_type, t_id)

In [None]:
print(conn.gsql(""))

## Fetching some data

In [86]:
df = conn.getVertexDataframe("Gene", limit=10)
df.head()

Unnamed: 0,v_id,Id
0,Gene::811,Gene::811
1,Gene::5104,Gene::5104
2,Gene::2103,Gene::2103
3,Gene::50,Gene::50
4,Gene::6132,Gene::6132


## From multi-graph to simple graph

In [87]:
import networkx as nx

In [104]:
g = nx.Graph()

In [98]:
node_ids = {(h,t) for [h,r,t] in triplets}
print(len(node_ids))

4974652


In [105]:
g.add_edges_from(list(node_ids))

In [124]:
print("nodes: ", len(g.nodes()), "edges: ", len(g.edges()))

nodes:  97238 edges:  4400766


In [126]:
nx.write_gexf(g, "./test.gephi")