In [1]:
import pandas as pd
import numpy as np

In [2]:
checkpoints_path = "https://github.com/RyanWangZf/BioBridge/raw/refs/heads/main/checkpoints/pytorch_model.bin"
checkpoints_dir = "checkpoints"

import os
if not os.path.exists(checkpoints_dir):
    os.makedirs(checkpoints_dir)

In [3]:
# import urllib.request
# urllib.request.urlretrieve(checkpoints_path, os.path.join(checkpoints_dir, "pytorch_model.bin"))

In [4]:
import os
os.environ["OPENAI_API_KEY"] = "XXX"
os.environ["NVDIA_API_KEY"] = "XXX"
import sys
sys.path.append("../../../")
from aiagents4pharma.talk2knowledgegraphs.models.biobridge.model import BioBridge

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import torch

model = BioBridge(n_node=10, 
                  n_relation=18, 
                  proj_dim={7: 768, 0: 768, 5: 768, 1: 2560, 6: 512, 2: 768},
                  hidden_dim=768,
                  n_layer=6)
model.load_state_dict(torch.load(os.path.join(checkpoints_dir, "pytorch_model.bin")))

<All keys matched successfully>

In [6]:
@torch.no_grad()
def transformation(
    model: BioBridge, 
    x: torch.Tensor,
    src_type,
    tgt_type,
    rel_type
    ):
    """Inference based on the trained Bridge model to project raw embeddings to the target space.

    Args:
        model (BindingModel): the trained Bridge model.
        x (torch.Tensor): the raw embeddings to be projected.
        src_type (int): the type of the source space.
        tgt_type (int): the type of the target space.
        rel_type (int): the type of the relation.
    """
    if torch.cuda.is_available():
        x = x.to("cuda:0")
        model = model.to("cuda:0")
    
    model.eval()
    head_type_ids = torch.tensor([src_type] * len(x)).to(x.device)
    rel_type_ids = torch.tensor([rel_type] * len(x)).to(x.device)
    tail_type_ids = torch.tensor([tgt_type] * len(x)).to(x.device)
    output = model(
        head_emb=x,
        head_type_ids=head_type_ids,
        rel_type_ids=rel_type_ids,
        tail_type_ids=tail_type_ids,
    )
    return output['embeddings']


@torch.no_grad()
def project(
    model: BioBridge,
    x: torch.Tensor,
    src_type: int,
    ):
    """Project the raw embeddings to it's space with the modality-specific projection head"""
    if torch.cuda.is_available():
        x = x.to("cuda:0")
        model.to("cuda:0")
    
    model.eval()
    output = model.projection(
        node_emb=x,
        node_type_id=src_type,
    )
    return output

In [7]:
import sys
sys.path.append('../../..')
from aiagents4pharma.talk2knowledgegraphs.datasets.biobridge_primekg import BioBridgePrimeKG

# Define biobridge primekg data by providing a local directory where the data is stored
biobridge_data = BioBridgePrimeKG(primekg_dir="../../../../data/primekg/",
                                  local_dir="../../../../data/biobridge_primekg/")

# Invoke a method to load the data
biobridge_data.load_data()

# Get the node information of the BioBridge PrimeKG
biobridge_node_info = biobridge_data.get_node_info_dict()
biobridge_node_info.keys()

Loading PrimeKG dataset...
Loading nodes of PrimeKG dataset ...
../../../../data/primekg/primekg_nodes.tsv.gz already exists. Loading the data from the local directory.
Loading edges of PrimeKG dataset ...
../../../../data/primekg/primekg_edges.tsv.gz already exists. Loading the data from the local directory.
Loading data config file of BioBridgePrimeKG...
File data_config.json already exists in ../../../../data/biobridge_primekg/.
Building node embeddings...
Building full triplets...
Building train-test split...
Building negative triplets...


dict_keys(['gene/protein', 'molecular_function', 'cellular_component', 'biological_process', 'drug', 'disease'])

In [8]:
node_embeddings = biobridge_data.get_node_embeddings()

In [9]:
triplets = biobridge_data.get_primekg_triplets()
triplets.head()

Unnamed: 0,head_index,head_name,head_source,head_id,head_type,tail_index,tail_name,tail_source,tail_id,tail_type,display_relation,relation
0,0,PHYHIP,NCBI,9796,1,8889,KIF15,NCBI,56992,1,3,protein_protein
1,1,GPANK1,NCBI,7918,1,2798,PNMA1,NCBI,9240,1,3,protein_protein
2,2,ZRSR2,NCBI,8233,1,5646,TTC33,NCBI,23548,1,3,protein_protein
3,3,NRF1,NCBI,4899,1,11592,MAN1B1,NCBI,11253,1,3,protein_protein
4,4,PI4KA,NCBI,5297,1,2122,RGS20,NCBI,8601,1,3,protein_protein


### Gene/Protein --> Interactions With --> Cellular Component Retrieval

In [10]:
import pandas as pd
import numpy as np

cellcomp_protein_triplets = triplets[((triplets.head_type==1) & (triplets.tail_type==7))]
cellcomp_protein_triplets.head()

Unnamed: 0,head_index,head_name,head_source,head_id,head_type,tail_index,tail_name,tail_source,tail_id,tail_type,display_relation,relation
2858483,7097,A1BG,NCBI,1,1,126078,ficolin-1-rich granule lumen,GO,1904813,7,2,cellcomp_protein
2858484,6561,ACLY,NCBI,47,1,126078,ficolin-1-rich granule lumen,GO,1904813,7,2,cellcomp_protein
2858485,5420,AGL,NCBI,178,1,126078,ficolin-1-rich granule lumen,GO,1904813,7,2,cellcomp_protein
2858486,7661,ALAD,NCBI,210,1,126078,ficolin-1-rich granule lumen,GO,1904813,7,2,cellcomp_protein
2858487,652,ALDOA,NCBI,226,1,126078,ficolin-1-rich granule lumen,GO,1904813,7,2,cellcomp_protein


In [11]:
cellcomp_protein_triplets[cellcomp_protein_triplets.head_name == 'IL6']

Unnamed: 0,head_index,head_name,head_source,head_id,head_type,tail_index,tail_name,tail_source,tail_id,tail_type,display_relation,relation
2859040,1567,IL6,NCBI,3569,1,124245,extracellular space,GO,5615,7,2,cellcomp_protein
2863248,1567,IL6,NCBI,3569,1,56174,extracellular region,GO,5576,7,2,cellcomp_protein
2913339,1567,IL6,NCBI,3569,1,56117,endoplasmic reticulum lumen,GO,5788,7,2,cellcomp_protein
2929976,1567,IL6,NCBI,3569,1,124639,interleukin-6 receptor complex,GO,5896,7,2,cellcomp_protein


In [13]:
x = torch.tensor(node_embeddings[1567]).unsqueeze(0)
x_transformed = transformation(model, x, 1, 7, 2)

In [14]:
cellcomp_as_tail_embeddings = torch.tensor([node_embeddings[i] for i in cellcomp_protein_triplets.tail_index.unique()])
# cellcomp_as_tail_embeddings = torch.tensor([node_embeddings[i] for i in [56241, 55840]])
cellcomp_as_tail_embeddings = project(model, cellcomp_as_tail_embeddings, 7)
cellcomp_as_tail_embeddings

tensor([[-0.0807, -0.2004,  0.2387,  ..., -0.0242, -0.0323, -0.1182],
        [ 0.0141, -0.0231, -0.0490,  ...,  0.0686,  0.0427, -0.0151],
        [-0.0069, -0.0176, -0.0296,  ...,  0.0992,  0.0151, -0.0302],
        ...,
        [ 0.0286,  0.0043,  0.1065,  ..., -0.1542, -0.0064, -0.0260],
        [ 0.0459, -0.0069,  0.0891,  ...,  0.0108,  0.1443,  0.0470],
        [-0.0481,  0.0733,  0.0166,  ..., -0.0518,  0.0415, -0.0794]],
       device='cuda:0')

In [15]:
sim_scores = torch.nn.CosineSimilarity(dim=-1)(x_transformed.cpu(), cellcomp_as_tail_embeddings.cpu())
sim_scores


tensor([ 0.3612,  0.8193,  0.4413,  ..., -0.3088, -0.4428, -0.1809])

In [18]:
cellcomp_protein_triplets[(cellcomp_protein_triplets.head_index == 1567)]

Unnamed: 0,head_index,head_name,head_source,head_id,head_type,tail_index,tail_name,tail_source,tail_id,tail_type,display_relation,relation
2859040,1567,IL6,NCBI,3569,1,124245,extracellular space,GO,5615,7,2,cellcomp_protein
2863248,1567,IL6,NCBI,3569,1,56174,extracellular region,GO,5576,7,2,cellcomp_protein
2913339,1567,IL6,NCBI,3569,1,56117,endoplasmic reticulum lumen,GO,5788,7,2,cellcomp_protein
2929976,1567,IL6,NCBI,3569,1,124639,interleukin-6 receptor complex,GO,5896,7,2,cellcomp_protein


In [21]:
k_nodes = 10
sim_sorted = cellcomp_protein_triplets.tail_index.unique()[torch.argsort(sim_scores, descending=True)[:k_nodes]]
df_ = pd.DataFrame({
    "node_index": sim_sorted
})
df_.merge(biobridge_data.get_primekg().get_nodes(), on="node_index")

Unnamed: 0,node_index,node_name,node_source,node_id,node_type
0,124245,extracellular space,GO,5615,cellular_component
1,56174,extracellular region,GO,5576,cellular_component
2,126196,extracellular exosome,GO,70062,cellular_component
3,124479,cell surface,GO,9986,cellular_component
4,56117,endoplasmic reticulum lumen,GO,5788,cellular_component
5,55652,lysosomal lumen,GO,43202,cellular_component
6,56263,plasma membrane,GO,5886,cellular_component
7,56099,collagen-containing extracellular matrix,GO,62023,cellular_component
8,55774,lysosome,GO,5764,cellular_component
9,56241,cytoplasm,GO,5737,cellular_component


In [22]:
cellcomp_protein_triplets[(cellcomp_protein_triplets.head_index == 1567) & (cellcomp_protein_triplets.tail_index.isin(df_.node_index))]

Unnamed: 0,head_index,head_name,head_source,head_id,head_type,tail_index,tail_name,tail_source,tail_id,tail_type,display_relation,relation
2859040,1567,IL6,NCBI,3569,1,124245,extracellular space,GO,5615,7,2,cellcomp_protein
2863248,1567,IL6,NCBI,3569,1,56174,extracellular region,GO,5576,7,2,cellcomp_protein
2913339,1567,IL6,NCBI,3569,1,56117,endoplasmic reticulum lumen,GO,5788,7,2,cellcomp_protein


### Gene/Protein --> Interactions With --> Biological Process

In [31]:
biop_protein_triplets = triplets[((triplets.head_type==1) & (triplets.tail_type==0))]
biop_protein_triplets.head()

Unnamed: 0,head_index,head_name,head_source,head_id,head_type,tail_index,tail_name,tail_source,tail_id,tail_type,display_relation,relation
2933235,7097,A1BG,NCBI,1,1,112487,neutrophil degranulation,GO,43312,0,2,bioprocess_protein
2933236,6931,SERPINA3,NCBI,12,1,112487,neutrophil degranulation,GO,43312,0,2,bioprocess_protein
2933237,1114,AOC1,NCBI,26,1,112487,neutrophil degranulation,GO,43312,0,2,bioprocess_protein
2933238,4852,ACAA1,NCBI,30,1,112487,neutrophil degranulation,GO,43312,0,2,bioprocess_protein
2933239,6561,ACLY,NCBI,47,1,112487,neutrophil degranulation,GO,43312,0,2,bioprocess_protein


In [32]:
x = torch.tensor(node_embeddings[1567]).unsqueeze(0)
x_transformed = transformation(model, x, 1, 0, 2)

In [33]:
biop_as_tail_embeddings = torch.tensor([node_embeddings[i] for i in biop_protein_triplets.tail_index.unique()])
biop_as_tail_embeddings = project(model, biop_as_tail_embeddings, 0)
biop_as_tail_embeddings

tensor([[-0.0241, -0.0629, -0.0151,  ...,  0.0211, -0.0025, -0.0417],
        [ 0.0296, -0.0351, -0.0138,  ...,  0.0004,  0.0190, -0.0224],
        [-0.0776,  0.0672, -0.0461,  ...,  0.0166,  0.0414,  0.0134],
        ...,
        [-0.0183, -0.0412, -0.0489,  ..., -0.0369, -0.0756, -0.0308],
        [ 0.1868,  0.0523, -0.1393,  ..., -0.1839, -0.0607, -0.0005],
        [-0.0438,  0.0324, -0.0458,  ..., -0.0995,  0.0609,  0.0207]],
       device='cuda:0')

In [34]:
sim_scores = torch.nn.CosineSimilarity(dim=-1)(x_transformed.cpu(), biop_as_tail_embeddings.cpu())
sim_scores


tensor([ 0.4158,  0.3256,  0.2141,  ...,  0.0330, -0.2843,  0.0376])

In [35]:
biop_protein_triplets[(biop_protein_triplets.head_index == 1567)]

Unnamed: 0,head_index,head_name,head_source,head_id,head_type,tail_index,tail_name,tail_source,tail_id,tail_type,display_relation,relation
2934324,1567,IL6,NCBI,3569,1,53288,acute-phase response,GO,6953,0,2,bioprocess_protein
2934454,1567,IL6,NCBI,3569,1,39999,inflammatory response,GO,6954,0,2,bioprocess_protein
2937497,1567,IL6,NCBI,3569,1,101981,cellular response to lipopolysaccharide,GO,71222,0,2,bioprocess_protein
2942054,1567,IL6,NCBI,3569,1,40924,response to glucocorticoid,GO,51384,0,2,bioprocess_protein
2943098,1567,IL6,NCBI,3569,1,112055,cellular response to hydrogen peroxide,GO,70301,0,2,bioprocess_protein
...,...,...,...,...,...,...,...,...,...,...,...,...
3054960,1567,IL6,NCBI,3569,1,100267,glucagon secretion,GO,70091,0,2,bioprocess_protein
3054961,1567,IL6,NCBI,3569,1,44690,regulation of vascular endothelial growth fact...,GO,10574,0,2,bioprocess_protein
3054964,1567,IL6,NCBI,3569,1,112582,T-helper 17 cell lineage commitment,GO,72540,0,2,bioprocess_protein
3054971,1567,IL6,NCBI,3569,1,106339,negative regulation of primary miRNA processing,GO,2000635,0,2,bioprocess_protein


In [36]:
k_nodes = 10
sim_sorted = biop_protein_triplets.tail_index.unique()[torch.argsort(sim_scores, descending=False)[:k_nodes]]
df_ = pd.DataFrame({
    "node_index": sim_sorted
})
df_.merge(biobridge_data.get_primekg().get_nodes(), on="node_index")

Unnamed: 0,node_index,node_name,node_source,node_id,node_type
0,101027,UDP-N-acetylglucosamine catabolic process,GO,6049,biological_process
1,41340,UDP-N-acetylglucosamine metabolic process,GO,6047,biological_process
2,104011,UDP-N-acetylglucosamine transmembrane transport,GO,1990569,biological_process
3,48968,nicotinamide nucleotide metabolic process,GO,46496,biological_process
4,53476,allantoin catabolic process,GO,256,biological_process
5,44151,tRNA threonylcarbamoyladenosine metabolic process,GO,70525,biological_process
6,41134,chondroitin sulfate metabolic process,GO,30204,biological_process
7,113839,ventral trunk neural crest cell migration,GO,36486,biological_process
8,103386,seminal vesicle epithelium development,GO,61108,biological_process
9,40786,aromatic amino acid family metabolic process,GO,9072,biological_process


In [37]:
biop_protein_triplets[(biop_protein_triplets.head_index == 1567) & (biop_protein_triplets.tail_index.isin(df_.node_index))]

Unnamed: 0,head_index,head_name,head_source,head_id,head_type,tail_index,tail_name,tail_source,tail_id,tail_type,display_relation,relation


### Gene/Protein --> Interactions With --> Molecular Function

In [38]:
molfunc_protein_triplets = triplets[((triplets.head_type==1) & (triplets.tail_type==5))]
molfunc_protein_triplets.head()

Unnamed: 0,head_index,head_name,head_source,head_id,head_type,tail_index,tail_name,tail_source,tail_id,tail_type,display_relation,relation
2791722,1048,A2M,NCBI,2,1,54035,interleukin-1 binding,GO,19966,5,2,molfunc_protein
2791723,227,IL1R1,NCBI,3554,1,54035,interleukin-1 binding,GO,19966,5,2,molfunc_protein
2791724,1654,IL1R2,NCBI,7850,1,54035,interleukin-1 binding,GO,19966,5,2,molfunc_protein
2791725,4761,HAX1,NCBI,10456,1,54035,interleukin-1 binding,GO,19966,5,2,molfunc_protein
2791726,10311,TRIM16,NCBI,10626,1,54035,interleukin-1 binding,GO,19966,5,2,molfunc_protein


In [39]:
x = torch.tensor(node_embeddings[227]).unsqueeze(0)
x_transformed = transformation(model, x, 1, 5, 2)

In [40]:
molfunc_as_tail_embeddings = torch.tensor([node_embeddings[i] for i in molfunc_protein_triplets.tail_index.unique()])
molfunc_as_tail_embeddings = project(model, molfunc_as_tail_embeddings, 5)
molfunc_as_tail_embeddings

tensor([[-0.0707,  0.1322,  0.1445,  ...,  0.1249,  0.0293, -0.1103],
        [-0.1251,  0.1678, -0.0270,  ..., -0.0909,  0.1080, -0.0051],
        [-0.0665, -0.0465, -0.0862,  ..., -0.0213,  0.0450,  0.0295],
        ...,
        [ 0.0389,  0.1112,  0.0184,  ..., -0.0653,  0.0009, -0.1544],
        [-0.0268, -0.2351,  0.0653,  ..., -0.0287,  0.0123, -0.0026],
        [ 0.0962, -0.0665,  0.1969,  ...,  0.0391, -0.1009, -0.1000]],
       device='cuda:0')

In [41]:
sim_scores = torch.nn.CosineSimilarity(dim=-1)(x_transformed.cpu(), molfunc_as_tail_embeddings.cpu())
sim_scores


tensor([ 0.3567,  0.0533,  0.3069,  ..., -0.2987,  0.0435, -0.0659])

In [42]:
molfunc_protein_triplets[(molfunc_protein_triplets.head_index == 227)]

Unnamed: 0,head_index,head_name,head_source,head_id,head_type,tail_index,tail_name,tail_source,tail_id,tail_type,display_relation,relation
2791723,227,IL1R1,NCBI,3554,1,54035,interleukin-1 binding,GO,19966,5,2,molfunc_protein
2792167,227,IL1R1,NCBI,3554,1,54671,protease binding,GO,2020,5,2,molfunc_protein
2794566,227,IL1R1,NCBI,3554,1,53699,protein binding,GO,5515,5,2,molfunc_protein
2830120,227,IL1R1,NCBI,3554,1,53920,transmembrane signaling receptor activity,GO,4888,5,2,molfunc_protein
2842435,227,IL1R1,NCBI,3554,1,124184,"NAD+ nucleotidase, cyclic ADP-ribose generating",GO,61809,5,2,molfunc_protein
2842451,227,IL1R1,NCBI,3554,1,122627,NAD(P)+ nucleosidase activity,GO,50135,5,2,molfunc_protein
2848778,227,IL1R1,NCBI,3554,1,117422,platelet-derived growth factor receptor binding,GO,5161,5,2,molfunc_protein
2851419,227,IL1R1,NCBI,3554,1,54343,interleukin-1 receptor activity,GO,4908,5,2,molfunc_protein
2851426,227,IL1R1,NCBI,3554,1,120880,"interleukin-1, type I, activating receptor act...",GO,4909,5,2,molfunc_protein


In [43]:
k_nodes = 10
sim_sorted = molfunc_protein_triplets.tail_index.unique()[torch.argsort(sim_scores, descending=True)[:k_nodes]]
df_ = pd.DataFrame({
    "node_index": sim_sorted
})
df_ = df_.merge(biobridge_data.get_primekg().get_nodes(), on="node_index")
df_

Unnamed: 0,node_index,node_name,node_source,node_id,node_type
0,124184,"NAD+ nucleotidase, cyclic ADP-ribose generating",GO,61809,molecular_function
1,54387,signaling receptor activity,GO,38023,molecular_function
2,55436,identical protein binding,GO,42802,molecular_function
3,54621,cytokine receptor activity,GO,4896,molecular_function
4,53920,transmembrane signaling receptor activity,GO,4888,molecular_function
5,55100,interleukin-17 receptor activity,GO,30368,molecular_function
6,122627,NAD(P)+ nucleosidase activity,GO,50135,molecular_function
7,124125,protein homodimerization activity,GO,42803,molecular_function
8,53699,protein binding,GO,5515,molecular_function
9,55479,NAD+ nucleosidase activity,GO,3953,molecular_function


In [44]:
molfunc_protein_triplets[(molfunc_protein_triplets.head_index == 227) & (molfunc_protein_triplets.tail_index.isin(df_.node_index))]

Unnamed: 0,head_index,head_name,head_source,head_id,head_type,tail_index,tail_name,tail_source,tail_id,tail_type,display_relation,relation
2794566,227,IL1R1,NCBI,3554,1,53699,protein binding,GO,5515,5,2,molfunc_protein
2830120,227,IL1R1,NCBI,3554,1,53920,transmembrane signaling receptor activity,GO,4888,5,2,molfunc_protein
2842435,227,IL1R1,NCBI,3554,1,124184,"NAD+ nucleotidase, cyclic ADP-ribose generating",GO,61809,5,2,molfunc_protein
2842451,227,IL1R1,NCBI,3554,1,122627,NAD(P)+ nucleosidase activity,GO,50135,5,2,molfunc_protein


### Protein --> Associated With --> Disease

In [45]:
rel_id_to_str_dict = dict((v,k) for k,v in biobridge_data.get_data_config()['relation_type'].items())
rel_id_to_str_dict

{0: 'expression present',
 1: 'synergistic interaction',
 2: 'interacts with',
 3: 'ppi',
 4: 'phenotype present',
 5: 'parent-child',
 6: 'associated with',
 7: 'side effect',
 8: 'contraindication',
 9: 'expression absent',
 10: 'target',
 11: 'indication',
 12: 'enzyme',
 13: 'transporter',
 14: 'off-label use',
 15: 'linked to',
 16: 'phenotype absent',
 17: 'carrier'}

In [46]:
protein_disease_triplets = triplets[((triplets.head_type==1) & (triplets.tail_type==2) & (triplets.display_relation==6))]
protein_disease_triplets.head()

Unnamed: 0,head_index,head_name,head_source,head_id,head_type,tail_index,tail_name,tail_source,tail_id,tail_type,display_relation,relation
2613785,7097,A1BG,NCBI,1,1,28313,schizophrenia,MONDO_grouped,5090_13498_8414_10897_33312_10943_11552_14092_...,2,6,disease_protein
2613786,2174,ABCA1,NCBI,19,1,28313,schizophrenia,MONDO_grouped,5090_13498_8414_10897_33312_10943_11552_14092_...,2,6,disease_protein
2613787,8038,ACHE,NCBI,43,1,28313,schizophrenia,MONDO_grouped,5090_13498_8414_10897_33312_10943_11552_14092_...,2,6,disease_protein
2613788,5925,ACP1,NCBI,52,1,28313,schizophrenia,MONDO_grouped,5090_13498_8414_10897_33312_10943_11552_14092_...,2,6,disease_protein
2613789,238,ACTB,NCBI,60,1,28313,schizophrenia,MONDO_grouped,5090_13498_8414_10897_33312_10943_11552_14092_...,2,6,disease_protein


In [47]:
protein_disease_triplets[protein_disease_triplets.head_name == 'IL6']

Unnamed: 0,head_index,head_name,head_source,head_id,head_type,tail_index,tail_name,tail_source,tail_id,tail_type,display_relation,relation
2614065,1567,IL6,NCBI,3569,1,28313,schizophrenia,MONDO_grouped,5090_13498_8414_10897_33312_10943_11552_14092_...,2,6,disease_protein
2616054,1567,IL6,NCBI,3569,1,27933,anxiety disorder,MONDO_grouped,8187_2050_5618_5451_5371_100081_5383_1942_1098...,2,6,disease_protein
2616055,1567,IL6,NCBI,3569,1,83840,unipolar depression,MONDO,5263,2,6,disease_protein
2616056,1567,IL6,NCBI,3569,1,37703,neurotic disorder,MONDO,5379,2,6,disease_protein
2616621,1567,IL6,NCBI,3569,1,83760,dysthymic disorder,MONDO,1442,2,6,disease_protein
...,...,...,...,...,...,...,...,...,...,...,...,...
2686214,1567,IL6,NCBI,3569,1,27848,arteriovenous malformations of the brain,MONDO,7154,2,6,disease_protein
2686216,1567,IL6,NCBI,3569,1,32631,Castleman disease,MONDO,15564,2,6,disease_protein
2686217,1567,IL6,NCBI,3569,1,33450,Kimura disease,MONDO,18830,2,6,disease_protein
2686218,1567,IL6,NCBI,3569,1,32589,Kaposi's sarcoma (disease),MONDO,5055,2,6,disease_protein


In [48]:
x = torch.tensor(node_embeddings[1567]).unsqueeze(0)
x_transformed = transformation(model, x, 1, 2, 6)

In [49]:
disease_as_tail_embeddings = torch.tensor([node_embeddings[i] for i in protein_disease_triplets.tail_index.unique()])
disease_as_tail_embeddings = project(model, disease_as_tail_embeddings, 2)
disease_as_tail_embeddings

  disease_as_tail_embeddings = torch.tensor([node_embeddings[i] for i in protein_disease_triplets.tail_index.unique()])


tensor([[ 0.0253,  0.0084, -0.0078,  ...,  0.0232,  0.0047, -0.0244],
        [ 0.0379,  0.0670, -0.0542,  ...,  0.0828,  0.0433, -0.0453],
        [ 0.0105,  0.0471, -0.0431,  ...,  0.0377,  0.0832, -0.0056],
        ...,
        [ 0.0429,  0.2275, -0.0458,  ...,  0.0957,  0.1318, -0.0151],
        [ 0.0318,  0.0114, -0.0764,  ...,  0.0408,  0.0470, -0.0107],
        [ 0.1283, -0.1426,  0.0293,  ...,  0.1421, -0.1253,  0.0153]],
       device='cuda:0')

In [50]:
sim_scores = torch.nn.CosineSimilarity(dim=-1)(x_transformed.cpu(), disease_as_tail_embeddings.cpu())
sim_scores


tensor([0.6842, 0.4190, 0.1399,  ..., 0.2393, 0.1273, 0.0979])

In [53]:
protein_disease_triplets[(protein_disease_triplets.head_index == 1567)]

Unnamed: 0,head_index,head_name,head_source,head_id,head_type,tail_index,tail_name,tail_source,tail_id,tail_type,display_relation,relation
2614065,1567,IL6,NCBI,3569,1,28313,schizophrenia,MONDO_grouped,5090_13498_8414_10897_33312_10943_11552_14092_...,2,6,disease_protein
2616054,1567,IL6,NCBI,3569,1,27933,anxiety disorder,MONDO_grouped,8187_2050_5618_5451_5371_100081_5383_1942_1098...,2,6,disease_protein
2616055,1567,IL6,NCBI,3569,1,83840,unipolar depression,MONDO,5263,2,6,disease_protein
2616056,1567,IL6,NCBI,3569,1,37703,neurotic disorder,MONDO,5379,2,6,disease_protein
2616621,1567,IL6,NCBI,3569,1,83760,dysthymic disorder,MONDO,1442,2,6,disease_protein
...,...,...,...,...,...,...,...,...,...,...,...,...
2686214,1567,IL6,NCBI,3569,1,27848,arteriovenous malformations of the brain,MONDO,7154,2,6,disease_protein
2686216,1567,IL6,NCBI,3569,1,32631,Castleman disease,MONDO,15564,2,6,disease_protein
2686217,1567,IL6,NCBI,3569,1,33450,Kimura disease,MONDO,18830,2,6,disease_protein
2686218,1567,IL6,NCBI,3569,1,32589,Kaposi's sarcoma (disease),MONDO,5055,2,6,disease_protein


In [51]:
k_nodes = 10
sim_sorted = protein_disease_triplets.tail_index.unique()[torch.argsort(sim_scores, descending=True)[:k_nodes]]
df_ = pd.DataFrame({
    "node_index": sim_sorted
})
df_ = df_.merge(biobridge_data.get_primekg().get_nodes(), on="node_index")
df_

Unnamed: 0,node_index,node_name,node_source,node_id,node_type
0,32617,hereditary breast ovarian cancer syndrome,MONDO,3582,disease
1,29646,hepatocellular carcinoma,MONDO_grouped,7256_18902_16216_3243,disease
2,35963,prostate carcinoma,MONDO,5159,disease
3,36691,breast cancer,MONDO,7254,disease
4,28313,schizophrenia,MONDO_grouped,5090_13498_8414_10897_33312_10943_11552_14092_...,disease
5,28347,hereditary breast carcinoma,MONDO,16419,disease
6,36634,liver cancer,MONDO,2691,disease
7,30826,familial prostate carcinoma,MONDO,23122,disease
8,31116,lung cancer,MONDO,8903,disease
9,36999,adenocarcinoma of liver and intrahepatic bilia...,MONDO,18532,disease


In [52]:
protein_disease_triplets[(protein_disease_triplets.head_index == 1567) & (protein_disease_triplets.tail_index.isin(df_.node_index))]

Unnamed: 0,head_index,head_name,head_source,head_id,head_type,tail_index,tail_name,tail_source,tail_id,tail_type,display_relation,relation
2614065,1567,IL6,NCBI,3569,1,28313,schizophrenia,MONDO_grouped,5090_13498_8414_10897_33312_10943_11552_14092_...,2,6,disease_protein
2617303,1567,IL6,NCBI,3569,1,31116,lung cancer,MONDO,8903,2,6,disease_protein
2618686,1567,IL6,NCBI,3569,1,29646,hepatocellular carcinoma,MONDO_grouped,7256_18902_16216_3243,2,6,disease_protein
2618687,1567,IL6,NCBI,3569,1,36634,liver cancer,MONDO,2691,2,6,disease_protein
2618690,1567,IL6,NCBI,3569,1,36999,adenocarcinoma of liver and intrahepatic bilia...,MONDO,18532,2,6,disease_protein
2622616,1567,IL6,NCBI,3569,1,30826,familial prostate carcinoma,MONDO,23122,2,6,disease_protein
2622617,1567,IL6,NCBI,3569,1,35963,prostate carcinoma,MONDO,5159,2,6,disease_protein
2625489,1567,IL6,NCBI,3569,1,36691,breast cancer,MONDO,7254,2,6,disease_protein
2625490,1567,IL6,NCBI,3569,1,28347,hereditary breast carcinoma,MONDO,16419,2,6,disease_protein
2625492,1567,IL6,NCBI,3569,1,32617,hereditary breast ovarian cancer syndrome,MONDO,3582,2,6,disease_protein


### Drug --> target --> Protein

In [54]:
drug_protein_triplets = triplets[((triplets.head_type==6) & (triplets.tail_type==1) & (triplets.display_relation==10))]
drug_protein_triplets.head()

Unnamed: 0,head_index,head_name,head_source,head_id,head_type,tail_index,tail_name,tail_source,tail_id,tail_type,display_relation,relation
320478,15906,Pyridoxal phosphate,DrugBank,DB00114,6,13174,HDC,NCBI,3067,1,10,drug_protein
320479,15907,Histidine,DrugBank,DB00117,6,13174,HDC,NCBI,3067,1,10,drug_protein
320480,15908,Glutamic acid,DrugBank,DB00142,6,13982,GLS2,NCBI,27165,1,10,drug_protein
320481,15909,N-Acetyl-Serine,DrugBank,DB02340,6,476,F13A1,NCBI,2162,1,10,drug_protein
320482,15912,Arginine,DrugBank,DB00125,6,373,NOS2,NCBI,4843,1,10,drug_protein


In [55]:
drug_protein_triplets[drug_protein_triplets.head_name == 'VX-702']

Unnamed: 0,head_index,head_name,head_source,head_id,head_type,tail_index,tail_name,tail_source,tail_id,tail_type,display_relation,relation
325600,17590,VX-702,DrugBank,DB05470,6,2329,TNF,NCBI,7124,1,10,drug_protein
327092,17590,VX-702,DrugBank,DB05470,6,1567,IL6,NCBI,3569,1,10,drug_protein
327601,17590,VX-702,DrugBank,DB05470,6,197,MAPK14,NCBI,1432,1,10,drug_protein
327756,17590,VX-702,DrugBank,DB05470,6,1004,IL1B,NCBI,3553,1,10,drug_protein


In [56]:
x = torch.tensor(node_embeddings[17590]).unsqueeze(0)
x_transformed = transformation(model, x, 6, 1, 10)

In [57]:
protein_as_tail_embeddings = torch.tensor([node_embeddings[i] for i in drug_protein_triplets.tail_index.unique()])
protein_as_tail_embeddings = project(model, protein_as_tail_embeddings, 1)
protein_as_tail_embeddings

tensor([[-0.0190,  0.0135, -0.0154,  ...,  0.0074,  0.0060, -0.0069],
        [ 0.0280,  0.0027, -0.0253,  ...,  0.0250, -0.0447,  0.0160],
        [ 0.0239,  0.0142, -0.0741,  ...,  0.0064,  0.0252, -0.0375],
        ...,
        [ 0.0937, -0.0517, -0.0641,  ..., -0.0124,  0.0872, -0.0445],
        [-0.0223, -0.0520, -0.0490,  ..., -0.0019,  0.0135,  0.0072],
        [-0.0574,  0.0115, -0.0159,  ..., -0.0164, -0.0439,  0.0058]],
       device='cuda:0')

In [58]:
sim_scores = torch.nn.CosineSimilarity(dim=-1)(x_transformed.cpu(), protein_as_tail_embeddings.cpu())
sim_scores


tensor([0.0406, 0.1026, 0.1759,  ..., 0.1038, 0.2429, 0.1229])

In [59]:
k_nodes = 10
sim_sorted = drug_protein_triplets.tail_index.unique()[torch.argsort(sim_scores, descending=True)[:k_nodes]]
df_ = pd.DataFrame({
    "node_index": sim_sorted
})
df_ = df_.merge(biobridge_data.get_primekg().get_nodes(), on="node_index")
df_

Unnamed: 0,node_index,node_name,node_source,node_id,node_type
0,1004,IL1B,NCBI,3553,gene/protein
1,1567,IL6,NCBI,3569,gene/protein
2,2329,TNF,NCBI,7124,gene/protein
3,373,NOS2,NCBI,4843,gene/protein
4,3495,IFNG,NCBI,3458,gene/protein
5,4425,PTGS2,NCBI,5743,gene/protein
6,4959,SOD2,NCBI,6648,gene/protein
7,9031,PTGS1,NCBI,5742,gene/protein
8,2617,IL2,NCBI,3558,gene/protein
9,4497,NOS3,NCBI,4846,gene/protein


In [60]:
drug_protein_triplets[(drug_protein_triplets.head_index == 17590) & (drug_protein_triplets.tail_index.isin(df_.node_index))]

Unnamed: 0,head_index,head_name,head_source,head_id,head_type,tail_index,tail_name,tail_source,tail_id,tail_type,display_relation,relation
325600,17590,VX-702,DrugBank,DB05470,6,2329,TNF,NCBI,7124,1,10,drug_protein
327092,17590,VX-702,DrugBank,DB05470,6,1567,IL6,NCBI,3569,1,10,drug_protein
327756,17590,VX-702,DrugBank,DB05470,6,1004,IL1B,NCBI,3553,1,10,drug_protein


In [61]:
triplets[(triplets.tail_index == 17590)]

Unnamed: 0,head_index,head_name,head_source,head_id,head_type,tail_index,tail_name,tail_source,tail_id,tail_type,display_relation,relation
3395156,2329,TNF,NCBI,7124,1,17590,VX-702,DrugBank,DB05470,6,10,drug_protein
3396648,1567,IL6,NCBI,3569,1,17590,VX-702,DrugBank,DB05470,6,10,drug_protein
3397157,197,MAPK14,NCBI,1432,1,17590,VX-702,DrugBank,DB05470,6,10,drug_protein
3397312,1004,IL1B,NCBI,3553,1,17590,VX-702,DrugBank,DB05470,6,10,drug_protein


### Drug --> indication --> Disease

In [62]:
disease_drug_triplets = triplets[((triplets.head_type==6) & (triplets.tail_type==2) & (triplets.relation == 'indication'))]
disease_drug_triplets.head()

Unnamed: 0,head_index,head_name,head_source,head_id,head_type,tail_index,tail_name,tail_source,tail_id,tail_type,display_relation,relation
338350,16687,Fosinopril,DrugBank,DB00492,6,33577,hypertensive disorder,MONDO,5044,2,11,indication
338351,16687,Fosinopril,DrugBank,DB00492,6,36035,hypertension,MONDO_grouped,1200_1134_15512_5080_100078,2,11,indication
338382,20297,Imidapril,DrugBank,DB11783,6,33577,hypertensive disorder,MONDO,5044,2,11,indication
338383,20297,Imidapril,DrugBank,DB11783,6,36035,hypertension,MONDO_grouped,1200_1134_15512_5080_100078,2,11,indication
338386,16693,Cilazapril,DrugBank,DB01340,6,33577,hypertensive disorder,MONDO,5044,2,11,indication


In [63]:
x = torch.tensor(node_embeddings[15876]).unsqueeze(0)
x_transformed = transformation(model, x, 6, 2, 11)

In [64]:
disease_as_tail_embeddings = torch.tensor([node_embeddings[i] for i in disease_drug_triplets.tail_index.unique()])
disease_as_tail_embeddings = project(model, disease_as_tail_embeddings, 2)
disease_as_tail_embeddings

tensor([[ 0.0409,  0.0292, -0.0421,  ...,  0.0431,  0.0146, -0.0288],
        [ 0.0611,  0.0414, -0.0743,  ...,  0.0438,  0.0159,  0.0148],
        [-0.0644, -0.0216,  0.0536,  ...,  0.0428,  0.0860, -0.0244],
        ...,
        [ 0.0692,  0.0380, -0.0210,  ..., -0.0021,  0.0120, -0.0779],
        [-0.1167, -0.1835, -0.0063,  ..., -0.1303,  0.0858,  0.0656],
        [ 0.1024, -0.0349,  0.0101,  ..., -0.1269, -0.1475, -0.0919]],
       device='cuda:0')

In [65]:
sim_scores = torch.nn.CosineSimilarity(dim=-1)(x_transformed.cpu(), disease_as_tail_embeddings.cpu())
sim_scores


tensor([ 0.0512,  0.0664, -0.0638,  ...,  0.1381,  0.0153,  0.1727])

In [66]:
disease_drug_triplets[(disease_drug_triplets.head_index == 15876)]

Unnamed: 0,head_index,head_name,head_source,head_id,head_type,tail_index,tail_name,tail_source,tail_id,tail_type,display_relation,relation
364546,15876,Mesalazine,DrugBank,DB00244,6,37785,ulcerative colitis (disease),MONDO,5101,2,11,indication
364547,15876,Mesalazine,DrugBank,DB00244,6,28158,inflammatory bowel disease,MONDO_grouped,9960_12845_33643_11471_12831_12875_12941_13153...,2,11,indication
372669,15876,Mesalazine,DrugBank,DB00244,6,83959,ulcerative proctosigmoiditis,MONDO,7005,2,11,indication


In [67]:
k_nodes = 10
sim_sorted = disease_drug_triplets.tail_index.unique()[torch.argsort(sim_scores, descending=True)[:k_nodes]]
df_ = pd.DataFrame({
    "node_index": sim_sorted
})
df_ = df_.merge(biobridge_data.get_primekg().get_nodes(), on="node_index")
df_

Unnamed: 0,node_index,node_name,node_source,node_id,node_type
0,36209,arthropathy,MONDO,6816,disease
1,29273,osteoarthritis susceptibility,MONDO_grouped,8143_7704_11923_12893_12894_12568,disease
2,29078,rheumatoid arthritis,MONDO,8383,disease
3,33609,osteoarthritis,MONDO_grouped,5178_6629_5416_6630_6631_6632,disease
4,28158,inflammatory bowel disease,MONDO_grouped,9960_12845_33643_11471_12831_12875_12941_13153...,disease
5,36265,spondyloarthropathy,MONDO_grouped,5095_43377,disease
6,37785,ulcerative colitis (disease),MONDO,5101,disease
7,83770,Crohn's colitis,MONDO,5532,disease
8,32325,juvenile idiopathic arthritis,MONDO_grouped,11429_19433,disease
9,37784,Crohn disease,MONDO_grouped,5011_5535,disease


In [68]:
disease_drug_triplets[(disease_drug_triplets.head_index == 15876) & (disease_drug_triplets.tail_index.isin(df_.node_index))]

Unnamed: 0,head_index,head_name,head_source,head_id,head_type,tail_index,tail_name,tail_source,tail_id,tail_type,display_relation,relation
364546,15876,Mesalazine,DrugBank,DB00244,6,37785,ulcerative colitis (disease),MONDO,5101,2,11,indication
364547,15876,Mesalazine,DrugBank,DB00244,6,28158,inflammatory bowel disease,MONDO_grouped,9960_12845_33643_11471_12831_12875_12941_13153...,2,11,indication


### Drug --> contraindication --> Disease

In [69]:
disease_drug_triplets = triplets[((triplets.head_type==6) & (triplets.tail_type==2) & (triplets.relation == 'contraindication'))]
disease_drug_triplets.head()

Unnamed: 0,head_index,head_name,head_source,head_id,head_type,tail_index,tail_name,tail_source,tail_id,tail_type,display_relation,relation
338348,15193,Rotigotine,DrugBank,DB05271,6,33577,hypertensive disorder,MONDO,5044,2,8,contraindication
338349,15193,Rotigotine,DrugBank,DB05271,6,36035,hypertension,MONDO_grouped,1200_1134_15512_5080_100078,2,8,contraindication
338352,14483,Estradiol valerate,DrugBank,DB13956,6,33577,hypertensive disorder,MONDO,5044,2,8,contraindication
338353,14483,Estradiol valerate,DrugBank,DB13956,6,36035,hypertension,MONDO_grouped,1200_1134_15512_5080_100078,2,8,contraindication
338354,16476,Phenazopyridine,DrugBank,DB01438,6,33577,hypertensive disorder,MONDO,5044,2,8,contraindication


In [70]:
x = torch.tensor(node_embeddings[20377]).unsqueeze(0)
x_transformed = transformation(model, x, 6, 2, 11)

In [71]:
disease_as_tail_embeddings = torch.tensor([node_embeddings[i] for i in disease_drug_triplets.tail_index.unique()])
disease_as_tail_embeddings = project(model, disease_as_tail_embeddings, 2)
disease_as_tail_embeddings

tensor([[ 0.0409,  0.0292, -0.0421,  ...,  0.0431,  0.0146, -0.0288],
        [ 0.0611,  0.0414, -0.0743,  ...,  0.0438,  0.0159,  0.0148],
        [ 0.0391,  0.0119, -0.0164,  ...,  0.0404, -0.0121, -0.0239],
        ...,
        [-0.0599, -0.0262, -0.0585,  ..., -0.0386, -0.1117, -0.0371],
        [ 0.0305,  0.1019, -0.0703,  ...,  0.1123,  0.0256,  0.0380],
        [ 0.0706, -0.0381,  0.0131,  ...,  0.0241,  0.0096,  0.0194]],
       device='cuda:0')

In [72]:
sim_scores = torch.nn.CosineSimilarity(dim=-1)(x_transformed.cpu(), disease_as_tail_embeddings.cpu())
sim_scores


tensor([ 0.1958,  0.1446,  0.0692,  ..., -0.1085,  0.1501,  0.1509])

In [73]:
disease_drug_triplets[(disease_drug_triplets.head_index == 20377)]

Unnamed: 0,head_index,head_name,head_source,head_id,head_type,tail_index,tail_name,tail_source,tail_id,tail_type,display_relation,relation
340948,20377,Benzoyl peroxide,DrugBank,DB09096,6,28756,photoparoxysmal response,MONDO_grouped,7559_12304_12305,2,8,contraindication
340949,20377,Benzoyl peroxide,DrugBank,DB09096,6,37762,photosensitivity disease,MONDO,6597,2,8,contraindication
340962,20377,Benzoyl peroxide,DrugBank,DB09096,6,39774,anogenital human papillomavirus infection,MONDO,5647,2,8,contraindication
340963,20377,Benzoyl peroxide,DrugBank,DB09096,6,83754,common wart,MONDO,1209,2,8,contraindication
342047,20377,Benzoyl peroxide,DrugBank,DB09096,6,35764,kidney disease,MONDO,5240,2,8,contraindication
342048,20377,Benzoyl peroxide,DrugBank,DB09096,6,83762,pyoureter,MONDO,1922,2,8,contraindication
346162,20377,Benzoyl peroxide,DrugBank,DB09096,6,38666,adrenocortical insufficiency,MONDO,4,2,8,contraindication
346163,20377,Benzoyl peroxide,DrugBank,DB09096,6,30813,Addison disease,MONDO,9410,2,8,contraindication
346644,20377,Benzoyl peroxide,DrugBank,DB09096,6,37784,Crohn disease,MONDO_grouped,5011_5535,2,8,contraindication
346645,20377,Benzoyl peroxide,DrugBank,DB09096,6,28158,inflammatory bowel disease,MONDO_grouped,9960_12845_33643_11471_12831_12875_12941_13153...,2,8,contraindication


In [74]:
k_nodes = 10
sim_sorted = disease_drug_triplets.tail_index.unique()[torch.argsort(sim_scores, descending=True)[:k_nodes]]
df_ = pd.DataFrame({
    "node_index": sim_sorted
})
df_ = df_.merge(biobridge_data.get_primekg().get_nodes(), on="node_index")
df_

Unnamed: 0,node_index,node_name,node_source,node_id,node_type
0,29709,"dermatitis, atopic",MONDO,11292,disease
1,36924,dermatitis,MONDO,2406,disease
2,37770,contact dermatitis,MONDO,5480,disease
3,36129,atopic eczema,MONDO,4980,disease
4,33651,seborrheic dermatitis,MONDO_grouped,6608_6609,disease
5,83777,occupational dermatitis,MONDO,6589,disease
6,83782,exfoliative dermatitis,MONDO,43233,disease
7,36644,skin disease,MONDO,5093,disease
8,33534,pediatric systemic lupus erythematosus,MONDO,19725,disease
9,31850,seborrheic keratosis,MONDO,8420,disease


In [75]:
disease_drug_triplets[(disease_drug_triplets.head_index == 20377) & (disease_drug_triplets.tail_index.isin(df_.node_index))]

Unnamed: 0,head_index,head_name,head_source,head_id,head_type,tail_index,tail_name,tail_source,tail_id,tail_type,display_relation,relation
348974,20377,Benzoyl peroxide,DrugBank,DB09096,6,83782,exfoliative dermatitis,MONDO,43233,2,8,contraindication
361053,20377,Benzoyl peroxide,DrugBank,DB09096,6,36924,dermatitis,MONDO,2406,2,8,contraindication
