### PrimeKG Subgraph Construction (Multi-Modal)

In [1]:
# Import necessary libraries
import os
import numpy as np
import pandas as pd
import networkx as nx
import pickle
from tqdm import tqdm
import torch 
from torch_geometric.utils import from_networkx
import sys
sys.path.append('../../../../')
from aiagents4pharma.talk2knowledgegraphs.datasets.starkqa_primekg import StarkQAPrimeKG
from aiagents4pharma.talk2knowledgegraphs.datasets.biobridge_primekg import BioBridgePrimeKG
from aiagents4pharma.talk2knowledgegraphs.utils.embeddings.ollama import EmbeddingWithOllama
from aiagents4pharma.talk2knowledgegraphs.utils import kg_utils

# Raise the "httpx" logger threshold to WARNING so that its INFO messages
# are suppressed in the notebook output.
import logging

httpx_logger = logging.getLogger("httpx")
httpx_logger.setLevel(logging.WARNING)

  from .autonotebook import tqdm as notebook_tqdm
INFO:aiagents4pharma.talk2scholars.tools.pdf.question_and_answer:Loaded Question and Answer tool configuration.


In [2]:
# Local directories where the PrimeKG and BioBridge-PrimeKG data are stored
PRIMEKG_DIR = "../../../../../data/primekg/"
BIOBRIDGE_PRIMEKG_DIR = "../../../../../data/biobridge_primekg/"

# Create the BioBridge PrimeKG dataset object and load its data
biobridge_data = BioBridgePrimeKG(
    primekg_dir=PRIMEKG_DIR,
    local_dir=BIOBRIDGE_PRIMEKG_DIR,
)
biobridge_data.load_data()

# Fetch the node information dictionary and display its keys
biobridge_node_info = biobridge_data.get_node_info_dict()
biobridge_node_info.keys()

Loading PrimeKG dataset...
Loading nodes of PrimeKG dataset ...
../../../../../data/primekg/primekg_nodes.tsv.gz already exists. Loading the data from the local directory.
Loading edges of PrimeKG dataset ...
../../../../../data/primekg/primekg_edges.tsv.gz already exists. Loading the data from the local directory.
Loading data config file of BioBridgePrimeKG...
File data_config.json already exists in ../../../../../data/biobridge_primekg/.
Building node embeddings...
Building full triplets...
Building train-test split...
Building negative triplets...


dict_keys(['gene/protein', 'molecular_function', 'cellular_component', 'biological_process', 'drug', 'disease'])

In [3]:
# Columns used to decide whether two triplet rows / two node rows are duplicates
TRIPLET_KEYS = ["head_index", "tail_index", "display_relation"]
NODE_KEYS = ["node_index"]


def _drop_duplicate_rows(df: pd.DataFrame, subset: list) -> pd.DataFrame:
    """Return a copy of `df` without rows duplicated on `subset`.

    The first occurrence of each duplicate is kept and the index of the
    result is reset to 0..n-1.
    """
    return df.drop_duplicates(subset=subset).reset_index(drop=True)


# NOTE(review): the training triplets come from get_primekg_triplets_negative()
# rather than from get_train_test_split() -- confirm this is intended.
train_triplets_raw = biobridge_data.get_primekg_triplets_negative()

# Fetch the train/test split once so that "test", "node_train" and "node_test"
# are all read from the same returned object (it used to be requested three times).
biobridge_split = biobridge_data.get_train_test_split()
node_train_raw = biobridge_split["node_train"]
node_test_raw = biobridge_split["node_test"]

all_triplets_raw = biobridge_data.get_primekg_triplets()

# Drop duplicate triplets and nodes.
train_split = _drop_duplicate_rows(train_triplets_raw, TRIPLET_KEYS)
# The test triplets are deliberately left as returned: their de-duplication was
# disabled in the original cell. NOTE(review): confirm that this is still wanted.
test_split = biobridge_split["test"]
node_train_split = _drop_duplicate_rows(node_train_raw, NODE_KEYS)
node_test_split = _drop_duplicate_rows(node_test_raw, NODE_KEYS)
df_all = _drop_duplicate_rows(all_triplets_raw, TRIPLET_KEYS)
# All nodes = train nodes followed by test nodes, de-duplicated on node_index
# (a node present in both keeps its row from the train part).
df_node_all = _drop_duplicate_rows(
    pd.concat([node_train_raw, node_test_raw], axis=0),
    NODE_KEYS,
)

# Bundle every split under a single dictionary
split_data = {
    "train": train_split,
    "test": test_split,
    "node_train": node_train_split,
    "node_test": node_test_split,
    "all": df_all,
    "node_all": df_node_all,
}