In [8]:
# Import necessary libraries
import os
import sys
import torch
import networkx as nx
from tqdm import tqdm
import json
sys.path.append('../../..')
from aiagents4pharma.talk2knowledgegraphs.datasets.primekg import PrimeKG

In [2]:
# Create the PrimeKG dataset object, pointing it at the local data directory
primekg_data = PrimeKG(
    local_dir="../../../../data/primekg/",
)

In [3]:
# Load the PrimeKG nodes and edges through the dataset object
primekg_data.load_data()

# Keep the nodes and the edges in their own variables for the cells below
primekg_nodes, primekg_edges = primekg_data.get_nodes(), primekg_data.get_edges()

Loading nodes of PrimeKG dataset ...
../../../../data/primekg/primekg_nodes.tsv.gz already exists. Loading the data from the local directory.
Loading edges of PrimeKG dataset ...
../../../../data/primekg/primekg_edges.tsv.gz already exists. Loading the data from the local directory.


In [6]:
# Enrich each edge with a textual description.
# The sentence joins the head node (name and type), the relation together with its
# display name, and the tail node (name and type); no further node information is
# appended at this stage.
# Zipping only the needed columns builds the same strings as a row-wise
# DataFrame.apply(axis=1) without constructing a Series object for every edge.
text_enriched_edges = [
    f"{head_name} ({head_type}) has a direct relationship of {relation}:{display_relation} with {tail_name} ({tail_type})."
    for head_name, head_type, relation, display_relation, tail_name, tail_type in zip(
        primekg_edges["head_name"],
        primekg_edges["head_type"],
        primekg_edges["relation"],
        primekg_edges["display_relation"],
        primekg_edges["tail_name"],
        primekg_edges["tail_type"],
    )
]
# NOTE: this adds (or overwrites) the 'feat' column on primekg_edges in place;
# the batching cell further down reads the texts from this column.
primekg_edges['feat'] = text_enriched_edges
primekg_edges.head(5)

Unnamed: 0,head_index,head_name,head_source,head_id,head_type,tail_index,tail_name,tail_source,tail_id,tail_type,display_relation,relation,feat
0,0,PHYHIP,NCBI,9796,gene/protein,8889,KIF15,NCBI,56992,gene/protein,ppi,protein_protein,PHYHIP (gene/protein) has a direct relationshi...
1,1,GPANK1,NCBI,7918,gene/protein,2798,PNMA1,NCBI,9240,gene/protein,ppi,protein_protein,GPANK1 (gene/protein) has a direct relationshi...
2,2,ZRSR2,NCBI,8233,gene/protein,5646,TTC33,NCBI,23548,gene/protein,ppi,protein_protein,ZRSR2 (gene/protein) has a direct relationship...
3,3,NRF1,NCBI,4899,gene/protein,11592,MAN1B1,NCBI,11253,gene/protein,ppi,protein_protein,NRF1 (gene/protein) has a direct relationship ...
4,4,PI4KA,NCBI,5297,gene/protein,2122,RGS20,NCBI,8601,gene/protein,ppi,protein_protein,PI4KA (gene/protein) has a direct relationship...


In [11]:
# Settings for the OpenAI batch-embedding request files
batch_size = 50000  # maximum number of edge texts written to one batch file
output_dir = '../../../aiagents4pharma/talk2knowledgegraphs/tests/files/primekg/preprocessing/edges/'
os.makedirs(output_dir, exist_ok=True)

# Collect the text of every edge together with a matching identifier of the form
# "<head_index>_<tail_index>_<display_relation>". Both lists follow the row order of
# primekg_edges, so docs[k] and doc_ids[k] describe the same edge.
docs = primekg_edges.feat.to_list()
# Zipping the three columns yields the same ids as a row-wise apply(axis=1)
# without building a Series object per edge.
# NOTE(review): the ids are not checked for uniqueness here — confirm that no two
# edges share head_index, tail_index and display_relation, since the ids are used
# as the custom_id of each request.
doc_ids = [
    f"{head_index}_{tail_index}_{display_relation}"
    for head_index, tail_index, display_relation in zip(
        primekg_edges["head_index"],
        primekg_edges["tail_index"],
        primekg_edges["display_relation"],
    )
]

In [14]:
# Write the edge texts to numbered JSONL files, at most batch_size requests per file,
# and remember every file path that was written.
edges_feat_batch_filenames = []
for batch_no, start in enumerate(range(0, len(docs), batch_size), start=1):
    stop = start + batch_size
    batch_filename = os.path.join(output_dir, f'edges_feat_batch_{batch_no:03}.jsonl')
    edges_feat_batch_filenames.append(batch_filename)
    # One embedding request per line; custom_id carries the edge identifier
    with open(batch_filename, 'w', encoding='utf-8') as batch_file:
        for doc_id, doc_text in zip(doc_ids[start:stop], docs[start:stop]):
            request = {
                "custom_id": f"text_{doc_id}",
                "method": "POST",
                "url": "/v1/embeddings",
                "body": {
                    "model": "text-embedding-ada-002",
                    "input": doc_text
                }
            }
            batch_file.write(json.dumps(request) + '\n')