In [1]:
# Import necessary libraries
import os
import networkx as nx
import openai 
import matplotlib.pyplot as plt
import sys
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import hydra
import numpy as np
# import pandas as pd
import cudf
import cupy as cp
import cuvs
import pickle
import torch

sys.path.append('../../..')
from aiagents4pharma.talk2knowledgegraphs.utils.extractions.cu_multimodal_pcst import MultimodalPCSTPruning
from aiagents4pharma.talk2knowledgegraphs.utils.embeddings.ollama import EmbeddingWithOllama
from aiagents4pharma.talk2knowledgegraphs.utils.embeddings.sentence_transformer import EmbeddingWithSentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# "XXX" is only a placeholder: never commit a real key to the notebook.
# A key that is already exported in the environment is kept as-is, so the
# placeholder no longer clobbers a working configuration; otherwise replace
# "XXX" with your actual API key (or export OPENAI_API_KEY) before running.
os.environ.setdefault("OPENAI_API_KEY", "XXX")

In [3]:
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [4]:
# Compose the Hydra configuration and keep only the section that belongs to
# the multimodal subgraph-extraction tool.
with hydra.initialize(
    version_base=None,
    config_path="../../../aiagents4pharma/talk2knowledgegraphs/configs",
):
    cfg = hydra.compose(
        config_name="config",
        overrides=["tools/multimodal_subgraph_extraction=default"],
    ).tools.multimodal_subgraph_extraction
cfg

{'_target_': 'talk2knowledgegraphs.tools.multimodal_subgraph_extraction', 'ollama_embeddings': ['nomic-embed-text'], 'temperature': 0.1, 'streaming': False, 'topk': 5, 'topk_e': 5, 'cost_e': 0.5, 'c_const': 0.01, 'root': -1, 'num_clusters': 1, 'pruning': 'gw', 'verbosity_level': 0, 'node_id_column': 'node_id', 'node_attr_column': 'node_attr', 'edge_src_column': 'edge_src', 'edge_attr_column': 'edge_attr', 'edge_dst_column': 'edge_dst', 'node_colors_dict': {'gene/protein': '#6a79f7', 'molecular_function': '#82cafc', 'cellular_component': '#3f9b0b', 'biological_process': '#c5c9c7', 'drug': '#c4a661', 'disease': '#80013f'}}

In [75]:
# Agent state shared by the cells below: models, user selections, uploaded
# files, top-k settings and the registered source/extracted graphs.
state = dict(
    llm_model=ChatOpenAI(model="gpt-4o-mini", temperature=0.0),
    embedding_model=OpenAIEmbeddings(model="text-embedding-3-small"),
    # e.g. ["IL6_(1567)", "IL21_(34967)", "TNF_(2329)"]
    selected_genes=[],
    # e.g. ["Remdesivir_(15267)", "Mesalazine_(15876)"]
    selected_drugs=[],
    uploaded_files=[
        dict(
            file_name="multimodal-analysis.csv",
            file_path='../../../aiagents4pharma/talk2knowledgegraphs/tests/files/multimodal-analysis.csv',
            file_type="multimodal",
            uploaded_by="VPEUser",
            uploaded_timestamp="2024-11-05 00:00:00",
        ),
    ],
    topk_nodes=5,
    topk_edges=5,
    dic_source_graph=[
        dict(
            name="PrimeKG",
            kg_pyg_path="../../../aiagents4pharma/talk2knowledgegraphs/tests/files/biobridge_multimodal_pyg_graph.pkl",
            kg_text_path="../../../aiagents4pharma/talk2knowledgegraphs/tests/files/biobridge_multimodal_text_graph.pkl",
        ),
    ],
    dic_extracted_graph=[],
)

# Define prompt: the free-text user question. A later cell embeds this string
# and appends it to the query dataframe as the 'user_prompt' row.
prompt = """
DrugA is a human monoclonal antibody that binds to both the soluble and transmembrane bioactive forms of human TNFa (UniProt Acc: P01375). 
This interaction prevents the binding of TNFa to its receptors, thereby inhibiting the biological activity of TNFa (a cytokine protein).


Please extract a subgraph and perform reasoning over it as evidence to explain these mechanisms of action of the given drug. Please set the extraction name for this process as `subkg_druga_tnfa`.
"""

In [76]:
# Use the most recently registered source graph from the state
initial_graph = {"source": state["dic_source_graph"][-1]}

# Load the PyG knowledge graph referenced by the source entry.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
# The textual graph (kg_text_path) is not loaded in this notebook.
with open(initial_graph["source"]["kg_pyg_path"], "rb") as pyg_file:
    initial_graph["pyg"] = pickle.load(pyg_file)

pyg_graph = initial_graph["pyg"]

In [77]:
# Embed the user prompt with the first Ollama embedding model named in the
# config; the result is wrapped in a one-element list so it can be used as a
# single-row dataframe column later on.
prompt_emb = [EmbeddingWithOllama(model_name=cfg.ollama_embeddings[0]).embed_query(prompt)]

INFO:httpx:HTTP Request: GET http://127.0.0.1:11434/api/tags "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


In [78]:
# Directory holding the BioBridge node/edge tables used by the tests
local_dir = '../../../aiagents4pharma/talk2knowledgegraphs/tests/files'

# Read both parquet tables with cuDF
nodes_df, edges_df = (
    cudf.read_parquet(os.path.join(local_dir, parquet_name))
    for parquet_name in ('biobridge_nodes.parquet.gzip',
                         'biobridge_edges.parquet.gzip')
)

# Working copies, so the loaded tables themselves stay untouched
graph_nodes, graph_edges = nodes_df.copy(), edges_df.copy()

In [79]:
# Initialize logger
import logging
# Configure the root logger so that INFO-level messages are emitted
logging.basicConfig(level=logging.INFO)
# Logger used by the following cells to trace the processing steps
logger = logging.getLogger(__name__)

In [80]:
# Build the query dataframe: one row per knowledge-graph node matched by the
# uploaded multimodal file, plus one final row holding the user prompt.

# Initialize dataframes
logger.info("Initializing dataframes")
multimodal_df = cudf.DataFrame({"name": [], "node_type": []})
query_df = cudf.DataFrame({"node_id": [],
                           "node_type": [],
                           "x": [],
                           "desc_x": [],
                           "use_description": []})
# Always (re)set the selections so that later cells can read
# state["selections"] even when no multimodal file is uploaded, and so that a
# re-run never keeps selections from a previous run.
state["selections"] = {}

# Loop over the uploaded files and find multimodal files
logger.info("Looping over uploaded files")
for uploaded_file in state["uploaded_files"]:
    # Check if multimodal file is uploaded
    if uploaded_file["file_type"] == "multimodal":
        # Read the csv file (the last multimodal file wins)
        multimodal_df = cudf.read_csv(uploaded_file["file_path"])

# Check if the multimodal_df is empty
logger.info("Checking if multimodal_df is empty")
if len(multimodal_df) > 0:
    # Prefix the query columns so they do not clash with the node columns
    logger.info("Preparing multimodal_df")
    multimodal_df = multimodal_df.rename(columns={"name": "q_node_name",
                                                  "node_type": "q_node_type"})

    # Pair every graph node with every queried name (cross join)
    logger.info("Processing query dataframe")
    query_df = graph_nodes[
        ['node_id', 'node_name', 'node_type', 'enriched_node', 'x', 'desc', 'desc_x']
    ].merge(multimodal_df, how='cross')
    logger.info("Lowering case for node names (q_node_name)")
    query_df['q_node_name'] = query_df['q_node_name'].str.lower()
    logger.info("Lowering case for node names (node_name)")
    query_df['node_name'] = query_df['node_name'].str.lower()
    # Keep the pairs where the node name contains the queried name and the
    # node types agree
    logger.info("Filtering based on the query")
    mask = (
        query_df['node_name'].str.contains(query_df['q_node_name']) &
        (query_df['node_type'] == query_df['q_node_type'])
    )
    query_df = query_df[mask]
    query_df = query_df[['node_id',
                         'node_type',
                         'enriched_node',
                         'x',
                         'desc',
                         'desc_x']].reset_index(drop=True)
    query_df['use_description'] = False # set to False for modal-specific embeddings

    # Update the state by adding the selected node IDs, grouped by node type
    logger.info("Updating state with selected node IDs")
    state["selections"] = query_df.to_pandas().groupby(
        "node_type"
    )["node_id"].apply(list).to_dict()

# Append a user prompt to the query dataframe
logger.info("Adding user prompt to query dataframe")
query_df = cudf.concat([
    query_df,
    cudf.DataFrame({
        'node_id': 'user_prompt',
        'node_type': 'prompt',
        # 'enriched_node': prompt,
        'x': prompt_emb,
        # 'desc': prompt,
        'desc_x': prompt_emb,
        'use_description': True # set to True for user prompt embedding
    })
]).reset_index(drop=True)

INFO:__main__:Initializing dataframes
INFO:__main__:Looping over uploaded files
INFO:__main__:Checking if multimodal_df is empty
INFO:__main__:Preparing multimodal_df
INFO:__main__:Processing query dataframe
INFO:__main__:Lowering case for node names (q_node_name)
INFO:__main__:Lowering case for node names (node_name)
INFO:__main__:Filtering based on the query
INFO:__main__:Updating state with selected node IDs
INFO:__main__:Adding user prompt to query dataframe


In [81]:
from aiagents4pharma.talk2knowledgegraphs.utils.extractions.cu_multimodal_pcst import MultimodalPCSTPruning

# Bundle the node and edge tables into the dictionary passed to extract_subgraph
graph = {"nodes": nodes_df, "edges": edges_df}

In [82]:
# Inspect the assembled query dataframe (matched nodes plus the prompt row)
query_df

Unnamed: 0,node_id,node_type,enriched_node,x,desc,desc_x,use_description
0,IL7R_(625),gene/protein,MTILGTTFGMVFSLLQVVSGESGYAQNGDLEDAELDDYSFSCYSQL...,"[0.06364653259515762, 0.06951971352100372, 0.0...",IL7R belongs to gene/protein node. IL7R is int...,"[0.04506902, 0.008911126, -0.17318207, -0.0157...",False
1,TCF7_(5195),gene/protein,MPQLDSGGGGAGGGDDLGAPDELLAFQDEGEEQDDKSRDSAAGPER...,"[0.028321774676442146, 0.003539376426488161, 0...",TCF7 belongs to gene/protein node. TCF7 is tra...,"[0.036997586, 0.038098544, -0.19027671, -0.006...",False
2,user_prompt,prompt,,"[0.045646045, 0.016633496, -0.14382866, -0.020...",,"[0.045646045, 0.016633496, -0.14382866, -0.020...",True


In [83]:
# Collect the node/edge indices extracted for every query row
subgraphs = {"nodes": [], "edges": []}

# One PCST extraction per query row (each matched node, then the user prompt)
for row_idx, query_row in query_df.to_pandas().iterrows():
    # Query embeddings as single-row float32 CuPy matrices
    desc_emb = cp.array(query_row['desc_x']).reshape(1, -1).astype(cp.float32)
    feat_emb = cp.array(query_row['x']).reshape(1, -1).astype(cp.float32)

    # Pruning parameters come from the state (top-k) and the Hydra config
    pruner = MultimodalPCSTPruning(
        topk=state["topk_nodes"],
        topk_e=state["topk_edges"],
        cost_e=cfg.cost_e,
        c_const=cfg.c_const,
        root=cfg.root,
        num_clusters=cfg.num_clusters,
        pruning=cfg.pruning,
        verbosity_level=cfg.verbosity_level,
        use_description=query_row['use_description'],
    )
    subgraph = pruner.extract_subgraph(graph,
                                       desc_emb,
                                       feat_emb,
                                       query_row['node_type'])

    # Keep the indices of this extraction
    subgraphs["nodes"].append(subgraph["nodes"].tolist())
    subgraphs["edges"].append(subgraph["edges"].tolist())

# Merge all extractions into sorted arrays of unique node and edge indices
for part in ("nodes", "edges"):
    subgraphs[part] = np.unique(
        np.concatenate([np.array(indices) for indices in subgraphs[part]])
    )

In [84]:
# Show the merged, de-duplicated node and edge indices
subgraphs

{'nodes': array([   5,    9,   12,   15,   22,   24,   27,   30,   49,   54,   72,
          82,  264,  547,  610,  706,  708,  741,  742,  743,  840,  845,
         846,  854,  855, 1320, 1415, 1804, 1863, 1952, 2102, 2263, 2479,
        2779, 2827]),
 'edges': array([ 123,  125,  167,  176,  288,  291,  888,  920,  921,  942, 1004,
        1064, 1083, 1088, 1191, 1216, 1268, 1650, 1760, 1846, 2486, 4005,
        4557, 4558, 4958, 5110, 5801, 5802, 5803, 5958, 6613, 6762, 7397,
        7400, 9642])}

In [85]:
# Alias the merged indices under the name used by the visualization cell below
subgraph = subgraphs

In [86]:
# Number of unique nodes in the merged subgraph
subgraph["nodes"].shape

(35,)

In [87]:
# Map every selected node ID to the color configured for its node type.
# A missing "selections" entry is treated as "nothing selected".
node_colors = {
    node_id: cfg.node_colors_dict[node_type]
    for node_type, node_ids in state.get("selections", {}).items()
    for node_id in node_ids
}
# Explicit string dtypes keep the merge key and the color column well-typed
# even when there are no selections at all.
color_df = cudf.DataFrame({
    "node_id": cudf.Series(list(node_colors.keys()), dtype="str"),
    "color": cudf.Series(list(node_colors.values()), dtype="str"),
})

# Prepare graph dataframes
# Nodes: rows picked by position using the extracted node indices
graph_nodes = graph["nodes"].iloc[subgraph['nodes']][
    ['node_id', 'node_name', 'node_type', 'desc', 'enriched_node']
]
graph_nodes = graph_nodes.merge(color_df, on="node_id", how="left")
# Give unselected nodes the default color *before* the visualization list is
# built, so that no node ends up with a null color in graph_dict["nodes"].
graph_nodes["color"] = graph_nodes["color"].fillna("black")
# Edges: rows picked by position using the extracted edge indices
graph_edges = graph["edges"].iloc[subgraph['edges']][
    ['head_id', 'tail_id', 'edge_type']
]

# Prepare lists for visualization
graph_dict = {}
# (node_id, attribute dict) pairs
graph_dict["nodes"] = [
    (
        row.node_id,
        {'desc': row.desc,
         'node_type': row.node_type,
         'node_name': row.node_name,
         'enriched_node': row.enriched_node,
         'color': row.color},
    )
    for row in graph_nodes.to_arrow().to_pandas().itertuples(index=False)
]
# (head_id, tail_id, attribute dict) triples; the edge type becomes a tuple label
graph_dict["edges"] = [
    (
        row.head_id,
        row.tail_id,
        {'label': tuple(row.edge_type)},
    )
    for row in graph_edges.to_arrow().to_pandas().itertuples(index=False)
]

# Prepare the textualized subgraph: node CSV, a blank line, then edge CSV
graph_dict["text"] = (
    graph_nodes[
        ['node_id', 'desc']
    ].rename(columns={'desc': 'node_attr'}).to_arrow().to_pandas().to_csv(index=False)
    + "\n"
    + graph_edges[
        ['head_id', 'edge_type', 'tail_id']
    ].to_arrow().to_pandas().to_csv(index=False)
)

In [88]:
# Nodes without a selection color fall back to 'black'. The result is assigned
# back to the column: an in-place fillna on a column selection may operate on a
# temporary object and silently leave the dataframe unchanged.
graph_nodes['color'] = graph_nodes['color'].fillna('black')
graph_nodes

Unnamed: 0,node_id,node_name,node_type,desc,enriched_node,color
0,"(4S,5E,7Z,10Z,13Z,16Z,19Z)-4-hydroxydocosa-5,7...","(4S,5E,7Z,10Z,13Z,16Z,19Z)-4-hydroxydocosa-5,7...",drug,"(4S,5E,7Z,10Z,13Z,16Z,19Z)-4-hydroxydocosa-5,7...",[H][C@](O)(CCC(O)=O)\C=C\C=C/C\C=C/C\C=C/C\C=C...,black
1,Talmapimod_(17589),Talmapimod,drug,Talmapimod belongs to drug node. Talmapimod is...,[H][C@]1(C)CN(C(=O)C2=C(Cl)C=C3N(C)C=C(C(=O)C(...,black
2,VX-702_(17590),VX-702,drug,VX-702 belongs to drug node. VX-702 is a small...,NC(=O)N(C1=CC=C(C(N)=O)C(=N1)C1=CC=C(F)C=C1F)C...,black
3,Atiprimod_(17591),Atiprimod,drug,Atiprimod belongs to drug node. Investigat...,CCCC1(CCC)CCC2(CCN(CCCN(CC)CC)C2)CC1,black
4,inflammatory bowel disease_(28158),inflammatory bowel disease,disease,inflammatory bowel disease belongs to disease ...,Any inflammatory bowel disease in which the ca...,black
5,IL23R_(34778),IL23R,gene/protein,IL23R belongs to gene/protein node. IL23R is i...,MNQVTIQWDAVIALYILFSWCHGGITNINCSGHIWVEPATIFKMGM...,black
6,NKX2-3_(34779),NKX2-3,gene/protein,NKX2-3 belongs to gene/protein node. NKX2-3 is...,MMLPSPVTSTPFSVKDILNLEQQHQHFHGAHLQADLEHHFHSAPCM...,black
7,Crohn disease_(37784),Crohn disease,disease,Crohn disease belongs to disease node. A gastr...,A gastrointestinal disorder characterized by c...,black
8,ulcerative colitis (disease)_(37785),ulcerative colitis (disease),disease,ulcerative colitis (disease) belongs to diseas...,An inflammatory bowel disease involving the mu...,black
9,regulation of gamma-delta T cell differentiati...,regulation of gamma-delta T cell differentiation,biological_process,regulation of gamma-delta T cell differentiati...,"Any process that modulates the frequency, rate...",black


In [70]:
# Look up the rows of graph_nodes whose node_name is exactly 'TNF'
graph_nodes[graph_nodes.node_name=='TNF']

Unnamed: 0,node_id,node_name,node_type,desc,enriched_node,color
53,TNF_(2329),TNF,gene/protein,TNF belongs to gene/protein node. TNF is tumor...,MSTESMIRDVELAEEALPKKTGGPQGSRRCLFLSLFSFLIVAGATT...,black


In [71]:
# Inspect the (node_id, attributes) pairs prepared for visualization
graph_dict["nodes"]

[('KIF21B_(8564)',
  {'desc': 'KIF21B belongs to gene/protein node. KIF21B is kinesin family member 21B. This gene encodes a member of the kinesin superfamily. Kinesins are ATP-dependent microtubule-based motor proteins that are involved in the intracellular transport of membranous organelles. Single nucleotide polymorphisms in this gene are associated with inflammatory bowel disease and multiple sclerosis. Alternatively spliced transcript variants encoding multiple isoforms have been observed for this gene. [provided by RefSeq, Nov 2011].',
   'node_type': 'gene/protein',
   'node_name': 'KIF21B',
   'enriched_node': 'MAGQGDCCVKVAVRIRPQLSKEKIEGCHICTSVTPGEPQVLLGKDKAFTYDFVFDLDTWQEQIYSTCVSKLIEGCFEGYNATVLAYGQTGAGKTYTMGTGFDMATSEEEQGIIPRAIAHLFGGIAERKRRAQEQGVAGPEFKVSAQFLELYNEEILDLFDSTRDPDTRHRRSNIKIHEDANGGIYTTGVTSRLIHSQEELIQCLKQGALSRTTASTQMNVQSSRSHAIFTIHLCQMRMCTQPDLVNEAVTGLPDGTPPSSEYETLTAKFHFVDLAGSERLKRTGATGERAKEGISINCGLLALGNVISALGDQSKKVVHVPYRDSKLTRLLQDSLGGNSQTIMIACVSPSDRDFMETLNTLKYANRARNIKNK

In [26]:
# Display the current graph_nodes dataframe
# NOTE(review): the execution count of this cell is out of sequence, so the
# saved output may come from an earlier state of graph_nodes -- re-run to confirm.
graph_nodes

Unnamed: 0,node_index,node_name,node_id,node_type,desc,desc_x,enriched_node,x,color
0,1296,negative regulation of calcium ion transport,negative regulation of calcium ion transport_(...,biological_process,negative regulation of calcium ion transport b...,"[0.06554254, 0.081316106, -0.1285265, 0.049076...","Any process that stops, prevents, or reduces t...","[0.087609835, 0.070211984, -0.14004038, 0.0102...",
1,1297,vasoconstriction,vasoconstriction_(45312),biological_process,vasoconstriction belongs to biological_process...,"[0.09431399, 0.04267256, -0.14150876, -0.01277...","A decrease in the diameter of blood vessels, e...","[0.124152906, 0.08743669, -0.14511469, 0.00278...",
2,1298,positive regulation of vascular permeability,positive regulation of vascular permeability_(...,biological_process,positive regulation of vascular permeability b...,"[0.07449961, 0.05626029, -0.166832, 0.00161353...",Any process that increases the extent to which...,"[0.08488041, 0.08616824, -0.16508923, -0.05333...",
3,1299,negative regulation of cation channel activity,negative regulation of cation channel activity...,biological_process,negative regulation of cation channel activity...,"[0.052357886, 0.057002943, -0.13946539, 0.0354...","Any process that stops, prevents or reduces th...","[0.07234384, 0.062300272, -0.14855722, -0.0117...",
4,1300,regulation of cell cycle,regulation of cell cycle_(45381),biological_process,regulation of cell cycle belongs to biological...,"[0.05315926, 0.031067748, -0.14542376, 0.02276...",Any process that modulates the rate or extent ...,"[0.049430866, 0.04014543, -0.15489803, 0.00624...",
...,...,...,...,...,...,...,...,...,...
2986,11,PPARA,PPARA_(1122),gene/protein,PPARA belongs to gene/protein node. PPARA is p...,"[0.036692616, 0.03641927, -0.15898165, 0.00553...",MVDTESPLCPLSPLEAGDLESPLSEEFLQEMGNIQEISQSIGEDSS...,"[-0.00225809495896101, 0.03649000823497772, 0....",
2987,12,IL10RA,IL10RA_(1299),gene/protein,IL10RA belongs to gene/protein node. IL10RA is...,"[0.01976429, 0.020241434, -0.16177836, -0.0315...",MLPCLVVLLAALLSLRLGSDAHGTELPSPPSVWFEAEFFHHILHWT...,"[0.01952783204615116, 0.059050511568784714, 0....",
2988,13,ADIPOQ,ADIPOQ_(1480),gene/protein,ADIPOQ belongs to gene/protein node. ADIPOQ is...,"[0.06728461, 0.036462087, -0.14274466, -0.0324...",MLLLGAVLLLLALPGHDQETTTQGPGVLLPLPKGACTGWMAGIPGH...,"[-0.02511977031826973, -0.12880179286003113, -...",
2989,14,IL6,IL6_(1567),gene/protein,IL6 belongs to gene/protein node. IL6 is inter...,"[0.0419631, 0.02979343, -0.17528865, -0.003821...",MNSFSTSAFGPVAFSLGLLLVLPAAFPAPVPPGEDSKDVAAPHRQP...,"[0.05400313809514046, 0.00949622318148613, 0.0...",


In [74]:
# Print the textualized subgraph (node CSV followed by edge CSV)
print(graph_dict['text'])

node_id,node_attr
KIF21B_(8564),"KIF21B belongs to gene/protein node. KIF21B is kinesin family member 21B. This gene encodes a member of the kinesin superfamily. Kinesins are ATP-dependent microtubule-based motor proteins that are involved in the intracellular transport of membranous organelles. Single nucleotide polymorphisms in this gene are associated with inflammatory bowel disease and multiple sclerosis. Alternatively spliced transcript variants encoding multiple isoforms have been observed for this gene. [provided by RefSeq, Nov 2011]."
INAVA_(9104),"INAVA belongs to gene/protein node. INAVA is innate immunity activator. Involved in several processes, including nucleotide-binding activity oligomerization domain containing 2 signaling pathway; positive regulation of cytokine production; and positive regulation of intracellular signal transduction. Located in cytoplasm and nucleus. Implicated in inflammatory bowel disease 29. [provided by Alliance of Genome Resources, Apr 2022]"
IL

In [21]:
# Display the current graph_edges dataframe
graph_edges

Unnamed: 0,head_id,tail_id,edge_type
10757,macrophage derived foam cell differentiation_(...,TGFB1_(2889),"[biological_process, interacts with, gene/prot..."
10508,positive regulation of tyrosine phosphorylatio...,IL21_(34967),"[biological_process, interacts with, gene/prot..."
4864,IFNG_(3495),positive regulation of tyrosine phosphorylatio...,"[gene/protein, interacts with, biological_proc..."
5120,PPARG_(989),macrophage derived foam cell differentiation_(...,"[gene/protein, interacts with, biological_proc..."
1190,TGFB1_(2889),"inflammatory bowel disease, immunodeficiency, ...","[gene/protein, associated with, disease]"
1192,IL21_(34967),IL21-related infantile inflammatory bowel dise...,"[gene/protein, associated with, disease]"
6769,Crohn disease_(37784),NKX2-3_(34779),"[disease, associated with, gene/protein]"
6812,Crohn ileitis and jejunitis_(35814),ATG16L1_(6661),"[disease, associated with, gene/protein]"
6555,inflammatory bowel disease_(28158),IL1B_(1004),"[disease, associated with, gene/protein]"
993,ICOSLG_(9454),inflammatory bowel disease_(28158),"[gene/protein, associated with, disease]"


In [17]:
# Inspect the (node_id, attributes) pairs prepared for visualization
# NOTE(review): same expression as an earlier cell; the saved output here has
# no 'color' key, so it presumably predates the color merge -- confirm.
graph_dict["nodes"]

[('protein kinase A binding_(54006)',
  {'desc': 'protein kinase A binding belongs to molecular_function node. ',
   'node_type': 'molecular_function',
   'node_name': 'protein kinase A binding',
   'enriched_node': 'Binding to a protein kinase A.'}),
 ('RNA polymerase II-specific DNA-binding transcription factor binding_(54015)',
  {'desc': 'RNA polymerase II-specific DNA-binding transcription factor binding belongs to molecular_function node. ',
   'node_type': 'molecular_function',
   'node_name': 'RNA polymerase II-specific DNA-binding transcription factor binding',
   'enriched_node': 'Binding to a sequence-specific DNA binding RNA polymerase II transcription factor, any of the factors that interact selectively and non-covalently with a specific DNA sequence in order to modulate transcription.'}),
 ('enzyme binding_(54290)',
  {'desc': 'enzyme binding belongs to molecular_function node. ',
   'node_type': 'molecular_function',
   'node_name': 'enzyme binding',
   'enriched_node': 

In [15]:
# Inspect the query dataframe again (matched nodes plus the prompt row)
query_df

Unnamed: 0,node_id,node_type,enriched_node,x,desc,desc_x,use_description
0,IL7R_(625),gene/protein,MTILGTTFGMVFSLLQVVSGESGYAQNGDLEDAELDDYSFSCYSQL...,"[0.06364653259515762, 0.06951971352100372, 0.0...",IL7R belongs to gene/protein node. IL7R is int...,"[0.04506902, 0.008911126, -0.17318207, -0.0157...",False
1,TCF7_(5195),gene/protein,MPQLDSGGGGAGGGDDLGAPDELLAFQDEGEEQDDKSRDSAAGPER...,"[0.028321774676442146, 0.003539376426488161, 0...",TCF7 belongs to gene/protein node. TCF7 is tra...,"[0.036997586, 0.038098544, -0.19027671, -0.006...",False
2,user_prompt,prompt,,"[0.047626022, 0.05007147, -0.16455309, -0.0442...",,"[0.047626022, 0.05007147, -0.16455309, -0.0442...",True


In [None]:
from cuvs.distance import pairwise_distance
def _compute_sim_scores(features_a: cp.ndarray,
                        features_b: cp.ndarray,
                        metric: str = "cosine"):
    """
    Turn pairwise distances between two feature sets into similarity scores.

    The distances are computed by cuVS ``pairwise_distance`` with the given
    metric; each similarity is ``1 - distance`` and the result is flattened
    to one dimension.

    Args:
        features_a: The first set of features.
        features_b: The second set of features.
        metric: The distance metric passed to ``pairwise_distance``.

    Returns:
        A flat CuPy array holding ``1 - distance`` for every pair.
    """
    distances = cp.asarray(
        pairwise_distance(features_a, features_b, metric=metric)
    )
    return 1 - distances.ravel()

In [None]:
# Take the 'x' embedding of the query row at position 2 and turn it into a
# single-row float32 CuPy matrix.
# NOTE(review): position 2 is the user-prompt row in the query_df shown above;
# the hard-coded index presumably breaks if the number of matched nodes
# changes -- confirm.
query_emb = cp.array(query_df.iloc[2]['x'][2]).reshape(1, -1).astype(cp.float32)
query_emb.shape

In [None]:
# Similarity of every graph_nodes row to query_emb, based on the description
# embeddings ("desc_x"), stored in a float32 cuDF series initialised to zeros.
# NOTE(review): the visualization cell narrows graph_nodes to columns that do
# not include "desc_x"; this cell presumably needs the full node table --
# confirm the intended execution order.
sim_scores = cudf.Series(cp.zeros(len(graph_nodes), dtype=cp.float32))
sim_scores[:] = _compute_sim_scores(
                # flatten the list column and reshape it to one row per node
                graph_nodes["desc_x"].list.leaves.to_cupy().reshape(
                    -1, len(graph_nodes["desc_x"][0])
                ).astype(cp.float32),
                query_emb
            )  # flattened by _compute_sim_scores: one score per node row
sim_scores

In [None]:
# Sanity check: sim_scores should still be a cuDF series after the assignment
isinstance(sim_scores, cudf.Series)

In [None]:
# Rebuild the query dataframe from the full node table (nodes_df): one row per
# node matched by the uploaded multimodal file, plus the user prompt row.

# Initialize dataframes
multimodal_df = cudf.DataFrame({"name": [], "node_type": []})
query_df = cudf.DataFrame({"node_id": [],
                           "node_type": [],
                           "x": [],
                           "desc_x": [],
                           "use_description": []})
# Always (re)set the selections so that later cells can read
# state["selections"] even when no multimodal file is uploaded, and so that a
# re-run never keeps selections from a previous run.
state["selections"] = {}

# Loop over the uploaded files and find multimodal files
for uploaded_file in state["uploaded_files"]:
    # Check if multimodal file is uploaded
    if uploaded_file["file_type"] == "multimodal":
        # Read the csv file (the last multimodal file wins)
        multimodal_df = cudf.read_csv(uploaded_file["file_path"])

# Check if the multimodal_df is empty
if len(multimodal_df) > 0:
    # Prefix the query columns so they do not clash with the node columns
    multimodal_df = multimodal_df.rename(
        columns={"name": "q_node_name", "node_type": "q_node_type"}
    )

    # Pair every graph node with every queried name (cross join)
    query_df = nodes_df[
        ['node_id', 'node_name', 'node_type', 'enriched_node', 'x', 'desc', 'desc_x']
    ].merge(multimodal_df, how='cross')
    query_df['q_node_name'] = query_df['q_node_name'].str.lower()
    query_df['node_name'] = query_df['node_name'].str.lower()
    # Keep the pairs where the node name contains the queried name and the
    # node types agree
    mask = (
        query_df['node_name'].str.contains(query_df['q_node_name']) &
        (query_df['node_type'] == query_df['q_node_type'])
    )
    query_df = query_df[mask]
    query_df = query_df[
        ['node_id', 'node_type', 'enriched_node', 'x', 'desc', 'desc_x']
    ].reset_index(drop=True)
    query_df['use_description'] = False # set to False for modal-specific embeddings

    # Update the state by adding the selected node IDs, grouped by node type
    state["selections"] = query_df.to_pandas().groupby(
        "node_type"
    )["node_id"].apply(list).to_dict()

# Append a user prompt to the query dataframe
query_df = cudf.concat([
    query_df,
    cudf.DataFrame({
        'node_id': 'user_prompt',
        'node_type': 'prompt',
        # 'enriched_node': prompt,
        'x': prompt_emb,
        # 'desc': prompt,
        'desc_x': prompt_emb,
        'use_description': True # set to True for user prompt embedding
    })
]).reset_index(drop=True)


### Before

In [None]:
from torch_geometric.data import Data

# Module-level settings read by the prize functions defined below
topk = state["topk_nodes"]  # number of top-ranked nodes that receive a prize
topk_e = state["topk_edges"]  # number of top-ranked distinct edge scores that receive a prize
# NOTE(review): hard-coded; equals cfg.c_const in the config printed above -- confirm it should track the config
c_const = 0.01

def _compute_node_prizes(graph: Data,
                         query_emb: torch.Tensor,
                         modality: str,
                         use_description: bool=False) -> torch.Tensor:
    """
    Compute the node prizes based on the cosine similarity between the query and nodes.

    Args:
        graph: The knowledge graph in PyTorch Geometric Data format.
        query_emb: The query embedding in PyTorch Tensor format. This can be an embedding of
            a prompt, sequence, or any other feature to be used for the subgraph extraction.
        modality: The modality to use for the subgraph extraction based on the node type.
        use_description: If True, score all nodes against their textual description
            embeddings (``desc_x``); otherwise score only the nodes whose type equals
            ``modality`` against their modality-specific embeddings (``x``).

    Returns:
        The prizes of the nodes: the top-ranked nodes receive descending integer
        prizes (k, k-1, ..., 1) and every other node receives 0.
    """
    # Convert PyG graph to a DataFrame
    # NOTE(review): this builds a cuDF frame while the standalone cell further
    # down uses pandas; confirm that the `.values` accesses below behave the
    # same on cuDF list/float columns.
    graph_df = cudf.DataFrame({
        "node_type": graph.node_type,
        "desc_x": [x.tolist() for x in graph.desc_x],
        "x": [list(x) for x in graph.x],
        "score": [0.0 for _ in range(len(graph.node_id))],
    })

    # Calculate cosine similarity for text features and update the score
    if use_description:
        graph_df.loc[:, "score"] = torch.nn.CosineSimilarity(dim=-1)(
                query_emb,
                torch.tensor(list(graph_df.desc_x.values)) # Using textual description features
            ).tolist()
    else:
        # Nodes of other types keep their initial score of 0.0
        graph_df.loc[graph_df["node_type"] == modality,
                        "score"] = torch.nn.CosineSimilarity(dim=-1)(
                query_emb,
                torch.tensor(list(graph_df[graph_df["node_type"]== modality].x.values))
            ).tolist()

    # Set the prizes for nodes based on the similarity scores
    n_prizes = torch.tensor(graph_df.score.values, dtype=torch.float32)
    # Use a separate local name for the clamped value: assigning to `topk`
    # here would make it function-local and raise UnboundLocalError when the
    # module-level `topk` is read on the right-hand side.
    num_top_nodes = min(topk, graph.num_nodes)
    _, topk_n_indices = torch.topk(n_prizes, num_top_nodes, largest=True)
    n_prizes = torch.zeros_like(n_prizes)
    # Rank-based prizes: best node gets num_top_nodes, the next one less, ...
    n_prizes[topk_n_indices] = torch.arange(num_top_nodes, 0, -1).float()

    return n_prizes

def _compute_edge_prizes(graph: Data,
                         text_emb: torch.Tensor) -> torch.Tensor:
    """
    Compute the edge prizes based on the cosine similarity between the query and edges.

    Args:
        graph: The knowledge graph in PyTorch Geometric Data format.
        text_emb: The textual description embedding in PyTorch Tensor format.

    Returns:
        The prizes of the edges. Edges scoring below the top-ranked distinct
        similarity values get 0; the prize of each remaining rank is shared
        among the edges having that similarity value.
    """
    # Note that as of now, the edge features are based on textual features
    # Compute prizes for edges
    e_prizes = torch.nn.CosineSimilarity(dim=-1)(text_emb, graph.edge_attr)
    unique_prizes, inverse_indices = e_prizes.unique(return_inverse=True)
    # Use a separate local name for the clamped value: assigning to `topk_e`
    # here would make it function-local and raise UnboundLocalError when the
    # module-level `topk_e` is read on the right-hand side.
    num_top_edges = min(topk_e, unique_prizes.size(0))
    if num_top_edges <= 0:
        # No edges, or no edge should be rewarded: nothing gets a prize
        return torch.zeros_like(e_prizes)
    topk_e_values, _ = torch.topk(unique_prizes, num_top_edges, largest=True)
    # Zero out everything below the smallest retained similarity value
    e_prizes[e_prizes < topk_e_values[-1]] = 0.0
    last_topk_e_value = num_top_edges
    for k in range(num_top_edges):
        # Boolean mask of the edges whose similarity equals the k-th best value
        indices = inverse_indices == (
            unique_prizes == topk_e_values[k]
        ).nonzero(as_tuple=True)[0]
        # Split the rank prize among the tied edges and keep it no larger
        # than the (slightly discounted) prize of the previous rank
        value = min((num_top_edges - k) / indices.sum().item(), last_topk_e_value)
        e_prizes[indices] = value
        last_topk_e_value = value * (1 - c_const)

    return e_prizes

In [None]:
import pandas as pd

# Take the PyG graph and the first query row as inputs for the torch/pandas ("before") path
graph = initial_graph["pyg"]
text_emb = torch.tensor(query_df.iloc[0]['desc_x'][0])
query_emb = torch.tensor(query_df.iloc[0]['x'][0])
modality = query_df.iloc[0]['node_type'][0]

# Convert PyG graph to a DataFrame; every node starts with a score of zero
graph_df = pd.DataFrame({
    "node_type": graph.node_type,
    "desc_x": [x.tolist() for x in graph.desc_x],
    "x": [list(x) for x in graph.x],
    "score": [0.0] * len(graph.node_id),
})

# Score only the nodes of the query's modality against the query embedding
same_modality = graph_df["node_type"] == modality
modality_features = torch.tensor(list(graph_df[same_modality].x.values))
cosine = torch.nn.CosineSimilarity(dim=-1)
graph_df.loc[same_modality, "score"] = cosine(query_emb, modality_features).tolist()

In [None]:
# Turn the similarity scores into rank-based node prizes:
# the best node gets `topk`, the next `topk - 1`, ..., all others 0.
node_scores = torch.tensor(graph_df.score.values, dtype=torch.float32)
topk = min(topk, graph.num_nodes)
topk_n_indices = torch.topk(node_scores, topk, largest=True).indices
n_prizes = torch.zeros_like(node_scores)
n_prizes[topk_n_indices] = torch.arange(topk, 0, -1).float()
n_prizes

In [None]:
graph_edges = edges_df
c_const = cfg.c_const

# Edge features are stored as a cuDF list column; bring them to host as a dense tensor
edge_features = torch.tensor(
    [list(f) for f in graph_edges.edge_attr.to_arrow().to_pandas().values]
)
# `as_tensor` avoids the copy-construct warning when `text_emb` is already a tensor
e_prizes = torch.nn.CosineSimilarity(dim=-1)(torch.as_tensor(text_emb), edge_features)
# Sorted distinct similarity values plus, per edge, its position among them
unique_prizes, inverse_indices = e_prizes.unique(return_inverse=True)
topk_e = min(topk_e, unique_prizes.size(0))
# topk indices point into `unique_prizes`, so they match `inverse_indices` directly
topk_e_values, topk_e_positions = torch.topk(unique_prizes, topk_e, largest=True)
e_prizes[e_prizes < topk_e_values[-1]] = 0.0
last_topk_e_value = topk_e
for k in range(topk_e):
    indices = inverse_indices == topk_e_positions[k]
    # Tied edges share the rank prize; each rank stays below the previous one
    value = min((topk_e - k) / indices.sum().item(), last_topk_e_value)
    e_prizes[indices] = value
    last_topk_e_value = value * (1 - c_const)


In [None]:
# Assemble the PCST inputs (prize vector, edge list, edge costs) from the node and edge prizes.
prizes = {"nodes": n_prizes, "edges": e_prizes}
cost_e = cfg.cost_e
c_const = cfg.c_const

# Logic to reduce the cost of the edges such that at least one edge is selected
updated_cost_e = min(
    cost_e,
    prizes["edges"].max().item() * (1 - c_const / 2),
)

# Initialize variables
edges = []
costs = []
# Edges whose prize exceeds the cost are replaced by a "virtual" node carrying the surplus
virtual = {
    "n_prizes": [],
    "edges": [],
    "costs": [],
}
# mapping["edges"]: position in `edges` -> original edge index
# mapping["nodes"]: virtual node id -> original edge index
mapping = {"nodes": {}, "edges": {}}

# Compute the costs, edges, and virtual variables based on the prizes
for i, (src, dst) in enumerate(graph.edge_index.T.numpy()):
    prize_e = prizes["edges"][i]
    if prize_e <= updated_cost_e:
        # Regular edge: keep it, its cost is the shortfall of the prize
        mapping["edges"][len(edges)] = i
        edges.append((src, dst))
        costs.append(updated_cost_e - prize_e)
    else:
        # High-prize edge: split into src -> virtual -> dst with zero-cost halves
        virtual_node_id = graph.num_nodes + len(virtual["n_prizes"])
        mapping["nodes"][virtual_node_id] = i
        virtual["edges"].append((src, virtual_node_id))
        virtual["edges"].append((virtual_node_id, dst))
        virtual["costs"].append(0)
        virtual["costs"].append(0)
        virtual["n_prizes"].append(prize_e - updated_cost_e)
# `prizes` is rebound here from the dict to a flat array: real nodes first, then virtual nodes
prizes = np.concatenate([prizes["nodes"], np.array(virtual["n_prizes"])])
edges_dict = {}
edges_dict["edges"] = edges
edges_dict["num_prior_edges"] = len(edges)
# Final computation of the costs and edges based on the virtual costs and virtual edges
# (when there are no virtual edges, `costs` and `edges` remain Python lists)
if len(virtual["costs"]) > 0:
    costs = np.array(costs + virtual["costs"])
    edges = np.array(edges + virtual["edges"])
    edges_dict["edges"] = edges


In [None]:
# Distinct values in the combined (node + virtual node) prize vector
np.unique(prizes)

In [None]:
# NOTE(review): `prizes` was built with np.concatenate in the cell above — confirm cp.unique accepts a NumPy array
cp.unique(prizes)

In [None]:
# Edge list handed to PCST (regular edges followed by virtual edges, if any)
edges_dict["edges"]

In [None]:
# NOTE(review): `costs` may already be a NumPy array that includes the virtual costs; `+` would then add element-wise rather than concatenate — confirm intent
costs + virtual["costs"]

In [None]:
# Number of virtual edge halves (two per virtual node)
len(virtual["costs"])

In [None]:
# Same check as the previous cell
len(virtual["costs"])

In [None]:
# Edge costs as left by the PCST input cell
costs

### After

In [None]:
# Start from empty placeholders so both names exist even without a multimodal upload
multimodal_df = cudf.DataFrame({"name": [], "node_type": []})
query_df = cudf.DataFrame(
    {"node_id": [], "node_type": [], "x": [], "desc_x": [], "use_description": []}
)

# Scan the uploaded files; each file tagged "multimodal" is read (a later one replaces an earlier one)
for uploaded_file in state["uploaded_files"]:
    if uploaded_file["file_type"] == "multimodal":
        multimodal_df = cudf.read_csv(uploaded_file["file_path"])

# Build the query dataframe only when the multimodal file contributed rows
if len(multimodal_df) > 0:
    # Prefix the uploaded columns with "q_" before combining them with the graph node columns
    multimodal_df = multimodal_df.rename(
        columns={"name": "q_node_name", "node_type": "q_node_type"}
    )

    # Cross join: every graph node is paired with every uploaded query row
    node_columns = ['node_id', 'node_name', 'node_type', 'enriched_node', 'x', 'desc', 'desc_x']
    query_df = graph_nodes[node_columns].merge(multimodal_df, how='cross')
#     query_df['q_node_name'] = query_df['q_node_name'].str.lower()
#     query_df['node_name'] = query_df['node_name'].str.lower()
#     # Get the mask for filtering based on the query
#     mask = (
#         query_df['node_name'].str.contains(query_df['q_node_name']) &
#         (query_df['node_type'] == query_df['q_node_type'])
#     )
#     query_df = query_df[mask]
#     query_df = query_df[['node_id',
#                             'node_type', 
#                             'enriched_node', 
#                             'x', 
#                             'desc', 
#                             'desc_x']].reset_index(drop=True)
#     query_df['use_description'] = False # set to False for modal-specific embeddings

#     # Update the state by adding the the selected node IDs
#     state["selections"] = query_df.to_pandas().groupby(
#         "node_type"
#     )["node_id"].apply(list).to_dict()

# # Append a user prompt to the query dataframe
# query_df = cudf.concat([
#     query_df,
#     cudf.DataFrame({
#         'node_id': 'user_prompt',
#         'node_type': 'prompt',
#         # 'enriched_node': prompt,
#         'x': prompt_emb,
#         # 'desc': prompt,
#         'desc_x': prompt_emb,
#         'use_description': True # set to True for user prompt embedding
#     })
# ]).reset_index(drop=True)

In [None]:
# Inspect the query dataframe produced by the previous cell
query_df

In [None]:
def _compute_sim_scores(features_a, features_b, metric="cosine"):
    """
    Compute similarity scores between two feature matrices on the GPU.

    Args:
        features_a: First feature matrix passed to cuVS pairwise distance.
        features_b: Second feature matrix passed to cuVS pairwise distance.
        metric: Distance metric name forwarded to cuVS (default "cosine").

    Returns:
        A flattened CuPy array holding `1 - distance` for every pair.
    """
    distances = cuvs.distance.pairwise_distance(features_a, features_b, metric=metric)
    # Flatten the pairwise matrix and convert distance to similarity
    return 1 - cp.asarray(distances).ravel()

In [None]:
# Bind the working inputs for the cuDF/CuPy node-prize computation
graph_nodes = nodes_df
graph_edges = edges_df
topk = cfg.topk
topk_e = cfg.topk_e
use_description = False
# query_emb = torch.tensor(query_df.iloc[0]['x'][0]) # torch.Size([2560])
# Query embeddings as (1, dim) float32 matrices, the layout passed to the pairwise distance helper
text_emb = cp.array(query_df.iloc[1]['desc_x'][1]).reshape(1, -1).astype(cp.float32)
query_emb = cp.array(query_df.iloc[1]['x'][1]).reshape(1, -1).astype(cp.float32)

# Initialize variables
sim_scores = cudf.Series(cp.zeros(len(graph_nodes), dtype=cp.float32))
# NOTE(review): `modality` is not assigned in this cell; it comes from an earlier cell — confirm it matches the query row used above
mask = (graph_nodes.node_type == modality)

# Calculate cosine similarity for text features and update the score
if use_description:
    # Description embeddings are compared with the query's description embedding.
    # NOTE(review): this branch previously passed `query_emb`; confirm `text_emb` is the intended counterpart of `desc_x`.
    desc_features = graph_nodes["desc_x"]
    # Wrap in a Series so the ranking below works the same for both branches
    sim_scores = cudf.Series(
        _compute_sim_scores(
            desc_features.list.leaves.to_cupy().reshape(len(desc_features), -1).astype(cp.float32),
            text_emb
        )
    )  # shape [N]
else:
    # Reshape by row count rather than by `[0]`: on a filtered series `[0]` is a
    # label lookup and fails when row 0 is not part of the selected modality.
    modality_features = graph_nodes[mask]["x"]
    sim_scores[mask] = _compute_sim_scores(
        modality_features.list.leaves.to_cupy().reshape(len(modality_features), -1).astype(cp.float32),
        query_emb
    )  # shape [number of nodes of this modality]

# Set the prizes for nodes based on the similarity scores:
# the best node gets `topk`, the next `topk - 1`, ..., all others 0.
topk = min(topk, sim_scores.size)
n_prizes = cudf.Series(0.0, index=cp.arange(sim_scores.size))
n_prizes[(-sim_scores).sort_values()[:topk].index] = cp.arange(topk, 0, -1).astype(cp.float32)
n_prizes = n_prizes.to_cupy()

In [None]:
# Decay constant read from the config so this cell does not depend on an earlier cell
c_const = cfg.c_const

# Similarity of every edge's textual embedding to the query's textual embedding
e_prizes = _compute_sim_scores(
    graph_edges["edge_attr"].list.leaves.to_cupy().reshape(-1, len(graph_edges["edge_attr"][0])).astype(cp.float32),
    text_emb)

# Sorted distinct similarity values plus, per edge, its position among them
unique_prizes, inverse_indices = cp.unique(e_prizes, return_inverse=True)
# Bound by the number of distinct edge prizes (not by the node score count);
# otherwise the loop below indexes past the end of `topk_e_values`.
topk_e = min(topk_e, unique_prizes.size)
topk_e_positions = cp.argsort(-unique_prizes)[:topk_e]
topk_e_values = unique_prizes[topk_e_positions]
e_prizes[e_prizes < topk_e_values[-1]] = 0.0
last_topk_e_value = topk_e
for k in range(topk_e):
    # Positions index `unique_prizes`, so they compare directly with `inverse_indices`
    indices = inverse_indices == topk_e_positions[k]
    # Tied edges share the rank prize; each rank stays below the previous one
    value = min((topk_e - k) / indices.sum().item(), last_topk_e_value)
    e_prizes[indices] = value
    last_topk_e_value = value * (1 - c_const)


In [None]:
# Bundle node and edge prizes (the same statement is repeated at the top of the PCST input cell below)
prizes = {"nodes": n_prizes, "edges": e_prizes}

In [None]:
cost_e = cfg.cost_e

# Logic to reduce the cost of the edges such that at least one edge is selected
# NOTE(review): `c_const` is not assigned in this cell; it is read from an earlier cell
updated_cost_e = min(
    cost_e,
    prizes["edges"].max().item() * (1 - c_const / 2),
)

In [None]:
# def _create_edge_index(graph_nodes, graph_edges):
#     # Create and additional node_index column
#     graph_nodes = graph_nodes.reset_index(drop=True)
#     graph_nodes['node_index'] = graph_nodes.index

#     # Get head_index and tail_index
#     edges = graph_edges.merge(graph_nodes[['node_id', 'node_index']],
#                             left_on='head_id', right_on='node_id',
#                             how='left').rename(columns={'node_index': 'head_index'}).drop(columns=['node_id'])
#     edges = edges.merge(graph_nodes[['node_id', 'node_index']],
#                         left_on='tail_id', right_on='node_id',
#                         how='left').rename(columns={'node_index': 'tail_index'}).drop(columns=['node_id'])

#     # Stacking to get into edge_index
#     edge_index = cp.stack([
#         edges['head_index'].to_cupy(),
#         edges['tail_index'].to_cupy()
#     ])

#     return edge_index

In [None]:
# Build a 2 x E index array: row 0 holds head positions, row 1 holds tail positions
head_index = graph_edges['head_index'].to_cupy()
tail_index = graph_edges['tail_index'].to_cupy()
edge_index = cp.stack([head_index, tail_index])
edge_index

In [None]:
# Assemble the PCST inputs (prize vector, edge list, edge costs) as CuPy arrays.
prizes = {"nodes": n_prizes, "edges": e_prizes}

cost_e = cfg.cost_e

# Logic to reduce the cost of the edges such that at least one edge is selected
updated_cost_e = min(
    cost_e,
    prizes["edges"].max().item() * (1 - c_const / 2),
)

# Initialize variables
edges = []
costs = []
# Edges whose prize exceeds the cost are replaced by a "virtual" node carrying the surplus
virtual = {
    "n_prizes": [],
    "edges": [],
    "costs": [],
}
# mapping["edges"]: position in `edges` -> original edge index
# mapping["nodes"]: virtual node id -> original edge index
mapping = {"nodes": {}, "edges": {}}

# Copy the edge endpoints and prizes to the host once instead of calling
# `.item()` (one device synchronisation) per edge inside the loop.
edge_pairs = cp.asnumpy(edge_index.T).tolist()
edge_prize_values = cp.asnumpy(prizes["edges"]).tolist()
num_real_nodes = graph_nodes.shape[0]

# Compute the costs, edges, and virtual variables based on the prizes
for i, ((src, dst), prize_e) in enumerate(zip(edge_pairs, edge_prize_values)):
    if prize_e <= updated_cost_e:
        # Regular edge: keep it, its cost is the shortfall of the prize
        mapping["edges"][len(edges)] = i
        edges.append((src, dst))
        costs.append(updated_cost_e - prize_e)
    else:
        # High-prize edge: split into src -> virtual -> dst with zero-cost halves
        virtual_node_id = num_real_nodes + len(virtual["n_prizes"])
        mapping["nodes"][virtual_node_id] = i
        virtual["edges"].append((src, virtual_node_id))
        virtual["edges"].append((virtual_node_id, dst))
        virtual["costs"].append(0)
        virtual["costs"].append(0)
        virtual["n_prizes"].append(prize_e - updated_cost_e)
# `prizes` is rebound from the dict to a flat array: real nodes first, then virtual nodes
prizes = cp.concatenate([prizes["nodes"], cp.array(virtual["n_prizes"])])
edges_dict = {}
edges_dict["num_prior_edges"] = len(edges)
# Final computation of the costs and edges based on the virtual costs and virtual edges.
# Always convert to CuPy arrays so the PCST cell can call `.get()` on them
# even when no virtual edge was created.
costs = cp.array(costs + virtual["costs"], dtype=cp.float64)
edges = cp.array(edges + virtual["edges"], dtype=cp.int64).reshape(-1, 2)
edges_dict["edges"] = edges

In [None]:
import pcst_fast
# PCST settings taken from the Hydra config
root, num_clusters = cfg.root, cfg.num_clusters
pruning, verbosity_level = cfg.pruning, cfg.verbosity_level

# Fetch the inputs with `.get()` before handing them to pcst_fast
host_edges = edges_dict["edges"].get()
host_prizes = prizes.get()
host_costs = costs.get()

# Retrieve the subgraph using the PCST algorithm
result_vertices, result_edges = pcst_fast.pcst_fast(
    host_edges, host_prizes, host_costs, root, num_clusters, pruning, verbosity_level
)

In [None]:
# Map the PCST result (which may contain virtual nodes) back to original node and edge indices.
# edge_index = _create_edge_index(graph_nodes, graph_edges)
vertices = cp.asarray(result_vertices)
edges_dict = {"edges": cp.asarray(result_edges),
              "num_prior_edges": edges_dict["num_prior_edges"]}

# Get edges information
edges = edges_dict["edges"]
num_prior_edges = edges_dict["num_prior_edges"]
# Count real nodes from the cuDF node table in every comparison
# (one comparison previously read `graph.num_nodes` from the PyG graph).
num_real_nodes = len(graph_nodes)

# Retrieve the selected nodes and edges based on the given vertices and edges
subgraph_nodes = vertices[vertices < num_real_nodes]
# Result edges below `num_prior_edges` are regular edges; translate them via the mapping
subgraph_edges = [mapping["edges"][e] for e in edges.tolist() if e < num_prior_edges]
# Selected virtual nodes stand for original edges; translate them as well
virtual_vertices = vertices[vertices >= num_real_nodes]
virtual_edges = [mapping["nodes"][v] for v in virtual_vertices.tolist()]
# Always a CuPy integer array, regardless of whether virtual nodes were selected
subgraph_edges = cp.array(subgraph_edges + virtual_edges, dtype=cp.int64)

# Keep the full `edge_index` untouched so this cell can be re-run
subgraph_edge_index = edge_index[:, subgraph_edges]
# The node set is the selected real nodes plus every endpoint of a selected edge
subgraph_nodes = cp.unique(
    cp.concatenate(
        [subgraph_nodes, subgraph_edge_index[0], subgraph_edge_index[1]]
    )
)


In [None]:
# Subgraph extracted for the current query: node indices and edge indices
subgraph = {"nodes": subgraph_nodes, "edges": subgraph_edges}

In [None]:
# Accumulator for the subgraphs of all queries (one list entry per query)
subgraphs = {}
subgraphs["nodes"] = []
subgraphs["edges"] = []

In [None]:
# Append the extracted subgraph to the dictionary
# NOTE(review): `.tolist()` requires array inputs; `subgraph["edges"]` can be a plain list when the retrieval cell selects no virtual vertices — confirm
subgraphs["nodes"].append(subgraph["nodes"].tolist())
subgraphs["edges"].append(subgraph["edges"].tolist())

In [None]:
# Concatenate and get unique node and edge indices.
# Both results stay on the GPU: `cp.unique` is used for edges as well as nodes
# instead of passing a CuPy array to `np.unique`.
# Note: this rebinds the accumulator lists to arrays, so the append cell above
# cannot be re-run without re-initialising `subgraphs`.
subgraphs["nodes"] = cp.unique(
    cp.concatenate([cp.array(list_) for list_ in subgraphs["nodes"]])
)
subgraphs["edges"] = cp.unique(
    cp.concatenate([cp.array(list_) for list_ in subgraphs["edges"]])
)

In [None]:
# Selected node IDs stored in the state
state["selections"]

In [None]:
# Unique node indices of the merged subgraphs
subgraphs['nodes']

In [None]:
# Unique node IDs of the subgraph nodes, looked up positionally in the node table
np.unique(np.array(graph_nodes.iloc[subgraphs['nodes']][['node_id', 'node_name', 'node_type', 'desc', 'enriched_node']].node_id.to_arrow().to_pylist()))

In [None]:
# Unique node IDs appearing as head or tail of the subgraph edges
np.unique(np.array([graph_edges.iloc[subgraphs['edges']][['head_id', 'tail_id']].head_id.to_arrow().to_pylist() +\
graph_edges.iloc[subgraphs['edges']][['head_id', 'tail_id']].tail_id.to_arrow().to_pylist()]))

In [None]:
import cugraph as cg
import nx_cugraph as nxcg

In [None]:
# Scratch: instantiate an nx_cugraph graph; the result is not stored.
# NOTE(review): if nx_cugraph mirrors networkx here, `directed=True` is kept as a graph attribute and does not make the graph directed — confirm
nxcg.Graph(directed=True)

In [None]:
graph_nodes = nodes_df
graph_edges = edges_df

# Networkx DiGraph construction to be visualized in the frontend
nx_graph = nx.DiGraph()
# Colour lookup for the selected nodes: state["selections"] maps a node type
# to a list of selected nodes, each coloured by its type.
node_colors = {n: cfg.node_colors_dict[k]
                for k, v in state["selections"].items() for n in v}
# Add nodes with attributes. Nodes are named by `node_name`, but the colour map
# is keyed like the `node_id` column (it is merged on "node_id" further down in
# this notebook), so the colour is looked up by `node_id`.
subgraph_node_df = graph_nodes.iloc[subgraphs['nodes']][['node_id', 'node_name']].to_arrow().to_pandas()
for node_id, node_name in zip(subgraph_node_df.node_id, subgraph_node_df.node_name):
    nx_graph.add_node(node_name, color=node_colors.get(node_id, None))


# Add edges with attributes
# NOTE(review): `pyg_graph` is not defined in the visible cells, and every one of its
# edges is added here, not only the subgraph's — confirm this is intended
edges = zip(
    pyg_graph.edge_index[0].tolist(),
    pyg_graph.edge_index[1].tolist(),
    pyg_graph.edge_type
)
for src, dst, edge_type in edges:
    nx_graph.add_edge(
        pyg_graph.node_name[src],
        pyg_graph.node_name[dst],
        relation=edge_type,
        label=edge_type,
    )


In [None]:
# Re-bind the working tables to the full node and edge dataframes
graph_nodes = nodes_df
graph_edges = edges_df

In [None]:
# Edge endpoints and types of the full edge table
graph_edges[['head_id', 'tail_id', 'edge_type']]

In [None]:
# Position of each subgraph node index within the subgraph.
# NOTE(review): this rebinds `mapping`, which the PCST cells use for the edge/virtual-node lookup — re-running those cells afterwards needs the PCST input cell first
mapping = {n: i for i, n in enumerate(subgraph["nodes"].tolist())}

In [None]:
# Node rows belonging to the merged subgraphs
graph_nodes.iloc[subgraphs['nodes']][['node_id', 'node_name', 'node_type', 'desc', 'enriched_node']]

In [None]:
# Edge rows belonging to the merged subgraphs
graph_edges.iloc[subgraphs["edges"]]

In [None]:
# NOTE(review): `subgr` is not defined in the visible cells; looks like an unfinished name
subgr

In [None]:
from cugraph.experimental import PropertyGraph

# Select the subgraph rows and the columns to expose as vertex / edge properties
node_columns = ['node_id', 'node_name', 'node_type', 'desc', 'enriched_node']
edge_columns = ['head_id', 'tail_id', 'edge_type']
subgraph_node_df = graph_nodes.iloc[subgraphs['nodes']][node_columns]
subgraph_edge_df = graph_edges.iloc[subgraphs['edges']][edge_columns]

# Create PropertyGraph: vertices keyed by node_id, edges running head_id -> tail_id
pG = PropertyGraph()
pG.add_vertex_data(subgraph_node_df, vertex_col_name="node_id")
pG.add_edge_data(subgraph_edge_df, vertex_col_names=("head_id", "tail_id"))


In [None]:
# Edge table stored in the PropertyGraph
pG.get_edge_data()

In [None]:
# NOTE(review): `row` is not defined at cell level in the visible cells; this looks like a fragment of an itertuples comprehension
(row._0, row._1, {'edge_type': row.edge_type})

In [None]:
# Full node table
graph_nodes

In [None]:
# Colour every selected node by the colour configured for its node type
node_colors = {}
for node_type, selected_nodes in state["selections"].items():
    for selected_node in selected_nodes:
        node_colors[selected_node] = cfg.node_colors_dict[node_type]
node_colors

In [None]:
# Convert the dict to a cudf DataFrame
node_colors = {n: cfg.node_colors_dict[k]
                for k, v in state["selections"].items() for n in v}
color_df = cudf.DataFrame(list(node_colors.items()), columns=["node_id", "color"])

# Merge the color_df with the node table.
# - Start from `nodes_df` (not from `graph_nodes` itself) so re-running the cell
#   does not pile up `color_x` / `color_y` columns.
# - cuDF does not guarantee the row order of a merge result, while other cells
#   index `graph_nodes` positionally via `.iloc[subgraphs["nodes"]]`; a temporary
#   row counter restores the original order afterwards.
graph_nodes = (
    nodes_df
    .assign(_row_order=cp.arange(len(nodes_df)))
    .merge(color_df, on="node_id", how="left")
    .sort_values("_row_order")
    .drop(columns=["_row_order"])
    .reset_index(drop=True)
)


In [None]:
# Subgraph node rows as a list of named tuples (every column of graph_nodes is included)
[x for x in graph_nodes.iloc[subgraphs["nodes"]].to_arrow().to_pandas().itertuples(index=False)]

In [None]:
# Bring the subgraph rows to pandas once, then shape them as
# (node_id, properties) and (head_id, tail_id, properties) tuples
subgraph_node_rows = graph_nodes.iloc[subgraphs["nodes"]].to_arrow().to_pandas()
subgraph_edge_rows = graph_edges.iloc[subgraphs["edges"]].to_arrow().to_pandas()
node_tuples = [
    (row.node_id, {'desc': row.desc, 'enriched_node': row.enriched_node})
    for row in subgraph_node_rows.itertuples(index=False)
]
edge_tuples = [
    (row.head_id, row.tail_id, {'label': tuple(row.edge_type)})
    for row in subgraph_edge_rows.itertuples(index=False)
]
node_tuples


In [None]:
# Textualized graph
# Prepare the textualized subgraph: a CSV of nodes, a blank line, then a CSV of edges.
# Nodes and edges are both taken from the merged `subgraphs` so the two tables
# describe the same subgraph (edges were previously read from the single-query `subgraph`).
graph_text = (
    graph_nodes.iloc[subgraphs["nodes"]][['node_id', 'desc']].rename(columns={'desc': 'node_attr'}).to_arrow().to_pandas().to_csv(index=False)
    + "\n"
    + graph_edges.iloc[subgraphs["edges"]][['head_id', 'edge_type', 'tail_id']].to_arrow().to_pandas().to_csv(index=False)
)
print(graph_text)


In [None]:
# Edge CSV of the merged subgraphs (same `subgraphs` dict the node table uses)
graph_edges.iloc[subgraphs["edges"]][['head_id', 'edge_type', 'tail_id']].to_arrow().to_pandas().to_csv(index=False)

In [None]:
# Get the vertex and edge DataFrames from the PropertyGraph
vertex_df = pG.get_vertex_data()
edge_df = pG.get_edge_data()

# Convert to list of tuples with node ID and a dictionary of properties.
# The `_0`, `_2`, `_3` fields are the positional names itertuples assigns to
# columns whose names are not valid identifiers.
node_tuples = [
    (row._0, {'desc': row.desc, 'enriched_node': row.enriched_node})
    for row in vertex_df.to_arrow().to_pandas().itertuples(index=False)
]
edge_tuples = [
    (row._2, row._3, {'label': tuple(row.edge_type)})
    for row in edge_df.to_arrow().to_pandas().itertuples(index=False)
]
edge_tuples


In [None]:
# Edges of the networkx graph with their attribute dicts
list(nx_graph.edges(data=True))

In [None]:
# Nodes of the networkx graph with their attribute dicts
list(nx_graph.nodes(data=True))

In [None]:
# Networkx DiGraph construction to be visualized in the frontend
# NOTE(review): `cg` is cugraph here, not networkx; confirm cugraph.Graph offers `add_node` — this cell may not run as written
nxcg_graph = cg.Graph(directed=True)
# Add nodes with attributes
# Colour lookup: every selected node mapped to the colour of its node type
node_colors = {n: cfg.node_colors_dict[k]
                for k, v in state["selections"].items() for n in v}
# NOTE(review): nodes are added by `node_name` while `node_colors` is keyed by the entries of state["selections"] — confirm the keys match
for n in graph_nodes.iloc[subgraphs['nodes']][['node_id', 'node_name', 'node_type', 'desc', 'enriched_node']].to_arrow().to_pandas().node_name:
    nxcg_graph.add_node(n, color=node_colors.get(n, None))


In [None]:
# Node rows belonging to the merged subgraphs
graph_nodes.iloc[subgraphs['nodes']][['node_id', 'node_name', 'node_type', 'desc', 'enriched_node']]

In [None]:
my_graph = graph_nodes.iloc[subgraphs['nodes']][['node_id', 'node_name', 'node_type', 'desc', 'enriched_node']]
# NOTE(review): `cg_graph` is not defined in the visible cells
cg_graph.add_nodes_from(my_graph.node_id.to_arrow().to_pylist())

In [None]:
from cugraph.experimental import PropertyGraph
import cudf

# Toy PropertyGraph example.
# The example frames use their own names so the real `graph_nodes` / `graph_edges`
# tables used by the other cells are not overwritten.

# Example node DataFrame with properties
example_nodes = cudf.DataFrame({
    "node_id": ["A", "B", "C"],
    "color": ["red", "blue", "green"],
    "description": ["start", "middle", "end"]
})

# Example edge DataFrame
example_edges = cudf.DataFrame({
    "src": ["A", "B", "C"],
    "dst": ["B", "C", "A"],
    "edge_type": ["link", "link", "loop"],
    "weight": [1.0, 2.0, 3.0]
})

# Create PropertyGraph (note: this rebinds `pG` to the toy graph)
pG = PropertyGraph()
pG.add_vertex_data(example_nodes, vertex_col_name="node_id")
pG.add_edge_data(example_edges, vertex_col_names=("src", "dst"))


In [None]:
# Vertex table of the current PropertyGraph, converted to pandas for display
pG.get_vertex_data().to_arrow().to_pandas()