In [1]:
import os
import glob
import cudf
import dask_cudf
import hydra

In [2]:
# Load hydra configuration
with hydra.initialize(version_base=None, config_path="../../../aiagents4pharma/talk2knowledgegraphs/configs"):
    cfg = hydra.compose(
        config_name="config", overrides=["tools/multimodal_subgraph_extraction=default"]
    )
    cfg = cfg.tools.multimodal_subgraph_extraction
cfg

{'_target_': 'talk2knowledgegraphs.tools.multimodal_subgraph_extraction', 'ollama_embeddings': ['nomic-embed-text'], 'temperature': 0.1, 'streaming': False, 'topk': 5, 'topk_e': 5, 'cost_e': 0.5, 'c_const': 0.01, 'root': -1, 'num_clusters': 1, 'pruning': 'gw', 'verbosity_level': 0, 'node_id_column': 'node_id', 'node_attr_column': 'node_attr', 'edge_src_column': 'edge_src', 'edge_attr_column': 'edge_attr', 'edge_dst_column': 'edge_dst', 'node_colors_dict': {'gene/protein': '#6a79f7', 'molecular_function': '#82cafc', 'cellular_component': '#3f9b0b', 'biological_process': '#c5c9c7', 'drug': '#c4a661', 'disease': '#80013f'}, 'biobridge': {'source': 'aiagents4pharma/talk2knowledgegraphs/tests/files/biobridge_multimodal/', 'node_type': ['gene/protein', 'molecular_function', 'cellular_component', 'biological_process', 'drug', 'disease']}}

In [192]:
# Loop over nodes and edges
graph_dict = {}
for element in ["nodes", "edges"]:
    # Make an empty dictionary for each folder
    graph_dict[element] = {}
    for stage in ["enrichment", "embedding"]:
        # print(key, path, stage)
        # Create the file pattern for the current subfolder
        file_list = glob.glob(os.path.join('../../../', 
                                           cfg.biobridge.source, 
                                           element, 
                                           stage, '*.parquet.gzip'))
        # if element != "edges" and stage == "embedding":
        # Read and concatenate all dataframes in the folder
        graph_dict[element][stage] = cudf.concat([cudf.read_parquet(f) for f in file_list], ignore_index=True)

In [4]:
import cugraph_pyg

  from .autonotebook import tqdm as notebook_tqdm
  register_pytree_node(
  register_pytree_node(


In [239]:
unified_nodes_df = graph_dict["nodes"]["enrichment"].merge(
    graph_dict["nodes"]["embedding"],
    on=["node_id"],
    how="left"
)
unified_nodes_df.head(5)

Unnamed: 0,node_index,node_id,node_name,node_type,desc,feat,desc_emb,feat_emb
0,74881,GMP binding_(117117),GMP binding,molecular_function,GMP binding belongs to molecular_function node.,"Binding to GMP, guanosine monophosphate.","[-0.0013809444, 0.045085534, -0.14073932, -0.0...","[-0.006424577, 0.037433047, -0.11737241, -0.00..."
1,74882,dATP binding_(117118),dATP binding,molecular_function,dATP binding belongs to molecular_function node.,"Binding to dATP, deoxyadenosine triphosphate.","[0.006549959, 0.020447463, -0.14197575, -0.045...","[-0.014702543, 0.028529385, -0.12821108, -0.06..."
2,74883,GDP binding_(117119),GDP binding,molecular_function,GDP binding belongs to molecular_function node.,"Binding to GDP, guanosine 5'-diphosphate.","[0.011018618, 0.057139404, -0.16011557, -0.027...","[0.008387973, 0.0810471, -0.15771577, -0.03594..."
3,74884,cyclic-GMP-AMP binding_(117120),cyclic-GMP-AMP binding,molecular_function,cyclic-GMP-AMP binding belongs to molecular_fu...,"Binding to 2',3' cyclic GMP-AMP (cGAMP) nucleo...","[-0.011152258, 0.01837751, -0.15741584, 0.0022...","[-0.050510645, 0.046024974, -0.16424122, -0.04..."
4,74885,dGMP binding_(117121),dGMP binding,molecular_function,dGMP binding belongs to molecular_function node.,"Binding to dGMP, deoxyguanosine monophosphate.","[0.0146955475, 0.04383626, -0.14428326, -0.028...","[0.00027230568, 0.0482017, -0.122989565, -0.03..."


In [241]:
unified_edges_df = graph_dict["edges"]["enrichment"].merge(
    graph_dict["edges"]["embedding"],
    on=["edge_type_str"],
    how="cross"
)

MemoryError: std::bad_alloc: out_of_memory: CUDA error (failed to allocate 1779554560 bytes) at: /pyenv/versions/3.12.9/lib/python3.12/site-packages/librmm/include/rmm/mr/device/cuda_memory_resource.hpp:62: cudaErrorMemoryAllocation out of memory

In [None]:
import torch
from torch_geometric.data import TensorAttr
from cugraph_pyg.data import CuGraphStore, TensorDictFeatureStore

# Create and initialize FeatureStore for each column that can be used as a feature
mapper = {}
fs = TensorDictFeatureStore()
for nt in graph_dict['nodes']['enrichment'].node_type.unique().to_arrow().to_pylist():
    print(f"Node type: {nt}")
    nodes_df = graph_dict['nodes']['enrichment'][graph_dict['nodes']['enrichment'].node_type == nt]
    print(f"Number of nodes: {len(nodes_df)}")

    # Node Index
    node_index = nodes_df["node_index"].to_arrow().to_pylist()
    mapper[nt] = {}
    mapper[nt]['to_node_index'] = {i: node_index[i] for i in range(len(node_index))}
    mapper[nt]['from_node_index'] = {node_index[i]: i for i in range(len(node_index))}
    fs[TensorAttr(group_name=nt, 
                    attr_name="node_index")] = torch.tensor(node_index, dtype=torch.int64)
    
    # Node Features
    nodes_emb_df = graph_dict['nodes']['embedding'][graph_dict['nodes']['embedding'].node_id.isin(nodes_df.node_id)]
    fs[TensorAttr(group_name=nt,
                  attr_name="desc_emb")] = torch.tensor(nodes_emb_df["desc_emb"].to_arrow().to_pylist(), dtype=torch.float32)
    fs[TensorAttr(group_name=nt,
                  attr_name="feat_emb")] = torch.tensor(nodes_emb_df["feat_emb"].to_arrow().to_pylist(), dtype=torch.float32)


Node type: molecular_function
Number of nodes: 10951
===
Node type: disease
Number of nodes: 17054
===
Node type: drug
Number of nodes: 6759
===
Node type: biological_process
Number of nodes: 27409
===
Node type: cellular_component
Number of nodes: 4011
===
Node type: gene/protein
Number of nodes: 18797
===


In [117]:
# Get nodes enrichment and embedding dataframes
enrichment_df = graph_dict['nodes']['enrichment']
embedding_df = graph_dict['nodes']['embedding']

# Convert node_id columns once to set
embedding_node_ids = set(embedding_df['node_id'].to_pandas())


In [None]:
import torch
import cudf
from torch_geometric.data import TensorAttr
from cugraph_pyg.data import TensorDictFeatureStore

# Initialize FeatureStore and mapper
fs = TensorDictFeatureStore()
mapper = {}

# Get nodes enrichment and embedding dataframes
nodes_enrichment_df = graph_dict['nodes']['enrichment']
nodes_embedding_df = graph_dict['nodes']['embedding']

# Get node_ids
embedding_node_ids = set(nodes_embedding_df['node_id'].to_pandas())

# Loop over group enrichment nodes by type 
for nt, nodes_df in nodes_enrichment_df.groupby('node_type'):
    print(f"Node type: {nt}")
    node_count = len(nodes_df)
    print(f"Number of nodes: {node_count}")

    # Get node_index as torch tensor directly
    node_index_tensor = torch.tensor(nodes_df["node_index"].to_numpy(), dtype=torch.int64)
    fs[TensorAttr(group_name=nt, attr_name="node_index")] = node_index_tensor

    # Construct mapper for node_index
    node_index_list = node_index_tensor.tolist()
    mapper[nt] = {
        'to_node_index': dict(enumerate(node_index_list)),
        'from_node_index': {v: i for i, v in enumerate(node_index_list)}
    }

    # Filter embeddings once by node_id
    mask = nodes_embedding_df['node_id'].isin(set(nodes_df['node_id'].to_pandas()))
    nodes_emb_df = nodes_embedding_df[mask]

    # Convert embeddings as tensors and add to FeatureStore
    for attr_name in ["desc_emb", "feat_emb"]:
        emb_tensor = torch.tensor(nodes_emb_df[attr_name].to_arrow().to_pylist(), dtype=torch.float32)
        fs[TensorAttr(group_name=nt, attr_name=attr_name)] = emb_tensor




Node type: biological_process
Number of nodes: 27409




Node type: cellular_component
Number of nodes: 4011




Node type: disease
Number of nodes: 17054




Node type: drug
Number of nodes: 6759




Node type: gene/protein
Number of nodes: 18797




Node type: molecular_function
Number of nodes: 10951




In [236]:
import torch
import cudf
from torch_geometric.data import TensorAttr
from cugraph_pyg.data import GraphStore, TensorDictFeatureStore
# cugraph_pyg.data.GraphStore()
# Initialize FeatureStore and mapper
fs = TensorDictFeatureStore()
mapper = {}

# Get nodes enrichment and embedding dataframes
nodes_enrichment_df = graph_dict['nodes']['enrichment']
nodes_embedding_df = graph_dict['nodes']['embedding']

# Loop over group enrichment nodes by type 
for nt, nodes_df in nodes_enrichment_df.groupby('node_type'):
    print(f"Node type: {nt}")
    node_count = len(nodes_df)
    print(f"Number of nodes: {node_count}")


    # Get node_ids
    emb_df = nodes_embedding_df[nodes_embedding_df['node_id'].isin(nodes_df['node_id'])]

    # Sort both by node_id for alignment
    nodes_df_sorted = nodes_df.sort_values('node_id')
    emb_df_sorted = emb_df.sort_values('node_id')

    # Ensure sorted node_ids match
    assert cudf.Series.equals(nodes_df_sorted['node_id'].reset_index(drop=True),
                              emb_df_sorted['node_id'].reset_index(drop=True)), \
                                f"Node ID mismatch in {nt} after sorting"

    # Get node_index as torch tensor directly
    node_index_tensor = torch.tensor(nodes_df_sorted["node_index"].to_numpy(), dtype=torch.int64)
    fs[TensorAttr(group_name=nt, attr_name="node_index")] = node_index_tensor

    # # Construct mapper for node_index
    node_index_list = nodes_df_sorted["node_index"].to_numpy().tolist()
    mapper[nt] = {
        'to_node_index': dict(enumerate(node_index_list)),
        'from_node_index': {v: i for i, v in enumerate(node_index_list)}
    }

    # Convert embeddings as tensors and add to FeatureStore
    for attr_name in ["desc_emb", "feat_emb"]:
        emb_tensor = torch.tensor(emb_df_sorted[attr_name].to_arrow().to_pylist(), dtype=torch.float32)
        fs[TensorAttr(group_name=nt, attr_name=attr_name)] = emb_tensor




Node type: biological_process
Number of nodes: 27409




Node type: cellular_component
Number of nodes: 4011




Node type: disease
Number of nodes: 17054




Node type: drug
Number of nodes: 6759




Node type: gene/protein
Number of nodes: 18797




Node type: molecular_function
Number of nodes: 10951




In [237]:
# Initialize GraphStore
gs = GraphStore()

# Get edges enrichment and embedding dataframes
edges_enrichment_df = graph_dict['edges']['enrichment']
edges_embedding_df = graph_dict['edges']['embedding']

# Loop over edge types 
for edge_type_str in edges_enrichment_df['edge_type_str'].unique().to_arrow().to_pylist():
    src_type, rel_type, tgt_type = edge_type_str.split('|')

    # Filter edges for this edge_type_str once
    filtered_df = edges_enrichment_df[edges_enrichment_df['edge_type_str'] == edge_type_str][['head_index', 'tail_index']]

    # Convert mapper dicts to cudf Series for vectorized mapping
    src_map = cudf.Series(mapper[src_type]['from_node_index'])
    tgt_map = cudf.Series(mapper[tgt_type]['from_node_index'])

    # Vectorized mapping of head_index and tail_index using replace (works like dict lookup)
    mapped_head = filtered_df['head_index'].replace(src_map).astype('int64')
    mapped_tail = filtered_df['tail_index'].replace(tgt_map).astype('int64')

    # Check if mapping was successful
    if mapped_head.isnull().any() or mapped_tail.isnull().any():
            raise ValueError(f"Mapping failure for edge type {edge_type_str}")

    # Edge index
    edge_index = torch.tensor(
        cudf.concat([mapped_head, mapped_tail], axis=1).to_pandas().values.T,
        dtype=torch.long
    ).contiguous()

    # Store edge index in the GraphStore
    gs[(src_type, rel_type, tgt_type), "coo"] = edge_index


    # Store edge embeddings in the FeatureStore
    edge_emb_df = edges_embedding_df[edges_embedding_df['edge_type_str'] == edge_type_str]

    # Convert edge embeddings to torch tensor
    edge_emb_tensor = torch.tensor(edge_emb_df['edge_emb'].to_arrow().to_pylist(), dtype=torch.float32)
    fs[TensorAttr(group_name=(src_type, rel_type, tgt_type), attr_name='edge_emb')] = edge_emb_tensor
