In [1]:
# Import necessary libraries
import os
import networkx as nx
import openai 
import matplotlib.pyplot as plt
import sys
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import hydra
import numpy as np
# import pandas as pd
import cudf
import pickle
import torch

sys.path.append('../../..')
from aiagents4pharma.talk2knowledgegraphs.utils.extractions.multimodal_pcst import MultimodalPCSTPruning
from aiagents4pharma.talk2knowledgegraphs.utils.embeddings.ollama import EmbeddingWithOllama
from aiagents4pharma.talk2knowledgegraphs.utils.embeddings.sentence_transformer import EmbeddingWithSentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
os.environ["OPENAI_API_KEY"] = "XXX"
# Make sure to replace "your_api_key" with your actual API key.

In [3]:
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [4]:
# Load hydra configuration
with hydra.initialize(version_base=None, config_path="../../../aiagents4pharma/talk2knowledgegraphs/configs"):
    cfg = hydra.compose(
        config_name="config", overrides=["tools/multimodal_subgraph_extraction=default"]
    )
    cfg = cfg.tools.multimodal_subgraph_extraction
cfg

{'_target_': 'talk2knowledgegraphs.tools.multimodal_subgraph_extraction', 'ollama_embeddings': ['nomic-embed-text'], 'temperature': 0.1, 'streaming': False, 'topk': 5, 'topk_e': 5, 'cost_e': 0.5, 'c_const': 0.01, 'root': -1, 'num_clusters': 1, 'pruning': 'gw', 'verbosity_level': 0, 'node_id_column': 'node_id', 'node_attr_column': 'node_attr', 'edge_src_column': 'edge_src', 'edge_attr_column': 'edge_attr', 'edge_dst_column': 'edge_dst', 'node_colors_dict': {'gene/protein': '#6a79f7', 'molecular_function': '#82cafc', 'cellular_component': '#3f9b0b', 'biological_process': '#c5c9c7', 'drug': '#c4a661', 'disease': '#80013f'}}

In [5]:
# Define state
state = {
    "llm_model": ChatOpenAI(model="gpt-4o-mini", temperature=0.0),
    "embedding_model": OpenAIEmbeddings(model="text-embedding-3-small"),
    "selected_genes": [], #["IL6_(1567)", "IL21_(34967)", "TNF_(2329)"],
    "selected_drugs": [], #["Remdesivir_(15267)", "Mesalazine_(15876)"],
    "uploaded_files": [
        {
            "file_name": "multimodal-analysis.csv",
            "file_path": '../../../aiagents4pharma/talk2knowledgegraphs/tests/files/multimodal-analysis.csv',
            "file_type": "multimodal",
            "uploaded_by": "VPEUser",
            "uploaded_timestamp": "2024-11-05 00:00:00",
        },
    ],
    "topk_nodes": 10,
    "topk_edges": 10,
    "dic_source_graph": [
        {
            "name": "PrimeKG",
            "kg_pyg_path": "../../../aiagents4pharma/talk2knowledgegraphs/tests/files/biobridge_multimodal_pyg_graph.pkl",
            "kg_text_path": "../../../aiagents4pharma/talk2knowledgegraphs/tests/files/biobridge_multimodal_text_graph.pkl",
        }
    ],
    "dic_extracted_graph": []
}

# Define prompt
prompt = """
Extract all relevant information related to nodes of genes related to inflammatory bowel disease (IBD) 
that existed in the knowledge graph.

Please set the extraction name for this process as `subkg_12345`.
"""

In [6]:
# Retrieve source graph from the state
initial_graph = {}
initial_graph["source"] = state["dic_source_graph"][-1]  # The last source graph as of now
# logger.log(logging.INFO, "Source graph: %s", source_graph)

# Load the knowledge graph
with open(initial_graph["source"]["kg_pyg_path"], "rb") as f:
    initial_graph["pyg"] = pickle.load(f)
# with open(initial_graph["source"]["kg_text_path"], "rb") as f:
#     initial_graph["text"] = pickle.load(f)

pyg_graph = initial_graph["pyg"]

In [7]:
prompt_emb = [EmbeddingWithOllama(model_name=cfg.ollama_embeddings[0]).embed_query(prompt)]

INFO:httpx:HTTP Request: GET http://127.0.0.1:11434/api/tags "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


In [8]:
# Load the data from the parquet files
local_dir = '../../../aiagents4pharma/talk2knowledgegraphs/tests/files'
nodes_df = cudf.read_parquet(os.path.join(local_dir, 'biobridge_nodes.parquet.gzip'))
edges_df = cudf.read_parquet(os.path.join(local_dir, 'biobridge_edges.parquet.gzip'))

In [9]:
# Initialize dataframes
multimodal_df = cudf.DataFrame({"name": [], "node_type": []})
query_df = cudf.DataFrame({"node_id": [],
                            "node_type": [],
                            "x": [],
                            "desc_x": [],
                            "use_description": []})

# Loop over the uploaded files and find multimodal files
for i in range(len(state["uploaded_files"])):
    # Check if multimodal file is uploaded
    if state["uploaded_files"][i]["file_type"] == "multimodal":
        # Read the Excel file
        multimodal_df = cudf.read_csv(state["uploaded_files"][i]["file_path"])

# Check if the multimodal_df is empty
if len(multimodal_df) > 0:
    # Merge all obtained dataframes into a single dataframe
    multimodal_df.rename(columns={"name": "q_node_name", "node_type": "q_node_type"}, inplace=True)

    # Make and process a query dataframe by merging the graph_df and multimodal_df
    query_df = nodes_df[['node_id', 'node_name', 'node_type', 'enriched_node', 'x', 'desc', 'desc_x']].merge(multimodal_df, how='cross')
    query_df['q_node_name'] = query_df['q_node_name'].str.lower()
    query_df['node_name'] = query_df['node_name'].str.lower()
    # Get the mask for filtering based on the query
    mask = (
        query_df['node_name'].str.contains(query_df['q_node_name']) &
        (query_df['node_type'] == query_df['q_node_type'])
    )
    query_df = query_df[mask]
    query_df = query_df[['node_id', 'node_type', 'enriched_node', 'x', 'desc', 'desc_x']].reset_index(drop=True)
    query_df['use_description'] = False # set to False for modal-specific embeddings

    # Update the state by adding the the selected node IDs
    state["selections"] = query_df.to_pandas().groupby("node_type")["node_id"].apply(list).to_dict()

# Append a user prompt to the query dataframe
query_df = cudf.concat([
    query_df,
    cudf.DataFrame({
        'node_id': 'user_prompt',
        'node_type': 'prompt',
        # 'enriched_node': prompt,
        'x': prompt_emb,
        # 'desc': prompt,
        'desc_x': prompt_emb,
        'use_description': True # set to True for user prompt embedding
    })
]).reset_index(drop=True)


### Before

In [10]:
from torch_geometric.data import Data

topk = state["topk_nodes"]  
topk_e = state["topk_edges"]
c_const = 0.01

def _compute_node_prizes(graph: Data,
                         query_emb: torch.Tensor,
                         modality: str,
                         use_description: bool=False) :
    """
    Compute the node prizes based on the cosine similarity between the query and nodes.

    Args:
        graph: The knowledge graph in PyTorch Geometric Data format.
        query_emb: The query embedding in PyTorch Tensor format. This can be an embedding of
            a prompt, sequence, or any other feature to be used for the subgraph extraction.
        modality: The modality to use for the subgraph extraction based on the node type.

    Returns:
        The prizes of the nodes.
    """
    # Convert PyG graph to a DataFrame
    graph_df = cudf.DataFrame({
        "node_type": graph.node_type,
        "desc_x": [x.tolist() for x in graph.desc_x],
        "x": [list(x) for x in graph.x],
        "score": [0.0 for _ in range(len(graph.node_id))],
    })

    # Calculate cosine similarity for text features and update the score
    if use_description:
        graph_df.loc[:, "score"] = torch.nn.CosineSimilarity(dim=-1)(
                query_emb,
                torch.tensor(list(graph_df.desc_x.values)) # Using textual description features
            ).tolist()
    else:
        graph_df.loc[graph_df["node_type"] == modality,
                        "score"] = torch.nn.CosineSimilarity(dim=-1)(
                query_emb,
                torch.tensor(list(graph_df[graph_df["node_type"]== modality].x.values))
            ).tolist()

    # Set the prizes for nodes based on the similarity scores
    n_prizes = torch.tensor(graph_df.score.values, dtype=torch.float32)
    # n_prizes = torch.nn.CosineSimilarity(dim=-1)(query_emb, graph.x)
    topk = min(topk, graph.num_nodes)
    _, topk_n_indices = torch.topk(n_prizes, topk, largest=True)
    n_prizes = torch.zeros_like(n_prizes)
    n_prizes[topk_n_indices] = torch.arange(topk, 0, -1).float()

    return n_prizes

def _compute_edge_prizes(graph: Data,
                         text_emb: torch.Tensor) :
    """
    Compute the node prizes based on the cosine similarity between the query and nodes.

    Args:
        graph: The knowledge graph in PyTorch Geometric Data format.
        text_emb: The textual description embedding in PyTorch Tensor format.

    Returns:
        The prizes of the nodes.
    """
    # Note that as of now, the edge features are based on textual features
    # Compute prizes for edges
    e_prizes = torch.nn.CosineSimilarity(dim=-1)(text_emb, graph.edge_attr)
    unique_prizes, inverse_indices = e_prizes.unique(return_inverse=True)
    topk_e = min(topk_e, unique_prizes.size(0))
    topk_e_values, _ = torch.topk(unique_prizes, topk_e, largest=True)
    e_prizes[e_prizes < topk_e_values[-1]] = 0.0
    last_topk_e_value = topk_e
    for k in range(topk_e):
        indices = inverse_indices == (
            unique_prizes == topk_e_values[k]
        ).nonzero(as_tuple=True)[0]
        value = min((topk_e - k) / indices.sum().item(), last_topk_e_value)
        e_prizes[indices] = value
        last_topk_e_value = value * (1 - c_const)

    return e_prizes

In [55]:
import pandas as pd

graph = initial_graph["pyg"]
text_emb = torch.tensor(query_df.iloc[0]['desc_x'][0])
query_emb = torch.tensor(query_df.iloc[0]['x'][0])
modality = query_df.iloc[0]['node_type'][0]

# Convert PyG graph to a DataFrame
graph_df = pd.DataFrame({
    "node_type": graph.node_type,
    "desc_x": [x.tolist() for x in graph.desc_x],
    "x": [list(x) for x in graph.x],
    "score": [0.0 for _ in range(len(graph.node_id))],
})

graph_df.loc[graph_df["node_type"] == modality, "score"]  = torch.nn.CosineSimilarity(dim=-1)(
        query_emb,
        torch.tensor(list(graph_df[graph_df["node_type"]== modality].x.values))
    ).tolist()

In [56]:
# Set the prizes for nodes based on the similarity scores
n_prizes = torch.tensor(graph_df.score.values, dtype=torch.float32)
topk = min(topk, graph.num_nodes)
_, topk_n_indices = torch.topk(n_prizes, topk, largest=True)
n_prizes = torch.zeros_like(n_prizes)
n_prizes[topk_n_indices] = torch.arange(topk, 0, -1).float()
n_prizes

tensor([0., 0., 0.,  ..., 0., 0., 0.])

In [57]:
n_prizes[:150]

tensor([ 0.,  0.,  0.,  0.,  0., 10.,  0.,  0.,  0.,  2.,  0.,  0.,  8.,  0.,
         0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  6.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  3.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  7.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [190]:
c_const = cfg.c_const

e_prizes = torch.nn.CosineSimilarity(dim=-1)(torch.tensor(text_emb), torch.tensor([list(f) for f in graph_edges.edge_attr.to_arrow().to_pandas().values]))
unique_prizes, inverse_indices = e_prizes.unique(return_inverse=True)
topk_e = min(topk_e, unique_prizes.size(0))
topk_e_values, _ = torch.topk(unique_prizes, topk_e, largest=True)
e_prizes[e_prizes < topk_e_values[-1]] = 0.0
last_topk_e_value = topk_e
for k in range(topk_e):
    print(k, (unique_prizes == topk_e_values[k]).nonzero())
    indices = inverse_indices == (
        unique_prizes == topk_e_values[k]
    ).nonzero(as_tuple=True)[0]
    value = min((topk_e - k) / indices.sum().item(), last_topk_e_value)
    e_prizes[indices] = value
    last_topk_e_value = value * (1 - c_const)


0 tensor([[11271]])
1 tensor([[11270]])
2 tensor([[11269]])
3 tensor([[11268]])
4 tensor([[11267]])
5 tensor([[11266]])
6 tensor([[11265]])
7 tensor([[11264]])
8 tensor([[11263]])
9 tensor([[11262]])


In [194]:
e_prizes.nonzero()

tensor([[ 920],
        [ 921],
        [1760],
        [1846],
        [3504],
        [4200],
        [4592],
        [4958],
        [5349],
        [5359]])

In [85]:
e_prizes[:10]

tensor([0.5868, 0.6176, 0.6151, 0.6329, 0.6150, 0.5986, 0.5996, 0.6031, 0.5865,
        0.6117], dtype=torch.float64)

### After

In [48]:
def _compute_sim_scores(features_a, features_b, metric="cosine"):
    scores = cuvs.distance.pairwise_distance(features_a, features_b, metric=metric)
    scores = 1 - cp.asarray(scores).ravel()
    return scores

In [156]:
graph_nodes["desc_x"].list.leaves.to_cupy().reshape(-1, len(graph_nodes["desc_x"][0]))

array([[ 2.9749377e-02,  5.3500228e-02, -1.7067130e-01, ...,
        -4.5734297e-02, -7.1615368e-02,  6.2878663e-03],
       [ 2.8421732e-02,  1.9860065e-02, -1.6853006e-01, ...,
         7.2220434e-03, -6.1540674e-02,  1.5545690e-04],
       [ 3.6688470e-03,  5.1380560e-02, -1.3865656e-01, ...,
        -1.2962267e-02, -4.3269057e-02, -2.5594452e-02],
       ...,
       [ 2.8095482e-02,  8.4792199e-03, -1.3168752e-01, ...,
         7.8612315e-03, -3.0251797e-02,  1.1074975e-02],
       [ 3.3906680e-02,  2.0156756e-02, -1.2842590e-01, ...,
         1.8550608e-02, -4.3096650e-02,  7.9433639e-03],
       [ 2.5244711e-03,  7.4240014e-02, -1.0902149e-01, ...,
         4.3608896e-02, -6.0133599e-02,  1.7989967e-02]], dtype=float32)

In [185]:
graph_nodes = nodes_df
topk = 10
use_description = False
# query_emb = torch.tensor(query_df.iloc[0]['x'][0]) # torch.Size([2560])
text_emb = cp.array(query_df.iloc[0]['desc_x'][0]).reshape(1, -1).astype(cp.float32)
query_emb = cp.array(query_df.iloc[0]['x'][0]).reshape(1, -1).astype(cp.float32)

# Initialize variables
sim_scores = cudf.Series(cp.zeros(len(graph_nodes), dtype=cp.float32))
mask = (graph_nodes.node_type == modality)

# Calculate cosine similarity for text features and update the score
if use_description:
    sim_scores = _compute_sim_scores(
        graph_nodes["desc_x"].list.leaves.to_cupy().reshape(-1, len(graph_nodes["desc_x"][0])).astype(cp.float32),
        query_emb
    )  # shape [N, 1]
else:
    sim_scores[mask] = _compute_sim_scores(
        graph_nodes[mask]["x"].list.leaves.to_cupy().reshape(-1, len(graph_nodes[mask]["x"][0])).astype(cp.float32),
        query_emb
    )  # shape [N, 1]

# Set the prizes for nodes based on the similarity scores
n_prizes = torch.tensor(graph_df.score.values, dtype=torch.float32)
topk = min(topk, sim_scores.size)
n_prizes = cudf.Series(0.0, index=cp.arange(sim_scores.size))
n_prizes[(-sim_scores).sort_values()[:topk].index] = cp.arange(topk, 0, -1).astype(cp.float32)
n_prizes = n_prizes.to_cupy()

In [195]:
# sim_scores = cudf.Series(cp.zeros(len(graph_edges), dtype=cp.float32))

e_prizes = _compute_sim_scores(
    graph_edges["edge_attr"].list.leaves.to_cupy().reshape(-1, len(graph_edges["edge_attr"][0])).astype(cp.float32),
    text_emb)

unique_prizes, inverse_indices = cp.unique(e_prizes, return_inverse=True)
topk_e = min(topk_e, sim_scores.size) 
topk_e_values = unique_prizes[cp.argsort(-unique_prizes)[:topk_e]]
e_prizes[e_prizes < topk_e_values[-1]] = 0.0
last_topk_e_value = topk_e
for k in range(topk_e):
    indices = inverse_indices == (unique_prizes == topk_e_values[k]).nonzero()[0]
    value = min((topk_e - k) / indices.sum().item(), last_topk_e_value)
    e_prizes[indices] = value
    last_topk_e_value = value * (1 - c_const)


In [202]:
prizes = {"nodes": n_prizes, "edges": e_prizes}

In [204]:
cost_e = cfg.cost_e

# Logic to reduce the cost of the edges such that at least one edge is selected
updated_cost_e = min(
    cost_e,
    prizes["edges"].max().item() * (1 - c_const / 2),
)

In [227]:
initial_graph["pyg"].edge_index.T.numpy()

array([[   0,  854],
       [   0,  840],
       [   0, 2113],
       ...,
       [2988,   22],
       [2989,   15],
       [2990,   65]])

In [223]:
graph_nodes[graph_nodes.node_id.isin(["nucleus_(56073)"])][["node_name", "node_id"]]

Unnamed: 0,node_name,node_id
2076,nucleus,nucleus_(56073)


In [213]:
graph_edges[['head_id', 'tail_id']]

Unnamed: 0,head_id,tail_id
0,Rose bengal_(14118),LTF_(3233)
1,Fluticasone furoate_(14038),ABCB1_(4152)
2,Technetium Tc-99m tetrofosmin_(14555),ABCB1_(4152)
3,Fluticasone_(14040),ABCB1_(4152)
4,Levothyroxine_(14060),ABCB1_(4152)
...,...,...
11267,negative regulation of peroxidase activity_(51...,LRRK2_(2111)
11268,regulation of kidney size_(52358),LRRK2_(2111)
11269,negative regulation of thioredoxin peroxidase ...,LRRK2_(2111)
11270,cell surface bile acid receptor signaling path...,GPBAR1_(22105)


In [226]:
edge_index.T

array([[  70, 2097],
       [  35, 2097],
       [  38, 2097],
       ...,
       [   9,  721],
       [   9,  707],
       [   9,  714]])

In [225]:
edge_index

array([[  70,   35,   38, ...,    9,    9,    9],
       [2097, 2097, 2097, ...,  721,  707,  714]])

In [221]:
edges

Unnamed: 0,triplet_index,head_id,tail_id,display_relation,edge_type,enriched_edge,edge_attr,head_index
0,7696,nucleus_(56073),PPARA_(1122),interacts with,"[cellular_component, interacts with, gene/prot...",nucleus (cellular_component) has a direct rela...,"[0.041571546, 0.030477023, -0.11966289, -0.013...",2076
1,7697,nucleus_(56073),PPARG_(989),interacts with,"[cellular_component, interacts with, gene/prot...",nucleus (cellular_component) has a direct rela...,"[0.04305989, 0.031666573, -0.123712145, -0.024...",2076
2,7698,nucleus_(56073),RELA_(772),interacts with,"[cellular_component, interacts with, gene/prot...",nucleus (cellular_component) has a direct rela...,"[0.043657485, 0.0103547415, -0.12304908, -0.00...",2076
3,7699,nucleus_(56073),STAT3_(729),interacts with,"[cellular_component, interacts with, gene/prot...",nucleus (cellular_component) has a direct rela...,"[0.037455212, 0.026841054, -0.140134, -0.01149...",2076
4,7700,nucleus_(56073),TCF7_(5195),interacts with,"[cellular_component, interacts with, gene/prot...",nucleus (cellular_component) has a direct rela...,"[0.048932377, 0.03551207, -0.13935399, -0.0187...",2076
...,...,...,...,...,...,...,...,...
11267,126,"(5R,6E,8Z,11Z,14Z,17Z)-5-hydroxyicosa-6,8,11,1...",PPARG_(989),target,"[drug, target, gene/protein]","(5R,6E,8Z,11Z,14Z,17Z)-5-hydroxyicosa-6,8,11,1...","[0.0073565487, 0.0321003, -0.14682753, -5.1667...",709
11268,127,"(8E,10S,12Z)-10-hydroxy-6-oxooctadeca-8,12-die...",PPARG_(989),target,"[drug, target, gene/protein]","(8E,10S,12Z)-10-hydroxy-6-oxooctadeca-8,12-die...","[0.001404367, 0.022235041, -0.14530781, -0.007...",710
11269,117,AMG-131_(16610),PPARG_(989),target,"[drug, target, gene/protein]",AMG-131 (drug) has a direct relationship of dr...,"[0.0015350897, 0.0056261267, -0.1251885, 0.011...",704
11270,122,"(2S)-3-(1-{[2-(2-CHLOROPHENYL)-5-METHYL-1,3-OX...",PPARG_(989),target,"[drug, target, gene/protein]","(2S)-3-(1-{[2-(2-CHLOROPHENYL)-5-METHYL-1,3-OX...","[0.008275107, 0.029084584, -0.13066573, 0.0019...",705


In [233]:
def _create_edge_index(graph_nodes, graph_edges):
    # Create and additional node_index column
    graph_nodes = graph_nodes.reset_index(drop=True)
    graph_nodes['node_index'] = graph_nodes.index

    # Get head_index and tail_index
    edges = graph_edges.merge(graph_nodes[['node_id', 'node_index']],
                            left_on='head_id', right_on='node_id',
                            how='left').rename(columns={'node_index': 'head_index'}).drop(columns=['node_id'])
    edges = edges.merge(graph_nodes[['node_id', 'node_index']],
                        left_on='tail_id', right_on='node_id',
                        how='left').rename(columns={'node_index': 'tail_index'}).drop(columns=['node_id'])

    # Stacking to get into edge_index
    edge_index = cp.stack([
        edges['head_index'].to_cupy(),
        edges['tail_index'].to_cupy()
    ])

    return edge_index

In [252]:
prizes = {"nodes": n_prizes, "edges": e_prizes}

cost_e = cfg.cost_e

# Logic to reduce the cost of the edges such that at least one edge is selected
updated_cost_e = min(
    cost_e,
    prizes["edges"].max().item() * (1 - c_const / 2),
)

# Initialize variables
edges = []
costs = []
virtual = {
    "n_prizes": [],
    "edges": [],
    "costs": [],
}
mapping = {"nodes": {}, "edges": {}}

# Compute the costs, edges, and virtual variables based on the prizes
for i, (src, dst) in enumerate(_create_edge_index(graph_nodes, graph_edges).T):
    prize_e = prizes["edges"][i]
    if prize_e <= updated_cost_e:
        mapping["edges"][len(edges)] = i
        edges.append((src, dst))
        costs.append(updated_cost_e - prize_e)
    else:
        virtual_node_id = graph_nodes.shape[0] + len(virtual["n_prizes"])
        mapping["nodes"][virtual_node_id] = i
        virtual["edges"].append((src, virtual_node_id))
        virtual["edges"].append((virtual_node_id, dst))
        virtual["costs"].append(0)
        virtual["costs"].append(0)
        virtual["n_prizes"].append(prize_e - updated_cost_e)
prizes = cp.concatenate([prizes["nodes"], cp.array(virtual["n_prizes"])])
edges_dict = {}
edges_dict["edges"] = edges
edges_dict["num_prior_edges"] = len(edges)
# Final computation of the costs and edges based on the virtual costs and virtual edges
if len(virtual["costs"]) > 0:
    costs = cp.array(costs + virtual["costs"])
    edges = cp.array(edges + virtual["edges"])
    edges_dict["edges"] = edges

TypeError: Implicit conversion to a NumPy array is not allowed. Please use `.get()` to construct a NumPy array explicitly.

In [256]:
edges

[(array(70), array(2097)),
 (array(35), array(2097)),
 (array(38), array(2097)),
 (array(2), array(2097)),
 (array(3), array(2097)),
 (array(46), array(2097)),
 (array(15), array(2097)),
 (array(36), array(2097)),
 (array(0), array(2097)),
 (array(68), array(2097)),
 (array(8), array(2097)),
 (array(43), array(2097)),
 (array(7), array(2097)),
 (array(31), array(2097)),
 (array(37), array(2097)),
 (array(62), array(2097)),
 (array(2037), array(23)),
 (array(2037), array(837)),
 (array(2037), array(22)),
 (array(2928), array(7)),
 (array(2928), array(837)),
 (array(2946), array(22)),
 (array(2052), array(22)),
 (array(2036), array(60)),
 (array(2966), array(20)),
 (array(2966), array(29)),
 (array(2043), array(22)),
 (array(2094), array(26)),
 (array(2978), array(4)),
 (array(2055), array(4)),
 (array(2054), array(26)),
 (array(2054), array(23)),
 (array(634), array(11)),
 (array(245), array(11)),
 (array(247), array(11)),
 (array(670), array(11)),
 (array(248), array(11)),
 (array(671)

In [254]:
cp.array(costs).shape

(11262,)

In [249]:
len(virtual["costs"])

20

In [246]:
costs 

[array(0.5, dtype=float32),
 array(0.5, dtype=float32),
 array(0.5, dtype=float32),
 array(0.5, dtype=float32),
 array(0.5, dtype=float32),
 array(0.5, dtype=float32),
 array(0.5, dtype=float32),
 array(0.5, dtype=float32),
 array(0.5, dtype=float32),
 array(0.5, dtype=float32),
 array(0.5, dtype=float32),
 array(0.5, dtype=float32),
 array(0.5, dtype=float32),
 array(0.5, dtype=float32),
 array(0.5, dtype=float32),
 array(0.5, dtype=float32),
 array(0.5, dtype=float32),
 array(0.5, dtype=float32),
 array(0.5, dtype=float32),
 array(0.5, dtype=float32),
 array(0.5, dtype=float32),
 array(0.5, dtype=float32),
 array(0.5, dtype=float32),
 array(0.5, dtype=float32),
 array(0.5, dtype=float32),
 array(0.5, dtype=float32),
 array(0.5, dtype=float32),
 array(0.5, dtype=float32),
 array(0.5, dtype=float32),
 array(0.5, dtype=float32),
 array(0.5, dtype=float32),
 array(0.5, dtype=float32),
 array(0.5, dtype=float32),
 array(0.5, dtype=float32),
 array(0.5, dtype=float32),
 array(0.5, dtype=fl

In [242]:
prizes

array([0. , 0. , 0. , ..., 6.5, 1.5, 0.5])

In [238]:
cp.array(virtual["n_prizes"])

array([9.5, 8.5, 5.5, 7.5, 3.5, 2.5, 4.5, 6.5, 1.5, 0.5], dtype=float32)

In [179]:
e_prizes

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [171]:
(unique_prizes == topk_e_values[k]).nonzero()[0]

array([11235])

In [166]:
topk_e_values[-1]

tensor(0.8291)

In [162]:
torch.topk(torch.tensor(unique_prizes), topk_e, largest=True)

torch.return_types.topk(
values=tensor([0.8605, 0.8542, 0.8501, 0.8449, 0.8449, 0.8359, 0.8345, 0.8335, 0.8291,
        0.8291]),
indices=tensor([11235, 11234, 11233, 11232, 11231, 11230, 11229, 11228, 11227, 11226]))

In [164]:
# topk_indices = cp.argsort(-unique_prizes)[:topk_e]  # negate for descending sort
topk_values = unique_prizes[cp.argsort(-unique_prizes)[:topk_e]]

topk_values

array([0.8605221 , 0.854154  , 0.8501374 , 0.84491146, 0.84488064,
       0.8358533 , 0.8345091 , 0.8335018 , 0.82913005, 0.8290656 ],
      dtype=float32)

In [124]:
graph_edges["edge_attr"]

0        [0.071049586, 0.0060329223, -0.17035195, 0.001...
1        [0.025471492, 0.054160915, -0.17022943, -0.018...
2        [-0.008589362, 0.06356438, -0.14342338, -0.003...
3        [0.021936357, 0.05227478, -0.16180754, -0.0218...
4        [0.023618879, 0.018524365, -0.1605938, 0.00940...
                               ...                        
11267    [0.054238185, 0.039940108, -0.11689833, 0.0325...
11268    [0.0384927, 0.091813855, -0.13382651, -0.00920...
11269    [0.02839093, 0.02736938, -0.13177814, 0.027912...
11270    [0.0021018896, 0.041141026, -0.11665152, 0.034...
11271    [0.027027342, 0.040707666, -0.12125126, -0.021...
Name: edge_attr, Length: 11272, dtype: list

In [122]:
e_prizes_[:10]

array([0.58684665, 0.6176255 , 0.6151459 , 0.6328595 , 0.6150071 ,
       0.5986367 , 0.5995761 , 0.60312426, 0.58653235, 0.6116463 ],
      dtype=float32)

In [105]:
# Get unique prizes and their inverse indices
unique_prizes = cudf.Series(e_prizes_).dropna().drop_duplicates().sort(ascending=False).reset_index(drop=True)
inverse_indices = cudf.Series(e_prizes_).map_unique(unique_prizes)

unique_prizes, inverse_indices

AttributeError: 'Series' object has no attribute 'sort'

In [103]:
cudf.Series(e_prizes_).unique()

0        0.586836
1        0.617629
2        0.615144
3        0.632852
4        0.615014
           ...   
11231    0.585824
11232    0.629816
11233    0.598749
11234    0.643440
11235    0.587226
Length: 11236, dtype: float32

In [100]:
cp.unique(e_prizes_, return_inverse=True)

(array([0.5285736, 0.5327324, 0.5391505, ..., 0.8501374, 0.854154 ,
        0.8605221], dtype=float32),
 array([ 827, 4846, 4486, ..., 2096, 7737,  861]))

In [96]:
unique_prizes, inverse_indices

(tensor([0.5286, 0.5327, 0.5392,  ..., 0.8501, 0.8542, 0.8605],
        dtype=torch.float64),
 tensor([ 828, 4863, 4500,  ..., 2102, 7768,  862]))

11272

In [90]:
unique_prizes.size(0)

11272

In [86]:
e_prizes[:10]

tensor([0.5868, 0.6176, 0.6151, 0.6329, 0.6150, 0.5986, 0.5996, 0.6031, 0.5865,
        0.6117], dtype=torch.float64)

In [89]:
unique_prizes, inverse_indices = e_prizes.unique(return_inverse=True)
unique_prizes, inverse_indices

(tensor([0.5286, 0.5327, 0.5392,  ..., 0.8501, 0.8542, 0.8605],
        dtype=torch.float64),
 tensor([ 828, 4863, 4500,  ..., 2102, 7768,  862]))

In [87]:
e_prizes_[:10]

array([0.58683634, 0.61762935, 0.6151438 , 0.63285184, 0.61501414,
       0.5986267 , 0.5995772 , 0.60313183, 0.58654726, 0.6116637 ],
      dtype=float32)

In [13]:
import cudf
import cupy as cp
import cuvs
from cuvs.distance import pairwise_distance

# Initialize several variables
sim_scores = cudf.Series(cp.zeros(len(nodes_df), dtype=cp.float32))
text_emb = torch.tensor(query_df.iloc[0]['desc_x'][0]) # torch.Size([768])
query_emb = torch.tensor(query_df.iloc[0]['x'][0]) # torch.Size([2560])
modality = query_df.iloc[0]['node_type'][0] # `gene/protein`

# Compute cosine distance and similarity
mask = (nodes_df.node_type == modality)
cosine_distance = pairwise_distance(cp.array(nodes_df[mask]["x"].to_arrow().to_pylist()).astype(cp.float32), 
                                    cp.array(query_emb.cpu().numpy()).reshape(1, -1).astype(cp.float32), 
                                    metric="cosine")  # shape [N, 1]
cosine_similarity = 1 - cp.asarray(cosine_distance).ravel()

# Store scores in the graph_df
sim_scores[mask] = cosine_similarity

In [32]:
cosine_similarity

array([0.97628474, 0.8985084 , 0.91741973, 0.9744791 , 0.9627971 ,
       1.0000006 , 0.96820575, 0.8944925 , 0.94562733, 0.9785065 ,
       0.97556895, 0.97704417, 0.98955095, 0.9467787 , 0.9746277 ,
       0.9782953 , 0.9550142 , 0.9491312 , 0.9711893 , 0.9494204 ,
       0.9682751 , 0.93437517, 0.9807807 , 0.9611754 , 0.94604367,
       0.951892  , 0.93231875, 0.9732437 , 0.80831623, 0.90658796,
       0.9759815 , 0.9287918 , 0.9248804 , 0.96580416, 0.95270985,
       0.7955291 , 0.9495744 , 0.97411007, 0.9529466 , 0.9686608 ,
       0.9751882 , 0.96898425, 0.97170717, 0.9425371 , 0.9780081 ,
       0.97575784, 0.95195335, 0.95414084, 0.93240297, 0.94446564,
       0.9207041 , 0.9587114 , 0.9603107 , 0.9509289 , 0.9755834 ,
       0.94682467, 0.92513883, 0.96698356, 0.91640013, 0.97954285,
       0.9578175 , 0.9454702 , 0.97647333, 0.9128767 , 0.96761906,
       0.9683284 , 0.9735785 , 0.94610745, 0.9410495 , 0.9675142 ,
       0.9416015 , 0.9739566 , 0.9711153 , 0.9286345 , 0.96724

In [34]:
topk = min(topk, sim_scores.size)
n_prizes = cudf.Series(0.0, index=cp.arange(sim_scores.size))
n_prizes[(-sim_scores).sort_values()[:topk].index] = cp.arange(topk, 0, -1).astype(cp.float32)

In [38]:
n_prizes.to_arrow().to_pylist()

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 10.0,
 0.0,
 0.0,
 0.0,
 2.0,
 0.0,
 0.0,
 8.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 6.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 3.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 7.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.

In [31]:
n_prizes.to_arrow().to_pylist()

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 10.0,
 0.0,
 0.0,
 0.0,
 2.0,
 0.0,
 0.0,
 8.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 6.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 3.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 7.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.

In [27]:
[i==j for i, j in zip(n_prizes, n_prizes_)]

AttributeError: 'Tensor' object has no attribute 'to_arrow'

In [14]:
sim_scores.sort_values(ascending=False).index[:10]

INFO:numba.cuda.cudadrv.driver:init


Index([5, 845, 12, 82, 22, 849, 850, 59, 9, 15], dtype='int64')

In [15]:
topk_n_indices

tensor([  5, 845,  12,  82,  22, 849, 850,  59,   9,  15])

In [None]:
sim_scores.sort_values()

In [None]:
cp.arange(topk, 0, -1)

In [24]:
torch.tensor(n_prizes_)[:150]

tensor([ 0.,  0.,  0.,  0.,  0., 10.,  0.,  0.,  0.,  2.,  0.,  0.,  8.,  0.,
         0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  6.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  3.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  7.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.], dtype=torch.float64)

In [None]:
sim_scores[sim_scores.isna()]

In [None]:
a, topk_n_indices = torch.topk(n_prizes, topk, largest=True)

In [None]:
a

In [None]:
topk_n_indices

In [None]:
topk_n_indices

In [None]:
(-sim_scores).sort_values()[:topk].index

In [None]:
sim_scores[5, 94, 12, 82, 22, 98, 99, 59,  9, 15]

In [None]:
a = torch.tensor(n_prizes_.to_arrow().to_pylist()).float()
a[:100]

In [None]:
sim_scores.to_arrow().to_pylist()

In [16]:
# Set the prizes for nodes based on the similarity scores
n_prizes = torch.tensor(scores, dtype=torch.float32)
topk = min(topk, graph.num_nodes)
_, topk_n_indices = torch.topk(n_prizes, topk, largest=True)
# n_prizes = torch.zeros_like(n_prizes)
# n_prizes[topk_n_indices] = torch.arange(topk, 0, -1).float()

NameError: name 'scores' is not defined

In [None]:
min(topk, graph.num_nodes)

In [None]:
scores_cp = cp.asarray(scores, dtype=cp.float32)
topk = min(topk, nodes_df.shape[0])
cuvs.selection.select_k(scores_cp, k=topk, select_type='kth_largest')

In [None]:
n_prizes

In [None]:
n_prizes

In [None]:
scores

In [None]:
sim_scores[:10]

In [None]:

node_mask = (graph.node_type == modality)
len(graph.x[node_mask])

In [None]:
cudf.Series(initial_graph["pyg"].node_type) == 'gene/protein'

In [None]:
scores[:10], scores_new[:10]

In [None]:
graph_df.loc[graph_df["node_type"] == modality,
                "score"] = torch.nn.CosineSimilarity(dim=-1)(
        query_emb,
        torch.tensor(list(graph_df[graph_df["node_type"]== modality].x.values))
    ).tolist()

In [None]:
graph = initial_graph["pyg"]
text_emb = torch.tensor(query_df.iloc[0]['desc_x'][0])
query_emb = torch.tensor(query_df.iloc[0]['x'][0])
modality = query_df.iloc[0]['node_type'][0]

# Compute prizes for nodes
n_prizes = _compute_node_prizes(graph, query_emb, modality)

# Compute prizes for edges
e_prizes = _compute_edge_prizes(graph, text_emb)

In [None]:
# Initialize the subgraph dictionary
subgraphs = {}
subgraphs["nodes"] = []
subgraphs["edges"] = []

# Loop over query embeddings and modalities
for q in query_df.to_pandas().iterrows():
    # Prepare the PCSTPruning object and extract the subgraph
    # Parameters were set in the configuration file obtained from Hydra
    subgraph = MultimodalPCSTPruning(
        topk=state["topk_nodes"],
        topk_e=state["topk_edges"],
        cost_e=cfg.cost_e,
        c_const=cfg.c_const,
        root=cfg.root,
        num_clusters=cfg.num_clusters,
        pruning=cfg.pruning,
        verbosity_level=cfg.verbosity_level,
        use_description=q[1]['use_description'],
    ).extract_subgraph(pyg_graph,
                        torch.tensor(q[1]['desc_x']), # description embedding
                        torch.tensor(q[1]['x']), # modal-specific embedding
                        q[1]['node_type'])

    # Append the extracted subgraph to the dictionary
    subgraphs["nodes"].append(subgraph["nodes"].tolist())
    subgraphs["edges"].append(subgraph["edges"].tolist())

# Concatenate and get unique node and edge indices
subgraphs["nodes"] = np.unique(
    np.concatenate([np.array(list_) for list_ in subgraphs["nodes"]])
)
subgraphs["edges"] = np.unique(
    np.concatenate([np.array(list_) for list_ in subgraphs["edges"]])
)

In [None]:
# Before optimiziation
before_ = subgraphs
before_