Load environments

In [1]:
import sys
sys.path.append('/home/svu/e0315913/.local/lib/python3.8/site-packages')
sys.path.append('/home/svu/e0315913/.local/bin')
sys.path.append("/hpctmp/e0315913/demo/CS5284_Project/GNN-cluster/config/demo_config.yaml")

import os
os.chdir('/hpctmp/e0315913/demo/CS5284_Project/GNN-cluster/')

Import libraries

In [2]:
import random, torch
import numpy as np
from torch.utils.data import DataLoader, Subset
from tqdm import tqdm

In [3]:
from src.utils.config import load_config, validate_config
from src.utils.evaluation import evaluate
from src.models.alpha import FullOutput, Metrics, threshold_based_candidates, calculate_avg_metrics
from src.my_datasets.kgqa_dataset import KGQADataset
from src.my_datasets.data_utils import collate_fn
from src.models.rgcn_model import RGCNModel

  warn(f"Failed to load image Python extension: {e}")


In [4]:
# from src.RAG.kgqa_extractor import extract_subgraph_qemb, load_all_metadata, load_subgraph_data, save_all_to_file, save_subg_qemb_file

In [5]:
from src.models.alpha import Output

Set config and device

In [6]:
CONFIG_PATH = "config/demo_config.yaml"

config = load_config(CONFIG_PATH)
required_keys = [
    'model','train', 'node_embed', 'idxes',
    'train_qa_data', 'test_qa_data', 'num_hops',
]
validate_config(config, required_keys)

In [7]:
torch.manual_seed(2024)
random.seed(2024)
np.random.seed(2024)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [8]:
config['train']['start_idx']

86

Load Data

In [9]:
import json, random, torch
import pandas as pd

from torch_geometric.utils import k_hop_subgraph, subgraph
from torch_geometric.utils.convert import from_networkx
from torch_geometric.data import Data
import networkx as nx

from sentence_transformers import util, SentenceTransformer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
custom_folder = '../hf_model' ### you might need to modify this
model = SentenceTransformer("/hpctmp/e0315913/CS5284_Project/GNN-cluster/src/models/models--sentence-transformers--multi-qa-MiniLM-L6-cos-v1/snapshots/2d981ed0b0b8591b038d472b10c38b96016aab2e")
# model = SentenceTransformer("sentence-transformers/multi-qa-MiniLM-L6-cos-v1")
model.to(device)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

# KGQADataset

In [102]:
class KGQADataset(torch.utils.data.Dataset):
    def __init__(self, path_to_node_embed, path_to_idxes, path_to_qa, path_to_kb, from_paths_activate, entity_sbert, k=3):
        """
        Initialize without precomputed subgraphs. Computes k-hop subgraphs on-the-fly.
        """
        self.from_paths_activate = from_paths_activate
        self.entity_sbert = entity_sbert

        if self.from_paths_activate:
            # Load the main graph data
            self.G = self.generate_nx_graph(path_to_kb)

        # Load the main graph data
        self.loaded_entity_to_idx, self.loaded_edge_index, self.loaded_relations = self.load_data_json(path_to_idxes)
        self.data = self.create_data_object(self.loaded_edge_index, self.loaded_relations, self.loaded_entity_to_idx)

        # Store the global number of unique relations
        self.num_relations = len(set(self.loaded_relations))
        self.k = k

        # Load node2vec and sbert embeddings
        self.entity_sbert_embeddings = self.get_entity_sbert_embeddings(self.loaded_entity_to_idx)
        self.node2vec_embeddings = self.load_node2vec_embeddings(path_to_node_embed)

        # Load question and answer data
        self.df = pd.read_csv(path_to_qa, sep='\t', header=None, names=['question', 'answer'])
        self.df['answer'] = self.df['answer'].apply(lambda x: x.split("|"))

        # Load sentence embeddings
        self.q_embeddings = model.encode(
            [q.replace("[", "").replace("]", "") for q in self.df['question']],
            batch_size=128,
            convert_to_tensor=True
        )

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Get the question and answers from the DataFrame
        row = self.df.iloc[idx]
        question, answers = row['question'], row['answer']

        # Step 1: Extract the entity from the question (entity marked in square brackets)
        entity = self.extract_entity_from_question(question)
        entity_node = self.loaded_entity_to_idx[entity]

        # Step 2: Get the question embedding
        question_embedding = self.q_embeddings[idx]

        # Step 3: Compute the k-hop subgraph around the entity dynamically
        subset_node_indices, sub_edge_index, _, edge_mask = self.get_k_hop_subgraph(entity_node)

        # Step 4: Construct the subgraph based on these subset indices
        subgraph_data, node_map = self.construct_subgraph(subset_node_indices, sub_edge_index, edge_mask)

        # Step 5: Get the labels
        labels = self.get_labels(answers, node_map)

        # Step 6: Add node2vec embeddings to the subgraph data
        embedding_dim = 384 if (self.from_paths_activate or self.entity_sbert) else 64
        subgraph_data.x = self.get_node_embeddings(node_map, question_embedding, entity, embedding_dim)

        return subgraph_data, question_embedding, labels, node_map

    def get_entity_sbert_embeddings(self, loaded_entity_to_idx):
        # entity names as keys
        entities = list(loaded_entity_to_idx.keys())
        encoded_values = model.encode(entities, batch_size=128, convert_to_tensor=False).tolist()
        out = {e: encoded_values[i] for i, e in enumerate(entities)}
        return out

    def get_k_hop_subgraph(self, entity_node):
        """
        Compute the k-hop subgraph dynamically for a given entity node.
        """
        subset, sub_edge_index, mapping, edge_mask = k_hop_subgraph(
            node_idx=entity_node,
            num_hops=self.k,
            edge_index=self.data.edge_index,
            relabel_nodes=True
        )
        return subset, sub_edge_index, mapping, edge_mask

    def construct_subgraph(self, subset_node_indices, sub_edge_index, edge_mask):
        """
        Construct a subgraph Data object for the given subset of nodes and edges.
        """
        node_map = {old_idx.item(): new_idx for new_idx, old_idx in enumerate(subset_node_indices)}

        # # Create subgraph data object
        sub_edge_attr = self.data.edge_attr[edge_mask]
        subgraph_data = Data(edge_index=sub_edge_index, edge_attr=sub_edge_attr) # sub_edge_index is get_k_hop_subgraph's sub_edge_index
        return subgraph_data, node_map

    def load_data_json(self, filename):
        with open(filename, 'r') as f:
            data = json.load(f)
        return data['entity_to_idx'], data['edge_index'], data['relations']

    def load_data_pt(self, filename):
        """
        Load the list of subsets (k-hop subgraph node indices) from the .pt file.
        """
        data = torch.load(filename)
        if isinstance(data, list):
            return data  # Return the list of subsets
        else:
            raise ValueError("Expected a list of k-hop subgraph node indices.")

    def create_data_object(self, edge_index, relations, entity_to_idx):
        unique_relations = list(set(relations))
        relation_mapping = {relation: index for index, relation in enumerate(unique_relations)}

        edge_index = torch.tensor(edge_index).t().contiguous()
        # Make the graph undirected by adding reverse edges
        undirected_edge_index = torch.cat([edge_index, edge_index.flip(0)], dim=1) # comment out if want directed
        edge_attr = torch.tensor([relation_mapping[rel] for rel in relations])
        # Since we now have more edges (two for each undirected edge), we concat them
        undirected_edge_attr = torch.cat([edge_attr, edge_attr], dim=0) # comment out if want directed

        # Data(edge_index=undirected_edge_index, edge_attr=undirected_edge_attr, num_nodes=len(entity_to_idx))
        # Data(edge_index=edge_index, edge_attr=edge_attr, num_nodes=len(entity_to_idx))
        return Data(edge_index=undirected_edge_index, edge_attr=undirected_edge_attr, num_nodes=len(entity_to_idx))

    def load_node2vec_embeddings(self, file_path, embedding_dim=64):
        embeddings_dict = {}

        with open(file_path, 'r') as f:
            # Skip the first row
            next(f)

            for line in f:
                parts = line.strip().split()

                # The entity is everything before the embedding, so we use -embedding_dim
                entity = " ".join(parts[:-embedding_dim])  # Join all words before the embedding dimensions
                embedding = list(map(float, parts[-embedding_dim:]))  # Convert last parts to float

                # Store in the dictionary
                embeddings_dict[entity] = embedding

        return embeddings_dict

    def extract_entity_from_question(self, question):
        """
        Extract the entity that is enclosed in square brackets from the question.
        Example: "What city is [Paris] the capital of?" -> "Paris"
        """
        # assumes one entity of interest in each questin
        start = question.find('[') + 1
        end = question.find(']')
        if start == 0 or end == -1:
            raise ValueError(f"No entity found in the question: {question}")
        return question[start:end]

    # def get_k_hop_subgraph(self, node_idx):
    #     """
    #     Get the k-hop subgraph centered around the given node index.
    #     node_idx (int): Index of the node representing the entity.
    #     """
    #     # Extract k-hop subgraph from the full graph
    #     node_idx = torch.tensor([node_idx], dtype=torch.long)
    #     subset, sub_edge_index, _, _ = k_hop_subgraph(
    #         node_idx=node_idx,
    #         num_hops=self.k,
    #         edge_index=self.data.edge_index,
    #         relabel_nodes=True
    #     )

    #     # Create a subgraph Data object
    #     # subgraph = Data(x=self.data.x[subset], edge_index=sub_edge_index)
    #     subgraph = Data(edge_index=sub_edge_index)

    #     # Create a mapping from original node indices to subgraph indices
    #     node_map = {original_idx.item(): new_idx for new_idx, original_idx in enumerate(subset)}

    #     return subgraph, node_map

    def get_labels(self, answers, node_map):
        labels = torch.zeros(len(node_map), dtype=torch.long)
        for ans in answers:
            if ans in self.loaded_entity_to_idx:
                ans_idx = self.loaded_entity_to_idx[ans]
                if ans_idx in node_map:
                    labels[node_map[ans_idx]] = 1
        return labels

    # def get_node_embeddings(self, node_map):
    #     embeddings = [[0.0] * len(self.node2vec_embeddings[next(iter(self.node2vec_embeddings))])]*len(node_map)
    #     idx_to_entity = {v: k for k, v in self.loaded_entity_to_idx.items()}

    #     for ori, new in node_map.items():
    #         if idx_to_entity[ori] in self.node2vec_embeddings:
    #             embeddings[new] = self.node2vec_embeddings[idx_to_entity[ori]]
    #     return torch.tensor(embeddings, dtype=torch.float)

    def get_node_embeddings(self, node_map, question_embedding, entity, embedding_dim, random_init=False):
        """
        Get node embeddings from node2vec. If node not found or random_init is True, create random embeddings.

        Args:
            node_map: Mapping from original graph node indices to subgraph indices.
            question_embedding: Embedding for the query entity.
            entity: Query entity index.
            embedding_dim: Dimensionality of the embeddings.
            random_init: Whether to use random initialization for missing embeddings.

        Returns:
            torch.Tensor: Node embeddings for the subgraph.
        """
        # Initialize all embeddings with independent zero vectors (not references)
        embeddings = [np.zeros(embedding_dim).tolist() for _ in range(len(node_map))]
        idx_to_entity = {v: k for k, v in self.loaded_entity_to_idx.items()}  # Reverse map

        if self.from_paths_activate:
            # Get embeddings from paths
            result = self.find_best_embedding(self.G, entity, question_embedding)
            result[entity] = question_embedding  # Assign question embedding to query entity

        for ori, new in node_map.items():
            if random_init:
                # Randomly initialize embeddings if required
                embeddings[new] = [random.uniform(-0.1, 0.1) for _ in range(embedding_dim)]
            elif ori in idx_to_entity and idx_to_entity[ori] in self.node2vec_embeddings:
                # Ensure the node index maps correctly and exists in node2vec
                if self.entity_sbert:
                    # Use entity SBERT embeddings
                    embeddings[new] = self.entity_sbert_embeddings[idx_to_entity[ori]].tolist()
                elif self.from_paths_activate:
                    # Use path-based SBERT embeddings
                    embeddings[new] = result[idx_to_entity[ori]].tolist()
                else:
                    # Use node2vec embeddings
                    embeddings[new] = self.node2vec_embeddings[idx_to_entity[ori]]

        # Convert the list of embeddings to a PyTorch tensor
        embeddings = np.array(embeddings)  # Convert to numpy array for consistency
        if embeddings.ndim != 2:
            raise ValueError(f"Embeddings array must be 2D but got {embeddings.ndim}D")

        return torch.tensor(embeddings, dtype=torch.float)

    def generate_nx_graph(self, path):
        """
        Constructs a networkx directed graph that includes both the original and reverse relations.
        Each edge includes the concatenated relations between the same pairs of entities.
        """
        df = pd.read_csv(path, sep='|', header=None, names=['entity1', 'relation', 'entity2'])

        # Remove duplicates
        df_unique = df.drop_duplicates() # 133582 edges after dedup

        # Define reverse relations and construct reverse edges
        reverse_relations = {
        'directed_by': 'directed',
        'written_by': 'written',
        'starred_actors': 'starring',
        'has_tags': 'is_tagged_to',
        'has_genre': 'is_genre_of',
        'has_imdb_rating': 'is_imdb_rating_of',
        'has_imdb_votes': 'is_imdb_votes_of',
        'in_language': 'language_of',
        'release_year': 'is_released_year_of'
        }

        reverse_rows = []
        for index, row in df_unique.iterrows():
            reverse_relation = reverse_relations[row['relation']]
            reverse_row = {'entity1': row['entity2'], 'relation': reverse_relation, 'entity2': row['entity1']}
            reverse_rows.append(reverse_row)

        df_reverse = pd.DataFrame(reverse_rows) # 133582 edges
        df_combined = pd.concat([df_unique, df_reverse], ignore_index=True) # 267164 edges

        # This step consolidates multiple edges between the same pair of entities into a single edge.
        # It concatenates all relation values associated with each pair of entities.
        df_final = df_combined.groupby(['entity1', 'entity2'], as_index=False).agg({
            'relation': ' and '.join
        }) # 249349 edges

        # Replace underscores in relation names
        df_final['relation'] = df_final['relation'].str.replace('_', ' ')

        G = nx.from_pandas_edgelist(df_final, source='entity1', target='entity2', edge_attr='relation', create_using=nx.DiGraph())
        # Number of entities: 43234
        # Number of edges: 249349
        # Number of distinct relations: 38
        # Distinct relations: {'release year', 'directed by and written by and starred actors', 'directed by and starred actors', 'has imdb rating', 'is genre of', 'has tags and is tagged to', 'directed by and written by', 'starred actors and starring', 'directed', 'directed and written', 'has imdb votes', 'written by and directed by', 'written and directed', 'in language and language of', 'language of', 'has genre', 'is tagged to', 'has imdb rating and has tags', 'directed and written and starring', 'starred actors', 'starring', 'has tags', 'directed and starring', 'written by and directed by and starred actors', 'written by and written', 'written by', 'in language', 'release year and has tags', 'written and directed and starring', 'written and starring', 'is released year of and is tagged to', 'directed by', 'is imdb rating of', 'is imdb rating of and is tagged to', 'written', 'is released year of', 'written by and starred actors', 'is imdb votes of'}

        return G

    def find_paths(self, G, u, n):
        """
        Finds paths in a graph G starting from node u with until reaching a maximum length of n edges.

        Parameters:
        G (Graph): The nx graph where entities and relations are defined.
        u (str): The starting node for the paths.
        n (int): The maximum depth or length of paths in terms of edges.

        Returns:
        List of paths, where each path is a list of tuples (node, relation) representing
        the nodes and relations along the path.
        """

        if n == 0:
            return [[(u, None)]]

        paths = [
            [(u, G[u][neighbor]['relation'])] + path
            for neighbor in G.neighbors(u)
            for path in self.find_paths(G, neighbor, n - 1)
            if u not in [node for node, _ in path] # Avoid cycles
        ]
        return paths

    def find_best_embedding(self, G, query_entity, q_embedding):
        """
        Finds the best path embedding for each unique candidate based on cosine similarity.

        Parameters:
        G (Graph): The nx graph where entities and relations are defined.
        query_entity (str): The entity for which paths are being found.
        q_embedding (torch.Tensor): The embedding of the query entity.

        Returns:
        dict: A dictionary where keys are candidates and values are the best path embeddings.
        """

        paths = self.find_paths(G, query_entity, 2) + self.find_paths(G, query_entity, 1)

        sentences = []
        candidates = []

        for tuple_list in paths:
            # Extract the last entity (candidate) in the path
            candidate_entity = tuple_list[-1][0]

            if candidate_entity != query_entity: # Avoid looping back to the query_entity
                candidates.append(candidate_entity)
                # Create the sentence for the path
                sentence = ' '.join(f"{tup[0]} {tup[1]}" if tup[1] else tup[0] for tup in tuple_list)
                sentences.append(sentence)

        # Calculate path embeddings
        path_embeddings = model.encode(sentences, batch_size=128, convert_to_tensor=True)
        # Calculate cosine similarities
        cosine_scores = util.cos_sim(q_embedding, path_embeddings)[0]

        # Dictionary to store the best path embedding for each candidate
        best_embeddings = {}
        # Dictionary to store the highest cosine score for each candidate
        best_scores = {}

        for idx, candidate in enumerate(candidates):
            cosine_score = cosine_scores[idx].item()

            if candidate not in best_embeddings or cosine_score > best_scores[candidate]:
                best_scores[candidate] = cosine_score
                best_embeddings[candidate] = path_embeddings[idx]

        return best_embeddings

# Extraction Function

In [82]:
def extract_subgraph_qemb(
    dataloader, model, device, threshold_value, save_all_path, save_emb_path
):
    """
    Extract subgraphs, compute embeddings, and save processed results.

    Args:
        dataloader: DataLoader for the dataset.
        model: The model used to compute embeddings and similarity scores.
        device: Device (CPU or GPU) for computations.
        threshold_value: Default threshold for filtering candidates.
        save_all_path: Path to save all processed data.
        save_emb_path: Path to save subgraph and question embeddings.
    """
    model.eval()

    # Initialize lists to store processed data
    all_batched_subgraphs = []
    all_question_embeddings = []
    all_candidates_masks = []
    all_similarity_scores = []
    all_node_maps = []
    all_labels = []
    all_output_embeddings = []

    with torch.no_grad():
        for (
            batched_subgraphs,
            question_embeddings,
            stacked_labels,
            node_maps,
            labels,
        ) in tqdm(dataloader, desc="Extracting subgraph", leave=True):
            # Perform forward pass
            batched_subgraphs = batched_subgraphs.to(device)
            question_embeddings = question_embeddings.to(device)
            stacked_labels = stacked_labels.to(device)
            full_output = model(batched_subgraphs, question_embeddings)

            # Extract relevant outputs
            output = full_output.output if hasattr(full_output, "output") else full_output
            threshold = full_output.threshold if hasattr(full_output, "threshold") else threshold_value
            candidates_mask, similarity_score = threshold_based_candidates(output, threshold=threshold)
            output_embedding = (
                full_output.node_embedding if hasattr(full_output, "node_embedding") else output
            )

            # Store results as lists (detached from PyTorch tensors)
            all_batched_subgraphs.append(batched_subgraphs.x.tolist())
            all_question_embeddings.append(question_embeddings.tolist())
            all_candidates_masks.append(candidates_mask.tolist())
            all_node_maps.extend(node_maps)
            all_labels.extend(labels)
            all_output_embeddings.append(output_embedding.tolist())

            # Append similarity scores if available
            if similarity_score is not None:
                all_similarity_scores.append(similarity_score.tolist())
            else:
                print("Skipping batch with no similarity scores.")

    # Save embeddings and processed data
    save_subg_qemb_file(
        all_batched_subgraphs,
        all_question_embeddings,
        file_path=save_emb_path,
    )
    save_all_to_file(
        all_batched_subgraphs,
        all_question_embeddings,
        all_candidates_masks,
        all_similarity_scores,
        all_node_maps,
        all_labels,
        all_output_embeddings,
        file_path=save_all_path,
    )



In [85]:
def save_all_to_file(
    batched_subgraphs,
    question_embeddings,
    candidates_mask,
    similarity_scores,
    node_map,
    labels,
    all_output_embeddings,
    file_path,
):
    data = {
        "batched_subgraphs": batched_subgraphs,
        "question_embeddings": question_embeddings,
        "candidates_masks": candidates_mask,
        "similarity_scores": similarity_scores,  # Leave as tensor if not None
        "node_maps": node_map,
        "labels": labels,
        "all_output_embeddings": all_output_embeddings,
    }

    torch.save(data, file_path)


def save_subg_qemb_file(
    batched_subgraphs, question_embeddings, file_path
):

    data = {
        "batched_subgraphs": batched_subgraphs,
        "question_embeddings": question_embeddings,
    }

    torch.save(data, file_path)


def load_all_metadata(file_path):
    # Load the data from the saved file
    saved_data = torch.load(file_path)

    # Extract each component from the dictionary
    batched_subgraphs = saved_data["batched_subgraphs"]
    question_embeddings = saved_data["question_embeddings"]
    candidates_masks = saved_data["candidates_masks"]
    similarity_scores = saved_data.get("similarity_scores", None)
    node_maps = saved_data["node_maps"]
    labels = saved_data["labels"]
    all_output_embeddings = saved_data["all_output_embeddings"]

    return (
        batched_subgraphs,
        question_embeddings,
        candidates_masks,
        similarity_scores,
        node_maps,
        labels,
        all_output_embeddings,
    )


def load_subgraph_data(file_path):
    # Load the data from the saved file
    saved_data = torch.load(file_path)

    # Extract each component from the dictionary
    batched_subgraphs = saved_data["batched_subgraphs"]
    question_embeddings = saved_data["question_embeddings"]

    return batched_subgraphs, question_embeddings


## Test 2

In [75]:
train_dataset = KGQADataset(
    path_to_node_embed=config['node_embed'],
    path_to_idxes=config['idxes'],
    path_to_qa=config['train_qa_data'],
    path_to_kb=config['raw_kb'],
    from_paths_activate=True,
    entity_sbert=False,
    k=config['num_hops']
)

In [76]:
num_relations = train_dataset.num_relations # extract the num_relation from the entire graph
sub_train_dataset = Subset(train_dataset, list(range(config['train']['start_idx'],config['train']['end_idx'])))

train_loader = DataLoader(
    sub_train_dataset,
    batch_size=config['train']['batch_size'],
    collate_fn=collate_fn,
    shuffle=True
)

In [77]:
model_test2 = RGCNModel(
            node_dim=config['model']['in_channels'],
            question_dim=train_dataset.q_embeddings.size(-1),
            hidden_dim=config['model']['hidden_channels'],
            num_relations=38,
            output_dim=config['model']['out_channels'],
            num_rgcn=config['model']['num_layers'],
            reduced_qn_dim=config['model']['reduced_qn_dim'],
            reduced_node_dim=config['model']['reduced_node_dim'],
            output_embedding=config['model']['output_embedding'],
            use_residuals=config['model']['use_residuals']
        )

checkpoint = torch.load(config['model_path_test2'])
model_test2.load_state_dict(checkpoint['model_state_dict'])
model_test2 = model_test2.to(device)

  checkpoint = torch.load(config['model_path_test2'])


In [78]:
equal_subgraph_weighting = config['train']['equal_subgraph_weighting']
threshold_value = config['threshold_value']
hits_at_k = config['train']['hits_at_k']

save_all_path = config['save_all_path_test2']
save_emb_path = config['save_emb_path_test2']

In [86]:
extract_subgraph_qemb(train_loader, model_test2, device, threshold_value, save_all_path, save_emb_path)

Extracting subgraph: 100%|██████████| 1/1 [00:00<00:00, 12.42it/s]


In [157]:
model_test2.eval()

RGCNModel(
  (fc_reduce_qn): Linear(in_features=384, out_features=64, bias=True)
  (reduce_node_dim_layer): RGCNConv(384, 64, num_relations=38)
  (input_layer): RGCNConv(128, 64, num_relations=38)
  (rgcn_layers): ModuleList(
    (0): RGCNConv(64, 64, num_relations=38)
  )
  (output_layer): RGCNConv(64, 384, num_relations=38)
)

In [158]:
all_batched_subgraphs = []
all_question_embeddings = []
all_candidates_masks = []
all_similarity_scores = []
all_node_maps = []
all_labels = []
all_output_embeddings = []


In [159]:
with torch.no_grad():
    for (
        batched_subgraphs,
        question_embeddings,
        stacked_labels,
        node_maps,
        labels,
    ) in tqdm(train_loader, desc="Extracting subgraph", leave=True):
        batched_subgraphs = batched_subgraphs.to(device)
        question_embeddings = question_embeddings.to(device)
        stacked_labels = stacked_labels.to(device)
        node_maps = node_maps
        labels = labels

Extracting subgraph: 100%|██████████| 1/1 [00:00<00:00, 47.57it/s]


In [160]:
full_output = model_test2(batched_subgraphs, question_embeddings)

In [161]:
full_output

Output(node_embedding=tensor([[-0.0455, -0.4252,  0.4711,  ..., -0.1344,  0.3125, -0.2147],
        [ 0.2930, -0.4726, -0.2477,  ...,  0.0713,  0.6241, -0.4596],
        [ 0.2495, -0.4304, -0.2512,  ..., -0.1379,  0.5478, -0.3119],
        ...,
        [ 0.2564,  0.4636,  0.8000,  ...,  0.3100, -0.3889,  0.2299],
        [ 0.1730, -0.6759,  1.0974,  ..., -0.3653, -0.3239,  0.3315],
        [ 0.5554, -0.0330,  0.1109,  ...,  0.1442,  0.1015,  0.1431]],
       device='cuda:0', grad_fn=<AddBackward0>), question_embedding_expanded=tensor([[-0.0327, -0.1177,  0.0200,  ..., -0.0601,  0.0206, -0.0643],
        [-0.0327, -0.1177,  0.0200,  ..., -0.0601,  0.0206, -0.0643],
        [-0.0327, -0.1177,  0.0200,  ..., -0.0601,  0.0206, -0.0643],
        ...,
        [-0.0327, -0.1177,  0.0200,  ..., -0.0601,  0.0206, -0.0643],
        [-0.0327, -0.1177,  0.0200,  ..., -0.0601,  0.0206, -0.0643],
        [-0.0327, -0.1177,  0.0200,  ..., -0.0601,  0.0206, -0.0643]],
       device='cuda:0'))

In [162]:
output = (full_output.output if hasattr(full_output, "output") else full_output)

In [163]:
output

Output(node_embedding=tensor([[-0.0455, -0.4252,  0.4711,  ..., -0.1344,  0.3125, -0.2147],
        [ 0.2930, -0.4726, -0.2477,  ...,  0.0713,  0.6241, -0.4596],
        [ 0.2495, -0.4304, -0.2512,  ..., -0.1379,  0.5478, -0.3119],
        ...,
        [ 0.2564,  0.4636,  0.8000,  ...,  0.3100, -0.3889,  0.2299],
        [ 0.1730, -0.6759,  1.0974,  ..., -0.3653, -0.3239,  0.3315],
        [ 0.5554, -0.0330,  0.1109,  ...,  0.1442,  0.1015,  0.1431]],
       device='cuda:0', grad_fn=<AddBackward0>), question_embedding_expanded=tensor([[-0.0327, -0.1177,  0.0200,  ..., -0.0601,  0.0206, -0.0643],
        [-0.0327, -0.1177,  0.0200,  ..., -0.0601,  0.0206, -0.0643],
        [-0.0327, -0.1177,  0.0200,  ..., -0.0601,  0.0206, -0.0643],
        ...,
        [-0.0327, -0.1177,  0.0200,  ..., -0.0601,  0.0206, -0.0643],
        [-0.0327, -0.1177,  0.0200,  ..., -0.0601,  0.0206, -0.0643],
        [-0.0327, -0.1177,  0.0200,  ..., -0.0601,  0.0206, -0.0643]],
       device='cuda:0'))

In [164]:
with torch.no_grad():
    for (
        batched_subgraphs,
        question_embeddings,
        stacked_labels,
        node_maps,
        labels,
    ) in tqdm(train_loader, desc="Extracting subgraph", leave=True):
            batched_subgraphs = batched_subgraphs.to(device)
            question_embeddings = question_embeddings.to(device)
            stacked_labels = stacked_labels.to(device)
            
            print(batched_subgraphs)
            print(question_embeddings)
            br
            # Perform forward pass
            full_output = model(batched_subgraphs, question_embeddings)
            output = (
                full_output.output if hasattr(full_output, "output") else full_output
            )
            threshold = (
                full_output.threshold
                if hasattr(full_output, "threshold")
                else threshold_value
            )

            # Calculate similarity scores and candidates mask
            candidates_mask, similarity_score = threshold_based_candidates(
                output, threshold=threshold
            )
            output_embedding = (
                full_output.node_embedding
                if isinstance(full_output, Output)
                else full_output
            )
            print(candiates_mask, similarity_score)

Extracting subgraph:   0%|          | 0/1 [00:00<?, ?it/s]

DataBatch(edge_index=[2, 242], edge_attr=[242], num_nodes=106, x=[106, 384], batch=[106], ptr=[2])
tensor([[-3.2683e-02, -1.1771e-01,  2.0035e-02, -4.9907e-03, -7.1972e-02,
          3.1428e-02,  3.0334e-02,  3.4106e-02,  4.0418e-03,  3.7856e-03,
         -7.8110e-03,  2.4622e-02,  6.5852e-02, -1.8142e-02,  2.8830e-02,
          2.6791e-02,  7.9914e-02,  8.3346e-02,  1.0563e-01,  2.4452e-02,
         -2.0814e-02,  2.2093e-02, -1.4475e-02,  2.5309e-02,  5.3784e-03,
         -2.8438e-02,  5.9495e-02,  3.7550e-02,  1.2722e-02,  7.6392e-02,
         -2.0498e-02,  1.3107e-01, -3.6708e-02, -3.5629e-02,  4.9601e-03,
          3.7087e-02, -6.4176e-02, -4.6361e-02, -5.4901e-02, -3.0928e-02,
          7.8183e-03,  3.1838e-02,  1.5287e-02,  5.9942e-02,  3.6168e-03,
         -6.5001e-02, -4.7575e-02, -2.6547e-02, -6.5822e-03, -5.9673e-02,
         -1.4633e-02,  2.7034e-02,  2.7769e-02, -2.5018e-02, -3.8412e-02,
          9.1150e-03, -1.2804e-02, -4.1930e-03,  6.5810e-02, -2.5715e-02,
         -9.6




NameError: name 'br' is not defined

In [165]:
threshold = (
                full_output.threshold
                if hasattr(full_output, "threshold")
                else threshold_value
            )

In [166]:
threshold

0.5

In [167]:
candidates_mask, similarity_score = threshold_based_candidates(
                output, threshold=threshold
            )

In [168]:
candidates_mask.shape

torch.Size([106])

In [169]:
similarity_score.shape

torch.Size([106])

In [170]:
output_embedding = (
                full_output.node_embedding
                if isinstance(full_output, Output)
                else full_output
            )

In [171]:
all_batched_subgraphs.append(batched_subgraphs.x.detach().cpu())
all_question_embeddings.append(question_embeddings.detach().cpu())
all_candidates_masks.append(candidates_mask.detach().cpu())
all_node_maps.extend(node_maps)
all_labels.extend(labels)
all_output_embeddings.append(output_embedding.detach().cpu())

In [172]:
# Only append similarity scores if they are not None
if similarity_score is not None:
    all_similarity_scores.append(
        similarity_score.detach().cpu()
    )  # Ensures tensor format
else:
    print("Skipping batch with no similarity scores.")

In [173]:
all_batched_subgraphs = torch.cat(all_batched_subgraphs, dim=0)
all_question_embeddings = torch.cat(all_question_embeddings, dim=0)
all_candidates_masks = torch.cat(all_candidates_masks, dim=0)
all_similarity_scores = (
    torch.cat(all_similarity_scores, dim=0) if all_similarity_scores else None
)

original_graph_embeddings = map_subgraph_to_original_graph(
    all_batched_subgraphs, all_node_maps
)
all_output_embeddings = torch.cat(all_output_embeddings, dim=0)


In [174]:
all_candidates_masks

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 0], dtype=torch.int32)

In [175]:
all_similarity_scores

tensor([ 0.0470,  0.2594,  0.2715,  0.2763,  0.1348,  0.5174,  0.4842,  0.4586,
         0.4814,  0.4827,  0.4694,  0.4889,  0.4850,  0.1330,  0.1489,  0.2582,
         0.3028,  0.3048,  0.2145, -0.0309,  0.2114,  0.1474,  0.2117,  0.2082,
         0.4793,  0.4766,  0.4757,  0.4752,  0.1033,  0.1440,  0.1742,  0.1196,
         0.2001,  0.3466,  0.3423,  0.3462,  0.3400,  0.5340,  0.5294,  0.5325,
         0.4752,  0.5357,  0.5353,  0.4834,  0.5334,  0.0752,  0.1858,  0.0852,
         0.1856,  0.1779,  0.1915,  0.4193,  0.4163,  0.4133,  0.4169,  0.4186,
         0.4149,  0.4100,  0.4194,  0.4137,  0.4159,  0.0997,  0.2453, -0.0245,
         0.2502,  0.2592,  0.2608,  0.2224,  0.4753,  0.5113,  0.4780,  0.4746,
         0.4692,  0.4735,  0.4782,  0.4835,  0.4726,  0.1224,  0.2754,  0.2620,
         0.2604,  0.2668,  0.2652,  0.3584,  0.3630,  0.3505,  0.3687,  0.3686,
         0.3714,  0.3611,  0.3589,  0.3739,  0.3590,  0.3591,  0.3500,  0.3640,
         0.3605,  0.3602,  0.2024,  0.16

In [176]:
all_candidates_masks

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 0], dtype=torch.int32)

In [177]:
all_node_maps

[{9445: 0,
  21663: 1,
  21665: 2,
  369: 3,
  60: 4,
  21666: 5,
  21667: 6,
  21668: 7,
  20389: 8,
  21669: 9,
  17136: 10,
  379: 11,
  17131: 12,
  55: 13,
  180: 14,
  27224: 15,
  28592: 16,
  28865: 17,
  30966: 18,
  82: 19,
  20305: 20,
  341: 21,
  18597: 22,
  17714: 23,
  17594: 24,
  23124: 25,
  16665: 26,
  35541: 27,
  8085: 28,
  42: 29,
  39123: 30,
  68: 31,
  281: 32,
  24995: 33,
  21968: 34,
  16389: 35,
  16390: 36,
  42128: 37,
  22941: 38,
  28426: 39,
  16463: 40,
  16394: 41,
  16395: 42,
  18976: 43,
  20068: 44,
  8550: 45,
  16772: 46,
  34: 47,
  18088: 48,
  27513: 49,
  28291: 50,
  16404: 51,
  17078: 52,
  33214: 53,
  16675: 54,
  31193: 55,
  17305: 56,
  19252: 57,
  28828: 58,
  128: 59,
  359: 60,
  10: 61,
  18587: 62,
  2: 63,
  21277: 64,
  17296: 65,
  16660: 66,
  137: 67,
  30383: 68,
  22037: 69,
  18591: 70,
  20049: 71,
  25564: 72,
  30384: 73,
  21278: 74,
  17311: 75,
  352: 76,
  197: 77,
  29022: 78,
  194: 79,
  29461: 80,
  17071

In [149]:
high_similarity_nodes = []
for batch_idx, (mask, node_map) in enumerate(zip(all_candidates_masks, all_node_maps)):
    print(all_candidates_masks)
    print(node_map)

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 0], dtype=torch.int32)
{9445: 0, 21663: 1, 21665: 2, 369: 3, 60: 4, 21666: 5, 21667: 6, 21668: 7, 20389: 8, 21669: 9, 17136: 10, 379: 11, 17131: 12, 55: 13, 180: 14, 27224: 15, 28592: 16, 28865: 17, 30966: 18, 82: 19, 20305: 20, 341: 21, 18597: 22, 17714: 23, 17594: 24, 23124: 25, 16665: 26, 35541: 27, 8085: 28, 42: 29, 39123: 30, 68: 31, 281: 32, 24995: 33, 21968: 34, 16389: 35, 16390: 36, 42128: 37, 22941: 38, 28426: 39, 16463: 40, 16394: 41, 16395: 42, 18976: 43, 20068: 44, 8550: 45, 16772: 46, 34: 47, 18088: 48, 27513: 49, 28291: 50, 16404: 51, 17078: 52, 33214: 53, 16675: 54, 31193: 55, 17305: 56, 19252: 57, 28828: 58, 128: 59, 359:

In [146]:
high_similarity_nodes

[[9445]]

In [147]:
print(candidate_indices)

tensor([0])


In [None]:
    model.eval()

    all_batched_subgraphs = []
    all_question_embeddings = []
    all_candidates_masks = []
    all_similarity_scores = []
    all_node_maps = []
    all_labels = []
    all_output_embeddings = []

    with torch.no_grad():
        for (
            batched_subgraphs,
            question_embeddings,
            stacked_labels,
            node_maps,
            labels,
        ) in tqdm(dataloader, desc="Extracting subgraph", leave=True):
            # Move tensors to the specified device
            batched_subgraphs = batched_subgraphs.to(device)
            question_embeddings = question_embeddings.to(device)
            stacked_labels = stacked_labels.to(device)

            # Perform forward pass
            full_output = model(batched_subgraphs, question_embeddings)
            output = (
                full_output.output if hasattr(full_output, "output") else full_output
            )
            threshold = (
                full_output.threshold
                if hasattr(full_output, "threshold")
                else threshold_value
            )

            # Calculate similarity scores and candidates mask
            candidates_mask, similarity_score = threshold_based_candidates(
                output, threshold=threshold
            )
            output_embedding = (
                full_output.node_embedding
                if isinstance(full_output, Output)
                else full_output
            )

            # Store batched data
            all_batched_subgraphs.append(batched_subgraphs.x.detach().cpu())
            all_question_embeddings.append(question_embeddings.detach().cpu())
            all_candidates_masks.append(candidates_mask.detach().cpu())
            all_node_maps.extend(node_maps)
            all_labels.extend(labels)
            all_output_embeddings.append(output_embedding.detach().cpu())

            # Only append similarity scores if they are not None
            if similarity_score is not None:
                all_similarity_scores.append(
                    similarity_score.detach().cpu()
                )  # Ensures tensor format
            else:
                print("Skipping batch with no similarity scores.")

    # Concatenate all batched data along the 0-axis (vertically)
    all_batched_subgraphs = torch.cat(all_batched_subgraphs, dim=0)
    all_question_embeddings = torch.cat(all_question_embeddings, dim=0)
    all_candidates_masks = torch.cat(all_candidates_masks, dim=0)
    all_similarity_scores = (
        torch.cat(all_similarity_scores, dim=0) if all_similarity_scores else None
    )

    original_graph_embeddings = map_subgraph_to_original_graph(
        all_batched_subgraphs, all_node_maps
    )
    all_output_embeddings = torch.cat(all_output_embeddings, dim=0)
    
    
    high_similarity_nodes = []
    for batch_idx, (mask, node_map) in enumerate(zip(all_candidates_masks, all_node_maps)):
        # Get the indices of high-similarity candidates in the subgraph
        candidate_indices = torch.nonzero(mask, as_tuple=True)[0]  # Indices where mask is True

        # Map the subgraph indices to original graph indices using node_map
        batch_high_similarity_nodes = [node_map[idx.item()] for idx in candidate_indices]
        high_similarity_nodes.append(batch_high_similarity_nodes)
    
    
    save_subg_qemb_file(
        all_batched_subgraphs,
        original_graph_embeddings,
        all_question_embeddings,
        file_path=save_emb_path,
    )
    save_all_to_file(
        all_batched_subgraphs,
        original_graph_embeddings,
        all_question_embeddings,
        all_candidates_masks,
        all_similarity_scores,
        all_node_maps,
        all_labels,
        all_output_embeddings,
        high_similarity_nodes,
        file_path=save_all_path,
    )
    

## Test 8

In [15]:
def save_all_to_file(
    batched_subgraphs,
    original_graph_embeddings,
    question_embeddings,
    candidates_mask,
    similarity_scores,
    node_map,
    labels,
    all_output_embeddings,
    file_path,
):
    data = {
        "batched_subgraphs": batched_subgraphs,
        "original_graph_embeddings": original_graph_embeddings,
        "question_embeddings": question_embeddings,
        "candidates_masks": candidates_mask,
        "similarity_scores": similarity_scores,  # Leave as tensor if not None
        "node_maps": node_map,
        "labels": labels,
        "all_output_embeddings" : all_output_embeddings
    }

    torch.save(data, file_path)


def save_subg_qemb_file(
    batched_subgraphs, original_graph_embeddings, question_embeddings, file_path
):

    data = {
        "batched_subgraphs": batched_subgraphs,
        "original_graph_embeddings": original_graph_embeddings,
        "question_embeddings": question_embeddings,
    }

    torch.save(data, file_path)

def extract_subgraph_qemb(dataloader, model, device, threshold_value, save_all_path, save_emb_path):
    model.eval()

    all_batched_subgraphs = []
    all_question_embeddings = []
    all_candidates_masks = []
    all_similarity_scores = [] 
    all_node_maps = []
    all_labels = []
    all_output_embeddings = []

    with torch.no_grad():
        for (
            batched_subgraphs,
            question_embeddings,
            stacked_labels,
            node_maps,
            labels,
        ) in tqdm(dataloader, desc="Extracting subgraph", leave=True):
            print(node_maps)
            # Move tensors to the specified device
#             batched_subgraphs = batched_subgraphs.to(device)
#             question_embeddings = question_embeddings.to(device)
#             stacked_labels = stacked_labels.to(device)

#             # Perform forward pass
#             full_output = model(batched_subgraphs, question_embeddings)
#             output = (
#                 full_output.output if hasattr(full_output, "output") else full_output
#             )
#             threshold = (
#                 full_output.threshold
#                 if hasattr(full_output, "threshold")
#                 else threshold_value
#             )

#             # Calculate similarity scores and candidates mask
#             candidates_mask, similarity_score = threshold_based_candidates(
#                 output, threshold=threshold
#             )
#             output_embedding = full_output.node_embedding if isinstance(full_output, Output) else full_output


#             # Store batched data
#             all_batched_subgraphs.append(batched_subgraphs.x.detach().cpu())
#             all_question_embeddings.append(question_embeddings.detach().cpu())
#             all_candidates_masks.append(candidates_mask.detach().cpu())
#             all_node_maps.extend(node_maps)
#             all_labels.extend(labels)
#             all_output_embeddings.append(output_embedding.detach().cpu())

#             # Only append similarity scores if they are not None
#             if similarity_score is not None:
#                 all_similarity_scores.append(
#                     similarity_score.detach().cpu()
#                 )  # Ensures tensor format
#             else:
#                 print("Skipping batch with no similarity scores.")

    # Concatenate all batched data along the 0-axis (vertically)
    all_batched_subgraphs = torch.cat(all_batched_subgraphs, dim=0)
    all_question_embeddings = torch.cat(all_question_embeddings, dim=0)
    all_candidates_masks = torch.cat(all_candidates_masks, dim=0)
    all_similarity_scores = (
        torch.cat(all_similarity_scores, dim=0) if all_similarity_scores else None
    )

    original_graph_embeddings = map_subgraph_to_original_graph(all_batched_subgraphs, all_node_maps)
    all_output_embeddings = torch.cat(all_output_embeddings, dim=0)
    print(all_output_embeddings)
    save_subg_qemb_file(
        all_batched_subgraphs, original_graph_embeddings, all_question_embeddings, file_path=save_emb_path
    )
    save_all_to_file(
        all_batched_subgraphs,
        original_graph_embeddings,
        all_question_embeddings,
        all_candidates_masks,
        all_similarity_scores,
        all_node_maps,
        all_labels,
        all_output_embeddings,
        file_path=save_all_path,
    )


def map_subgraph_to_original_graph(all_batched_subgraphs, all_node_maps):
    original_graph_embeddings = {}

    start_index = 0
    for node_map in all_node_maps:
        subgraph_size = len(node_map)
        subgraph_embeddings = all_batched_subgraphs[start_index : start_index + subgraph_size]
        
        for original_idx, subgraph_idx in node_map.items():
            original_graph_embeddings[original_idx] = subgraph_embeddings[subgraph_idx]

        start_index += subgraph_size

    return original_graph_embeddings




In [65]:
train_dataset_1 = KGQADataset(
    path_to_node_embed=config['node_embed'],
    path_to_idxes=config['idxes'],
    path_to_qa=config['train_qa_data'],
    path_to_kb=config['raw_kb'],
    from_paths_activate=False,
    entity_sbert=True,
    k=config['num_hops']
)

num_relations_1 = train_dataset_1.num_relations # extract the num_relation from the entire graph
sub_train_dataset_1 = Subset(train_dataset_1, list(range(config['train']['start_idx'],config['train']['end_idx'])))

train_loader_1 = DataLoader(
    sub_train_dataset_1,
    batch_size=config['train']['batch_size'],
    collate_fn=collate_fn,
    shuffle=True
)

In [66]:
config['train']['start_idx']

86

In [67]:
model_test8 = RGCNModel(
            node_dim=config['model']['in_channels'],
            question_dim=train_dataset_1.q_embeddings.size(-1),
            hidden_dim=config['model']['hidden_channels'],
            num_relations=num_relations_1,
            output_dim=config['model']['out_channels'],
            num_rgcn=config['model']['num_layers'],
            reduced_qn_dim=config['model']['reduced_qn_dim'],
            reduced_node_dim=config['model']['reduced_node_dim'],
            output_embedding=config['model']['output_embedding'],
            use_residuals=config['model']['use_residuals']
        )

checkpoint = torch.load(config['model_path_test8'])
model_test8.load_state_dict(checkpoint['model_state_dict'])
model_test8 = model_test8.to(device)

equal_subgraph_weighting = config['train']['equal_subgraph_weighting']
threshold_value = config['threshold_value']
hits_at_k = config['train']['hits_at_k']

save_all_path_test8 = config['save_all_path_test8']
save_emb_path_test8 = config['save_emb_path_test8']


  checkpoint = torch.load(config['model_path_test8'])


In [68]:
save_emb_path_test8

'/hpctmp/e0315913/demo/CS5284_Project/GNN-cluster/data/demo/subgraph_qembedding_test8.pt'

In [87]:
extract_subgraph_qemb(train_loader_1, 
                     model_test8,
                     device, 
                     threshold_value,
                     save_all_path_test8,
                     save_emb_path_test8)

Extracting subgraph: 100%|██████████| 1/1 [00:00<00:00, 28.02it/s]


In [17]:
model_test8.eval()

RGCNModel(
  (fc_reduce_qn): Linear(in_features=384, out_features=64, bias=True)
  (reduce_node_dim_layer): RGCNConv(384, 64, num_relations=9)
  (input_layer): RGCNConv(128, 64, num_relations=9)
  (rgcn_layers): ModuleList(
    (0): RGCNConv(64, 64, num_relations=9)
  )
  (output_layer): RGCNConv(64, 384, num_relations=9)
)

In [35]:
all_batched_subgraphs = []
all_question_embeddings = []
all_candidates_masks = []
all_similarity_scores = []
all_node_maps = []
all_labels = []
all_output_embeddings = []


with torch.no_grad():
    for (
        batched_subgraphs,
        question_embeddings,
        stacked_labels,
        node_maps,
        labels,
    ) in tqdm(train_loader_1, desc="Extracting subgraph", leave=True):
        batched_subgraphs = batched_subgraphs.to(device)
        question_embeddings = question_embeddings.to(device)
        stacked_labels = stacked_labels.to(device)
        node_maps = node_maps
        labels = labels

Extracting subgraph: 100%|██████████| 1/1 [00:00<00:00, 83.59it/s]


In [21]:
full_output = model_test8(batched_subgraphs, question_embeddings)

In [22]:
output = (full_output.output if hasattr(full_output, "output") else full_output)

In [24]:
with torch.no_grad():
    for (
        batched_subgraphs,
        question_embeddings,
        stacked_labels,
        node_maps,
        labels,
    ) in tqdm(train_loader_1, desc="Extracting subgraph", leave=True):
            batched_subgraphs = batched_subgraphs.to(device)
            question_embeddings = question_embeddings.to(device)
            stacked_labels = stacked_labels.to(device)
            
            print(batched_subgraphs)
            print(question_embeddings)
            br
            # Perform forward pass
            full_output = model(batched_subgraphs, question_embeddings)
            output = (
                full_output.output if hasattr(full_output, "output") else full_output
            )
            threshold = (
                full_output.threshold
                if hasattr(full_output, "threshold")
                else threshold_value
            )

            # Calculate similarity scores and candidates mask
            candidates_mask, similarity_score = threshold_based_candidates(
                output, threshold=threshold
            )
            output_embedding = (
                full_output.node_embedding
                if isinstance(full_output, Output)
                else full_output
            )
            print(candiates_mask, similarity_score)

Extracting subgraph:   0%|          | 0/1 [00:00<?, ?it/s]

DataBatch(edge_index=[2, 248], edge_attr=[248], x=[106, 384], batch=[106], ptr=[2])
tensor([[-3.2683e-02, -1.1771e-01,  2.0035e-02, -4.9907e-03, -7.1972e-02,
          3.1428e-02,  3.0334e-02,  3.4106e-02,  4.0418e-03,  3.7856e-03,
         -7.8110e-03,  2.4622e-02,  6.5852e-02, -1.8142e-02,  2.8830e-02,
          2.6791e-02,  7.9914e-02,  8.3346e-02,  1.0563e-01,  2.4452e-02,
         -2.0814e-02,  2.2093e-02, -1.4475e-02,  2.5309e-02,  5.3784e-03,
         -2.8438e-02,  5.9495e-02,  3.7550e-02,  1.2722e-02,  7.6392e-02,
         -2.0498e-02,  1.3107e-01, -3.6708e-02, -3.5629e-02,  4.9601e-03,
          3.7087e-02, -6.4176e-02, -4.6361e-02, -5.4901e-02, -3.0928e-02,
          7.8183e-03,  3.1838e-02,  1.5287e-02,  5.9942e-02,  3.6168e-03,
         -6.5001e-02, -4.7575e-02, -2.6547e-02, -6.5822e-03, -5.9673e-02,
         -1.4633e-02,  2.7034e-02,  2.7769e-02, -2.5018e-02, -3.8412e-02,
          9.1150e-03, -1.2804e-02, -4.1930e-03,  6.5810e-02, -2.5715e-02,
         -9.6672e-04, -2.341




NameError: name 'br' is not defined

In [25]:
threshold = (
                full_output.threshold
                if hasattr(full_output, "threshold")
                else threshold_value
            )

candidates_mask, similarity_score = threshold_based_candidates(
                output, threshold=threshold
            )

output_embedding = (
                full_output.node_embedding
                if isinstance(full_output, Output)
                else full_output
            )

In [None]:
all_batched_subgraphs.append(batched_subgraphs.x.detach().cpu())
all_question_embeddings.append(question_embeddings.detach().cpu())
all_candidates_masks.append(candidates_mask.detach().cpu())
all_node_maps.extend(node_maps)
all_labels.extend(labels)
all_output_embeddings.append(output_embedding.detach().cpu())

# Only append similarity scores if they are not None
if similarity_score is not None:
    all_similarity_scores.append(
        similarity_score.detach().cpu()
    )  # Ensures tensor format
else:
    print("Skipping batch with no similarity scores.")

In [36]:
# high_similarity_nodes = []
# for batch_idx, (mask, node_map) in enumerate(zip(candidates_mask, node_maps)):
#     # Get indices of high-similarity candidates
#     candidate_indices = torch.nonzero(mask, as_tuple=True)[0]  # Where mask is True

#     # Map subgraph indices to original graph indices using node_map
#     batch_high_similarity_nodes = [node_map[idx.item()] for idx in candidate_indices]
#     high_similarity_nodes.append(batch_high_similarity_nodes)


KeyError: 0

# Test 16

In [103]:
train_dataset_16 = KGQADataset(
    path_to_node_embed=config['node_embed'],
    path_to_idxes=config['idxes'],
    path_to_qa=config['train_qa_data'],
    path_to_kb=config['raw_kb'],
    from_paths_activate=False,
    entity_sbert=False,
    k=config['num_hops']
)

num_relations_16 = train_dataset_16.num_relations # extract the num_relation from the entire graph
sub_train_dataset_16 = Subset(train_dataset_16, list(range(config['train']['start_idx'],config['train']['end_idx'])))

train_loader_16 = DataLoader(
    sub_train_dataset_16,
    batch_size=config['train']['batch_size'],
    collate_fn=collate_fn,
    shuffle=True
)

In [104]:
model_test16 = RGCNModel(
            node_dim=64,
            question_dim=train_dataset_16.q_embeddings.size(-1),
            hidden_dim=64,
            num_relations=num_relations_16,
            output_dim=config['model']['out_channels'],
            num_rgcn=config['model']['num_layers'],
            reduced_qn_dim=config['model']['reduced_qn_dim'],
            reduced_node_dim=config['model']['reduced_node_dim'],
            output_embedding=config['model']['output_embedding'],
            use_residuals=config['model']['use_residuals']
        )

checkpoint = torch.load('/hpctmp/e0315913/demo/CS5284_Project/GNN-cluster/src/checkpoints/Test_16/best_model_epoch_32.pth')
model_test16.load_state_dict(checkpoint['model_state_dict'])
model_test16 = model_test16.to(device)

equal_subgraph_weighting = True
threshold_value = 0.5
hits_at_k = 3

save_all_path_test16 = '/hpctmp/e0315913/demo/CS5284_Project/GNN-cluster/data/demo/candidate_metadata_test16.pt'
save_emb_path_test16 = '/hpctmp/e0315913/demo/CS5284_Project/GNN-cluster/data/demo/subgraph_qembedding_test16.pt'


  checkpoint = torch.load('/hpctmp/e0315913/demo/CS5284_Project/GNN-cluster/src/checkpoints/Test_16/best_model_epoch_32.pth')


In [105]:
save_all_path_test16

'/hpctmp/e0315913/demo/CS5284_Project/GNN-cluster/data/demo/candidate_metadata_test16.pt'

In [106]:
save_emb_path_test8

'/hpctmp/e0315913/demo/CS5284_Project/GNN-cluster/data/demo/subgraph_qembedding_test16.pt'

In [107]:
extract_subgraph_qemb(train_loader_16, 
                     model_test16,
                     device, 
                     threshold_value,
                     save_all_path_test16,
                     save_emb_path_test16)

Extracting subgraph: 100%|██████████| 1/1 [00:00<00:00, 37.00it/s]
