In [1]:
# Cython Speed up
import pyximport

pyximport.install(pyimport=True, language_level="3")


(<pyximport._pyximport3.PyImportMetaFinder at 0x7f9056223400>,
 <pyximport._pyximport3.PyxImportMetaFinder at 0x7f90562234c0>)

# Import Entities and Relations

In [2]:
import pandas as pd
import networkx as nx

path = '../Datasets/MetaQA_dataset/'

df = pd.read_csv(path+'kb.txt', sep='|', header=None, names=['entity1', 'relation', 'entity2'])
df.head()

Unnamed: 0,entity1,relation,entity2
0,Kismet,directed_by,William Dieterle
1,Kismet,written_by,Edward Knoblock
2,Kismet,starred_actors,Marlene Dietrich
3,Kismet,starred_actors,Edward Arnold
4,Kismet,starred_actors,Ronald Colman


# Create graph

In [3]:
import networkx as nx
from torch_geometric.utils import from_networkx

# Create a NetworkX graph
G = nx.Graph()

# Iterate over the DataFrame and add edges
for idx, row in df.iterrows():
    entity1 = row['entity1']
    entity2 = row['entity2']
    relation = row['relation']

    # Add edge with relation as edge attribute
    G.add_edge(entity1, entity2, relation=relation)

# Convert to a PyTorch Geometric Data object
data = from_networkx(G)

# Print the PyTorch Geometric Data object
print(data)

# Print first N edges to verify
N = 2
edges = list(G.edges(data=True))[:N]
print(edges)

Data(edge_index=[2, 249349], relation=[249349], num_nodes=43234)
[('Kismet', 'William Dieterle', {'relation': 'directed_by'}), ('Kismet', 'Edward Knoblock', {'relation': 'written_by'})]


# Import node embeddings

In [4]:
import torch
import torch_geometric
from torch_geometric.data import Data

# Number of nodes (entities)
num_nodes = G.number_of_nodes()

In [5]:
import csv
import pandas as pd

file_path = '../MetaQA/ud_node2vec_embeddings.txt'  # Replace with your file path

# Define lists to store the node IDs and their embeddings
node_ids = []
embeddings = []

# Open and read the file
with open(file_path, 'r') as file:
    reader = csv.reader(file, delimiter=' ')
    for line_num, row in enumerate(reader):
        # Store everything except the last 64 elements as the "node ID"
        node_ids.append(' '.join(row[:-64]))  # Join multiple elements if they exist before the last 64

        # Store the last 64 elements as the "embeddings"
        embeddings.append([float(x) for x in row[-64:]])  # Convert embeddings to floats

node_ids.pop(0)
embeddings.pop(0)
embeddings = torch.tensor(embeddings, dtype=torch.float)

print(node_ids[0:5])
print(type(embeddings))  # Check the type
print(embeddings.shape)  # Check the embedding dimensions

['Drama', 'Comedy', 'Horror', 'Thriller', 'bd-r']
<class 'torch.Tensor'>
torch.Size([43234, 64])


# Map node embeddings to graph

In [6]:
# Create a mapping from node ID to index in the node embedding tensor
node_to_index = {node: idx for idx, node in enumerate(node_ids)}

# Visualize the first n items
print(list(node_to_index.items())[0:5])

# Ensure that the node order in the graph matches the order in the embeddings
graph_node_order = list(G.nodes())  # Order of nodes in the graph
print(f"graph_node_order:{graph_node_order[0:5]}")

graph_node_embeddings = torch.zeros((len(graph_node_order), embeddings.shape[1]))


for idx, node in enumerate(graph_node_order):
    if node in node_to_index:
        graph_node_embeddings[idx] = embeddings[node_to_index[node]]
    else:
        print(f"Warning: Node {node} not found in embedding file.")

# Assign these embeddings as the node features (x) in the PyTorch Geometric Data object
data.x = graph_node_embeddings


[('Drama', 0), ('Comedy', 1), ('Horror', 2), ('Thriller', 3), ('bd-r', 4)]
graph_node_order:['Kismet', 'William Dieterle', 'Edward Knoblock', 'Marlene Dietrich', 'Edward Arnold']


# Import QA training datasetS

In [7]:
import pandas as pd

# Load the question-answer pairs from qa_train.txt
qa_file = '../Datasets/MetaQA_dataset/vanilla 3-hop/qa_train.txt'
qa_df = pd.read_csv(qa_file, sep='\t', header=None, names=['question', 'answers'])

# Load the relation types from qa_train_qtype.txt
qa_qtype_file = '../Datasets/MetaQA_dataset/vanilla 3-hop/qa_train_qtype.txt'
qa_qtype_df = pd.read_csv(qa_qtype_file, sep='\t', header=None, names=['relation_type'])

# Ensure both files are aligned
print(qa_df.shape, qa_qtype_df.shape)  # They should have the same number of rows


(114196, 2) (114196, 1)


In [8]:
# Combine the two DataFrames
qa_combined_df = pd.concat([qa_df, qa_qtype_df], axis=1)

# Check the combined DataFrame
qa_combined_df.head()

Unnamed: 0,question,answers,relation_type
0,the films that share actors with the film [Dil...,1997|1998|2003|2001|2006|2004|2005|2014|2008|2...,movie_to_actor_to_movie_to_year
1,who are the directors of the movies written by...,Stephen King|Frank Darabont|Tobe Hooper|Mick G...,movie_to_writer_to_movie_to_director
2,which person directed the films acted by the a...,Chris D'Arienzo|Jonathan Kesselman|Joe Chappel...,movie_to_actor_to_movie_to_director
3,who is listed as director of the movies starre...,Chris Columbus|David Yates|Alfonso Cuarón|Mike...,movie_to_actor_to_movie_to_director
4,what types are the films directed by the direc...,Action|Comedy|Western|Thriller|Crime,movie_to_director_to_movie_to_genre


In [9]:
import re
import pandas as pd
import torch
from concurrent.futures import ThreadPoolExecutor, as_completed
import os
import time

# Regular expression to extract entities enclosed in square brackets
entity_pattern = re.compile(r'\[(.*?)\]')

# Store question-related data for training the GNN
question_data = []
missing_entities = []  # Track missing entities

# Function to process each question
def process_question(row):
    question = row['question']
    answers = row['answers'].split('|')  # List of correct answers
    relation_steps = row['relation_type']  # The relation type from qa_train_qtype.txt

    # Extract entities enclosed in square brackets and normalize
    entities = entity_pattern.findall(question)

    if not entities:
        return None  # Skip if no entity found

    start_entity = entities[0]
    # start_entity = entities[0].strip().lower()  # Normalize start entity

    # Find the index of the start entity in the graph
    if start_entity not in graph_node_order:
        print(f"Warning: Start entity '{start_entity}' not found in graph.")
        missing_entities.append(start_entity)  # Track missing entity
        return None

    start_entity_idx = graph_node_order.index(start_entity)

    # Find the indices of the answer nodes and normalize answers
    answer_indices = []
    for answer in answers:
        # answer = answer.strip().lower()  # Normalize answer
        if answer in graph_node_order:
            answer_idx = graph_node_order.index(answer)
            answer_indices.append(answer_idx)
        else:
            print(f"Warning: Answer '{answer}' not found in graph.")
            missing_entities.append(answer)  # Track missing answer entity

    # Return the start entity index, relation steps, and answer indices for this question
    return {
        'start_entity_idx': start_entity_idx,
        'relation_steps': relation_steps,  # Now using the split relation steps directly
        'answer_indices': answer_indices
    }

# Process the data in parallel
max_workers = os.cpu_count()  # Number of available CPU cores
start_time = time.time()  # Track start time

# Collect results safely
results = []
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = [executor.submit(process_question, row) for idx, row in qa_combined_df.iterrows()]

    for future in as_completed(futures):
        result = future.result()
        if result:
            results.append(result)

# Append all results at once
question_data.extend(results)

# Now 'question_data' contains the necessary info for training
# Each entry contains the start entity index, relation steps, and the list of answer indices

# Visualize an entry
print(question_data[0])

end_time = time.time()  # Track end time
elapsed_time = end_time - start_time  # Print the time taken
print(f"Time taken: {elapsed_time:.2f} seconds")

# Optionally: print missing entities
print(f"Missing entities: {missing_entities}")


{'start_entity_idx': 5123, 'relation_steps': 'movie_to_director_to_movie_to_genre', 'answer_indices': [46, 157, 119, 52, 618]}
Time taken: 261.49 seconds
Missing entities: []


# Define GNN structure

In [10]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GNNModel(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super(GNNModel, self).__init__()
        self.conv1 = GCNConv(in_channels, 16)
        self.conv2 = GCNConv(16, out_channels)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

# Define training regime

In [11]:
import torch

# Check if a GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model to the GPU
model = GNNModel(in_channels=64, out_channels=2).to(device)

# Move the data to the GPU
data = data.to(device)

# Define optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

In [12]:
# Define the mapping in Python
relation_mapping = {
    'movie_to_actor_to_movie_to_year': ['starred_actors', 'starred_actors', 'release_year'],
    'movie_to_writer_to_movie_to_director': ['written_by', 'written_by', 'directed_by'],
    'movie_to_actor_to_movie_to_director': ['starred_actors', 'starred_actors', 'directed_by'],
    'movie_to_director_to_movie_to_genre': ['directed_by', 'directed_by', 'has_genre'],
    'movie_to_actor_to_movie_to_writer': ['starred_actors', 'starred_actors', 'written_by'],
    'movie_to_director_to_movie_to_language': ['directed_by', 'directed_by', 'in_language'],
    'movie_to_writer_to_movie_to_actor': ['written_by', 'written_by', 'starred_actors'],
    'movie_to_director_to_movie_to_actor': ['directed_by', 'directed_by', 'starred_actors'],
    'movie_to_actor_to_movie_to_language': ['starred_actors', 'starred_actors', 'in_language'],
    'movie_to_director_to_movie_to_year': ['directed_by', 'directed_by', 'release_year'],
    'movie_to_actor_to_movie_to_genre': ['starred_actors', 'starred_actors', 'has_genre'],
    'movie_to_director_to_movie_to_writer': ['directed_by', 'directed_by', 'written_by'],
    'movie_to_writer_to_movie_to_genre': ['written_by', 'written_by', 'has_genre'],
    'movie_to_writer_to_movie_to_year': ['written_by', 'written_by', 'release_year'],
    'movie_to_writer_to_movie_to_language': ['written_by', 'written_by', 'in_language']
}

In [13]:
def translate_relation_steps(relation_steps, relation_mapping):
    # Translate the entire relation string into its specific edge types
    return relation_mapping.get(relation_steps, relation_steps)  # Use mapping, or return the original if not found


### Troubleshooting (start)

In [14]:
# G.nodes

In [15]:
# q = question_data[0]

# start_entity_idx = q['start_entity_idx']
# relation_steps2 = q['relation_steps']  # Extract the parsed relation steps
# answer_indices = q['answer_indices']

# # Translate relation steps to actual edge types in the graph
# relation_steps = translate_relation_steps(relation_steps2, relation_mapping)

# # Get the start entity name from the global node list (graph_node_order)
# start_entity = graph_node_order[start_entity_idx]

# # Get the neighbors within 'hops' distance from the start entity
# sub_nodes = nx.single_source_shortest_path_length(G, start_entity, cutoff=3).keys()

# # Extract the subgraph from the main graph
# subgraph = G.subgraph(sub_nodes).copy()

# # Filter edges based on each step of the translated relation type
# filtered_edges = []
# for u, v, d in subgraph.edges(data=True):
#     # Check if the relation exists in the edge data and matches the translated steps
#     if 'relation' in d and d['relation'] in relation_steps:
#         filtered_edges.append((u, v))

# # Create a new filtered subgraph with only the relevant edges
# subgraph_filtered = subgraph.edge_subgraph(filtered_edges).copy()

# # Convert the filtered subgraph to a PyTorch Geometric Data object
# data_subgraph = from_networkx(subgraph_filtered)

# # Store the global node IDs as a list instead of
# data_subgraph.global_node_id = list(subgraph_filtered.nodes)

# subgraph_node_indices = [graph_node_order.index(node) for node in subgraph_filtered.nodes]

# # Extract and assign the node features (embeddings) from the full graph's data.x
# data_subgraph.x = data.x[subgraph_node_indices].to(device)  # Use the corresponding node embeddings from the full graph
# subgraph_data = data_subgraph

In [16]:
# for answer_idx in answer_indices:
#     # Map global answer_idx to the local subgraph node
#     if answer_idx in subgraph_data.global_node_id:
#         local_answer_idx = subgraph_data.global_node_id.index(answer_idx)  # Get the local index
#         subgraph_data.y[local_answer_idx] = 1

In [17]:
# subgraph_node_indices

In [18]:
# subgraph_filtered.nodes

In [19]:
# subgraph_node_indices

### Troubleshooting (end)

In [20]:
# def get_subgraph_with_relation_steps(start_entity_idx, G, data, relation_steps, hops=3):
#     # Translate relation steps to actual edge types in the graph
#     relation_steps = translate_relation_steps(relation_steps, relation_mapping)

#     # Get the start entity name from the global node list (graph_node_order)
#     start_entity = graph_node_order[start_entity_idx]

#     # Get the neighbors within 'hops' distance from the start entity
#     sub_nodes = nx.single_source_shortest_path_length(G, start_entity, cutoff=hops).keys()

#     # Extract the subgraph from the main graph
#     subgraph = G.subgraph(sub_nodes).copy()

#     # Filter edges based on each step of the translated relation type
#     filtered_edges = []
#     for u, v, d in subgraph.edges(data=True):
#         # Check if the relation exists in the edge data and matches the translated steps
#         if 'relation' in d and d['relation'] in relation_steps:
#             filtered_edges.append((u, v))

#     # Create a new filtered subgraph with only the relevant edges
#     subgraph_filtered = subgraph.edge_subgraph(filtered_edges).copy()

#     # Convert the filtered subgraph to a PyTorch Geometric Data object
#     data_subgraph = from_networkx(subgraph_filtered)

#     # Store the global node IDs as a list instead of a tensor
#     data_subgraph.global_node_id = list(subgraph_filtered.nodes)

#     # Ensure that the node order in the subgraph matches the embeddings in the original graph
#     subgraph_node_indices = [graph_node_order.index(node) for node in subgraph_filtered.nodes]

#     # Extract and assign the node features (embeddings) from the full graph's data.x
#     data_subgraph.x = data.x[subgraph_node_indices].to(device)  # Use the corresponding node embeddings from the full graph

#     return data_subgraph

In [21]:
def get_subgraph_with_relation_steps(start_entity_idx, G, data, relation_steps, hops=3):
    # Translate relation steps to actual edge types in the graph
    relation_steps = translate_relation_steps(relation_steps, relation_mapping)

    # Get the start entity name from the global node list (graph_node_order)
    start_entity = graph_node_order[start_entity_idx]

    # Get the neighbors within 'hops' distance from the start entity
    sub_nodes = nx.single_source_shortest_path_length(G, start_entity, cutoff=hops).keys()

    # Extract the subgraph from the main graph
    subgraph = G.subgraph(sub_nodes).copy()

    # Filter edges based on each step of the translated relation type
    filtered_edges = []
    for u, v, d in subgraph.edges(data=True):
        # Check if the relation exists in the edge data and matches the translated steps
        if 'relation' in d and d['relation'] in relation_steps:
            filtered_edges.append((u, v))

    # Create a new filtered subgraph with only the relevant edges
    subgraph_filtered = subgraph.edge_subgraph(filtered_edges).copy()

    # Convert the filtered subgraph to a PyTorch Geometric Data object
    data_subgraph = from_networkx(subgraph_filtered)

    # Ensure that the node order in the subgraph matches the embeddings in the original graph
    subgraph_node_indices = [graph_node_order.index(node) for node in subgraph_filtered.nodes]

    # Store the global node indices (which are integers, not strings) in global_node_id
    data_subgraph.global_node_id = subgraph_node_indices

    # Extract and assign the node features (embeddings) from the full graph's data.x
    data_subgraph.x = data.x[subgraph_node_indices].to(device)  # Use the corresponding node embeddings from the full graph

    return data_subgraph


In [22]:
# def train_gnn_with_subgraph(question_data, data, G, use_full_graph=False):
#     model.train()
#     total_loss = 0
#     missing_answer = 0
#     c = 0

#     # Ensure the full graph also has global_node_id if using full graph
#     if use_full_graph and not hasattr(data, 'global_node_id'):
#         # Create global_node_id for the full graph, which is the full list of node indices
#         graph_node_indices = [graph_node_order.index(node) for node in G.nodes]
#         data.global_node_id = graph_node_indices


#     for q in question_data:
#         c += 1

#         start_entity_idx = q['start_entity_idx']
#         relation_steps = q['relation_steps']  # Extract the parsed relation steps
#         answer_indices = q['answer_indices']

#         if use_full_graph:
#             # Use the entire graph for training
#             subgraph_data = data  # Use the full graph's data
#         else:
#             # Get the subgraph centered around the start entity and filtered by relation steps
#             subgraph_data = get_subgraph_with_relation_steps(start_entity_idx, G, data, relation_steps, hops=20)

#         # Move the graph data (either full or subgraph) to the correct device (e.g., GPU)
#         subgraph_data = subgraph_data.to(device)

#         optimizer.zero_grad()

#         # Forward pass through the GNN model (on the full graph or subgraph)
#         out = model(subgraph_data)  # Model predicts scores for all nodes in the (sub)graph

#         # Initialize subgraph_data.y (labels) to be all zeros (for the subgraph or full graph nodes)
#         subgraph_data.y = torch.zeros(subgraph_data.num_nodes, dtype=torch.long).to(device)

#         # Mark the correct answer nodes in the graph as 1
#         for answer_idx in answer_indices:
#             # Map global answer_idx to the local subgraph node
#             if answer_idx in subgraph_data.global_node_id:
#                 local_answer_idx = subgraph_data.global_node_id.index(answer_idx)  # Get the local index
#                 subgraph_data.y[local_answer_idx] = 1
#             else:
#                 missing_answer += 1
#                 print(f"Warning: Answer node {answer_idx} not found in the (sub)graph; counter: {missing_answer}; question_data: {c}")

#         # Compute the loss (CrossEntropy between the predicted outputs and actual labels)
#         loss = criterion(out, subgraph_data.y)
#         loss.backward()  # Backpropagate the gradients
#         optimizer.step()  # Update model parameters

#         total_loss += loss.item()

#     return total_loss


In [30]:
# (A)

from tqdm import tqdm

def train_gnn_with_subgraph(question_data, data, G, use_full_graph=False):
    model.train()
    total_loss = 0
    missing_answer = 0
    c = 0

    # Precompute global node IDs as a dictionary for fast lookups
    global_node_id_map = {node: idx for idx, node in enumerate(graph_node_order)}

    # Ensure the full graph also has global_node_id if using full graph
    if use_full_graph and not hasattr(data, 'global_node_id'):
        # Assign global_node_id for the full graph
        data.global_node_id = [global_node_id_map[node] for node in G.nodes]

    # Wrap question_data in tqdm to display a progress bar
    for q in tqdm(question_data, desc="Training Progress"):
        c += 1

        start_entity_idx = q['start_entity_idx']
        relation_steps = q['relation_steps']  # Extract the parsed relation steps
        answer_indices = q['answer_indices']

        if use_full_graph:
            # Use the entire graph for training
            subgraph_data = data  # Use the full graph's data
        else:
            # Get the subgraph centered around the start entity and filtered by relation steps
            subgraph_data = get_subgraph_with_relation_steps(start_entity_idx, G, data, relation_steps, hops=3)

        # Move the graph data (either full or subgraph) to the correct device (e.g., GPU)
        subgraph_data = subgraph_data.to(device)

        optimizer.zero_grad()

        # Forward pass through the GNN model (on the full graph or subgraph)
        out = model(subgraph_data)  # Model predicts scores for all nodes in the (sub)graph

        # Pre-initialize labels as zeros (subgraph_data.y) and map global to local indices
        subgraph_data.y = torch.zeros(subgraph_data.num_nodes, dtype=torch.long).to(device)
        global_to_local_idx = {global_id: local_id for local_id, global_id in enumerate(subgraph_data.global_node_id)}

        # Mark the correct answer nodes in the graph as 1
        for answer_idx in answer_indices:
            # Map global answer_idx to the local subgraph node using the precomputed global_to_local_idx
            local_answer_idx = global_to_local_idx.get(answer_idx)
            if local_answer_idx is not None:
                subgraph_data.y[local_answer_idx] = 1
            else:
                missing_answer += 1
                print(f"Warning: Answer node {answer_idx} not found in the (sub)graph; counter: {missing_answer}; question_data: {c}")

        # Compute the loss (CrossEntropy between the predicted outputs and actual labels)
        loss = criterion(out, subgraph_data.y)
        loss.backward()  # Backpropagate the gradients
        optimizer.step()  # Update model parameters

        total_loss += loss.item()

    return total_loss


In [32]:
from tqdm import tqdm

def train_gnn(question_data, data):
    model.train()

    total_loss = 0
    for q in tqdm(question_data, desc="Training Progress"):
        start_entity_idx = q['start_entity_idx']
        answer_indices = q['answer_indices']


        optimizer.zero_grad()

        # Forward pass through the GNN model
        out = model(data)  # out is a prediction for each node

        # Create the target labels (1 for answer nodes, 0 for others)
        labels = torch.zeros(data.num_nodes, dtype=torch.long).to(device)
        for answer_idx in answer_indices:
            labels[answer_idx] = 1

        # Compute the loss (CrossEntropy between predictions and labels)
        loss = criterion(out, labels)
        loss.backward()  # Backpropagate
        optimizer.step()  # Update model parameters

        total_loss += loss.item()

    return total_loss

# Train

In [33]:
# (A) Very slow

# from tqdm import tqdm  # For progress bars

# start_time = time.time()

# # Use tqdm to display progress bar
# # for epoch in tqdm(range(1), desc="Training Progress"):
# for epoch in range(1):
#     loss = train_gnn_with_subgraph(question_data, data, G, use_full_graph=False)
#     if epoch % 10 == 0:
#         print(f'Epoch {epoch}, Loss: {loss}')

# end_time = time.time()
# elapsed_time = end_time - start_time
# print(f"Time taken: {elapsed_time:.2f} seconds")


In [34]:
for epoch in range(1):
    loss = train_gnn(question_data, data)
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss}')


Training Progress: 100%|██████████| 114196/114196 [07:02<00:00, 270.00it/s]

Epoch 0, Loss: 261.0352136149886





# Predict

In [35]:
def predict_answer(model, data, start_entity_name):
    # Convert the start entity name to its index in the graph
    if start_entity_name not in node_to_index:
        print(f"Entity '{start_entity_name}' not found in the graph.")
        return None

    start_entity_idx = node_to_index[start_entity_name]

    # Forward pass through the model (entire graph) on the correct device
    with torch.no_grad():  # Disable gradient computation for inference
        out = model(data)  # Model predicts scores for all nodes (shape: [num_nodes, 2])

    # Ensure the output is on the correct device
    out = out.to(device)

    # Get neighbors of the start entity
    neighbors = list(G.neighbors(start_entity_name))
    neighbors_idx = [node_to_index[neighbor] for neighbor in neighbors if neighbor in node_to_index]

    # Create a mask: 1 for neighbor nodes, 0 otherwise
    mask = torch.zeros(out.shape[0], dtype=torch.bool).to(device)  # Ensure the mask is on the same device
    if neighbors_idx:  # Check if neighbors exist
        mask[neighbors_idx] = 1  # Mark neighbors of the start entity

    # Expand the mask to be broadcasted across the second dimension (class dimension)
    mask = mask.unsqueeze(1)  # Shape: [num_nodes, 1], which will broadcast to [num_nodes, 2]

    # Apply the mask to the output
    masked_out = out * mask  # Shape: [num_nodes, 2]

    # Apply torch.argmax to find the node with the highest score for class 1 (the "correct" class)
    predicted_answer_idx = torch.argmax(masked_out[:, 1])  # Pick the 1 class (assuming binary classification)

    # Convert the predicted index back to the node name (entity name)
    predicted_answer = graph_node_order[predicted_answer_idx.item()]  # Convert tensor to integer index and map to name

    return predicted_answer


In [36]:
predict_answer(model, data, 'Kismet') # ...

'Kismet'

# Part 2

# Load and setup dataset

- pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
<!-- - pip3 install torch torchvision torchaudio -->
- pip install torch-scatter
  - https://github.com/rusty1s/pytorch_scatter/issues/424
- pip install torch-geometric 
- pip install pandas
- pip install sentence-transformers
- pip install ipywidgets
- 

In [1]:
import importlib.util
import os

file_path = os.path.abspath('../data_preparation/functions_modified.py')

# Load the module dynamically
spec = importlib.util.spec_from_file_location("functions", file_path)
functions = importlib.util.module_from_spec(spec)
spec.loader.exec_module(functions)

In [2]:
path_to_node_embed = '../Datasets/MetaQA_dataset/processed/node2vec _embeddings/ud_node2vec_embeddings.txt'
path_to_idxes = '../Datasets/MetaQA_dataset/processed/idxes.json'
path_to_qa = '../Datasets/MetaQA_dataset/vanilla 3-hop/qa_train.txt'
data = functions.KGQADataset(path_to_node_embed, path_to_idxes, path_to_qa)

In [3]:
from torch.utils.data import DataLoader

# data_loader = DataLoader(data, batch_size=16, collate_fn=functions.collate_fn, shuffle=True)
# data_loader = DataLoader(data, batch_size=16, collate_fn=functions.collate_fn, shuffle=True)

# Define GNN

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GNNModel(nn.Module):
    def __init__(self, in_channels, question_embedding_dim, out_channels):
        super(GNNModel, self).__init__()
        PROC_QN_EMBED_DIM = 64  # Reduced question embedding dimension (keep it small)
        PROC_X_DIM = 64          # Reduced node embedding dimension (keep it small)

        # GCN layers for node embeddings
        self.conv1 = GCNConv(in_channels, 32)  # First GCN layer
        self.conv2 = GCNConv(32, PROC_X_DIM)   # Second GCN layer, reduces node embedding to 4

        # Fully connected layer for reducing question embeddings
        self.fc0 = nn.Linear(question_embedding_dim, PROC_QN_EMBED_DIM)

        # Fully connected layer applied to each node embedding (after concatenating with question embedding)
        self.fc1 = nn.Linear(PROC_X_DIM + PROC_QN_EMBED_DIM, 32)  # FCL for each node
        self.fc2 = nn.Linear(32, out_channels)  # Final output (binary classification per node)

    def forward(self, data, question_embedding):
        # Graph propagation through GCN layers
        x, edge_index, batch = data.x, data.edge_index, data.batch
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        # input(f"x.shape: {x.shape}")

        # Reduce the question embedding
        # input(f"before reduction question_embedding.shape: {question_embedding.shape}")
        question_embedding = F.relu(self.fc0(question_embedding))  # Shape: (batch_size, PROC_QN_EMBED_DIM)
        # input(f"After reduction question_embedding.shape: {question_embedding.shape}")

        # Broadcast question embedding to match each node in the batch
        question_embedding_expanded = question_embedding[batch]  # Shape: (num_nodes_total, PROC_QN_EMBED_DIM)
        # input(f"After broacast question_embedding.shape: {question_embedding_expanded.shape}")

        # Concatenate node embeddings with question embeddings
        combined = torch.cat([x, question_embedding_expanded], dim=1)  # Shape: (num_nodes_total, PROC_X_DIM + PROC_QN_EMBED_DIM)
        # input(f"combined.shape: {combined.shape}")

        # Apply FCL node-wise (same FCL for each node, which outputs per node predictions)
        x = F.relu(self.fc1(combined))  # Shape: (num_nodes_total, 8)
        x = self.fc2(x)                 # Shape: (num_nodes_total, out_channels)
        # input(f"output.shape: {x.shape}")

        # Return node-wise predictions (logits for each node)
        return x


# Define training loop

In [23]:
sub_data1 = torch.utils.data.Subset(data, list(range(1280)))
dataloader_train = DataLoader(sub_data1, batch_size=4, collate_fn=functions.collate_fn, shuffle=True)

In [69]:
from tqdm import tqdm
progress_bar = tqdm(dataloader_train)
pos_cum = 0
neg_cum = 0
neg_pos = []
for b in progress_bar:
    _, _, labels, _ = b
    pos = sum(labels)
    neg = len(labels) - pos
    pos_cum += pos
    neg_cum += neg
    neg_pos.append(neg/pos)

100%|██████████| 320/320 [02:19<00:00,  2.29it/s]


In [114]:
print(pos_cum, neg_cum, neg_cum/pos_cum)
print(torch.tensor([sum(neg_pos)/len(neg_pos)],device=device))
print(torch.tensor([max(neg_pos)],device=device))

tensor([16301]) tensor([14497185]) tensor([889.3433])
tensor([1485.6534], device='cuda:0')
tensor([9080.1670], device='cuda:0')


In [102]:
import torch.optim as optim
from tqdm import tqdm
import torch_scatter

# Define the device (use GPU if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize the GNN model
in_channels = 64  # Node embedding dimension (e.g., node2vec embeddings)
question_embedding_dim = 384  # Assuming 32-dimensional question embeddings
out_channels = 1  # Binary classification (answer or not)
model = GNNModel(in_channels, question_embedding_dim, out_channels).to(device)

# Calculate the number of positive and negative samples for class weighting
num_positive = pos_cum  # Set the number of positive examples in your dataset
num_negative = neg_cum  # Set the number of negative examples in your dataset
factor = 10^2
# Set the positive weight (to balance the class distribution)
# The higher the imbalance, the larger this value will be
# pos_weight = torch.tensor([num_negative / num_positive], device=device)
pos_weight = torch.tensor([factor*sum(neg_pos)/len(neg_pos)],device=device)

# Set up optimizer and weighted loss function
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight, reduction='none')

# Training loop
num_epochs = 1
for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    total_loss = 0

    # Add tqdm for batches within the epoch
    progress_bar = tqdm(dataloader_train, desc=f"Epoch {epoch+1}/{num_epochs}")

    for batch in progress_bar:
        subgraph_data, question_embeddings, labels, _ = batch

        # Move data to the correct device
        subgraph_data = subgraph_data.to(device)
        question_embeddings = question_embeddings.to(device)
        labels = labels.to(device)

        # Forward pass
        out = model(subgraph_data, question_embeddings)

        # Compute the number of nodes per subgraph
        num_subgraphs = len(subgraph_data.batch.unique())
        nodes_per_subgraph = subgraph_data.batch.bincount(minlength=num_subgraphs).float()  # Shape: [num_subgraphs]

        # Compute weighted loss for each node
        loss = criterion(out, labels.float())

        # Sum the losses for all nodes in each subgraph
        loss_per_subgraph = torch_scatter.scatter_add(loss, subgraph_data.batch, dim=0)

        # Normalize the loss per subgraph by the number of nodes in each subgraph
        normalized_loss_per_subgraph = loss_per_subgraph / nodes_per_subgraph

        # Take the mean of normalized losses for the batch
        batch_loss = normalized_loss_per_subgraph.mean()

        # Backward pass and optimization
        optimizer.zero_grad()
        batch_loss.backward()  # Backpropagate
        optimizer.step()  # Optimization step

        total_loss += batch_loss.detach().item()

        # Update tqdm with the current loss
        progress_bar.set_postfix(loss=total_loss)

    # Optionally clear CUDA memory cache after each epoch
    torch.cuda.empty_cache()

    # Print total loss per epoch
    print(f"Epoch {epoch+1}/{num_epochs}, Total Loss: {total_loss:.4f}")


Epoch 1/1: 100%|██████████| 320/320 [01:54<00:00,  2.79it/s, loss=5.75e+3]

Epoch 1/1, Total Loss: 5752.3131





In [117]:
import torch_scatter
from sklearn.metrics import precision_score, recall_score, f1_score

model.eval()  # Set the model to evaluation mode
batch_accuracies = []  # List to store accuracy of each question/subgraph
batch_precisions = []  # List to store precision of each subgraph
batch_recalls = []  # List to store recall of each subgraph
batch_f1s = []  # List to store F1-score of each subgraph
pred_list = []

# Add tqdm for batches within the epoch
progress_bar = tqdm(dataloader_train, desc="Training")

with torch.no_grad():
    for batch in progress_bar:
        subgraph_data, question_embeddings, labels, _ = batch  # Assuming labels are in the batch

        # Move data to the device
        subgraph_data = subgraph_data.to(device)
        question_embeddings = question_embeddings.to(device)
        labels = labels.to(device)  # Move the ground truth labels to the device

        # Forward pass
        out = model(subgraph_data, question_embeddings)

        # Detach the predicted labels
        predicted = torch.argmax(out, dim=1).detach()  # Detach to avoid gradient tracking

        # Keep everything on the same device (GPU)
        predicted_flat = predicted.view(-1)
        labels_flat = labels.view(-1)
        pred_list.append(sum(predicted_flat))

        # Compute accuracy for all nodes (1 for correct predictions, 0 for incorrect)
        correct_predictions = (predicted_flat == labels_flat).int()

        # Use subgraph_data.batch to group predictions by subgraph (make sure it's on the same device)
        num_subgraphs = len(subgraph_data.batch.unique())
        nodes_per_subgraph = subgraph_data.batch.bincount(minlength=num_subgraphs)  # Count nodes in each subgraph

        # Sum correct predictions per subgraph
        correct_per_subgraph = torch_scatter.scatter_add(correct_predictions, subgraph_data.batch, dim=0)

        # Calculate accuracy per subgraph
        subgraph_accuracies = correct_per_subgraph / nodes_per_subgraph

        # Append accuracies for each subgraph in the batch
        batch_accuracies.extend(subgraph_accuracies.cpu().tolist())  # Move the results to CPU

        # Compute precision, recall, and F1-score per subgraph
        for i in range(num_subgraphs):
            node_mask = (subgraph_data.batch == i)  # Get the mask for the current subgraph

            # Get the true labels and predictions for this subgraph
            labels_subgraph = labels_flat[node_mask].cpu().numpy()  # Convert to numpy for sklearn
            predicted_subgraph = predicted_flat[node_mask].cpu().numpy()

            if len(set(labels_subgraph)) > 1:  # Avoid cases where only one class exists
                precision = precision_score(labels_subgraph, predicted_subgraph, average='binary', zero_division=0)
                recall = recall_score(labels_subgraph, predicted_subgraph, average='binary', zero_division=0)
                f1 = f1_score(labels_subgraph, predicted_subgraph, average='binary', zero_division=0)
            else:
                # Handle the case when all labels are the same (sklearn can raise a warning or return 0)
                precision, recall, f1 = 0.0, 0.0, 0.0

            # Store the precision, recall, and F1-score
            batch_precisions.append(precision)
            batch_recalls.append(recall)
            batch_f1s.append(f1)

# Compute the average accuracy, precision, recall, and F1-score over all subgraphs in the batch
average_accuracy = sum(batch_accuracies) / len(batch_accuracies)
average_precision = sum(batch_precisions) / len(batch_precisions)
average_recall = sum(batch_recalls) / len(batch_recalls)
average_f1 = sum(batch_f1s) / len(batch_f1s)

print(f"Average Accuracy: {average_accuracy:.8f}")
print(f"Average Precision: {average_precision:.8f}")
print(f"Average Recall: {average_recall:.8f}")
print(f"Average F1-Score: {average_f1:.8f}")


Training: 100%|██████████| 320/320 [02:30<00:00,  2.13it/s]

Average Accuracy: 0.99730150
Average Precision: 0.00000000
Average Recall: 0.00000000
Average F1-Score: 0.00000000





In [123]:
# pred_list
p = 0
pb = tqdm(pred_list)
for l in pb:
    p += sum(l)

100%|██████████| 320/320 [03:23<00:00,  1.57it/s]


In [124]:
p

tensor(0, device='cuda:0')

In [84]:
sub_data2 = torch.utils.data.Subset(data, list(range(1281, 1600)))
dataloader_val = DataLoader(sub_data2, batch_size=16, collate_fn=functions.collate_fn, shuffle=True)

In [85]:
import torch_scatter
from sklearn.metrics import precision_score, recall_score, f1_score

model.eval()  # Set the model to evaluation mode
batch_accuracies = []  # List to store accuracy of each question/subgraph
batch_precisions = []  # List to store precision of each subgraph
batch_recalls = []  # List to store recall of each subgraph
batch_f1s = []  # List to store F1-score of each subgraph

# Add tqdm for batches within the epoch
progress_bar = tqdm(dataloader_val, desc="Validation")

with torch.no_grad():
    for batch in progress_bar:
        subgraph_data, question_embeddings, labels, _ = batch  # Assuming labels are in the batch

        # Move data to the device
        subgraph_data = subgraph_data.to(device)
        question_embeddings = question_embeddings.to(device)
        labels = labels.to(device)  # Move the ground truth labels to the device

        # Forward pass
        out = model(subgraph_data, question_embeddings)

        # Detach the predicted labels
        predicted = torch.argmax(out, dim=1).detach()  # Detach to avoid gradient tracking

        # Keep everything on the same device (GPU)
        predicted_flat = predicted.view(-1)
        labels_flat = labels.view(-1)

        # Compute accuracy for all nodes (1 for correct predictions, 0 for incorrect)
        correct_predictions = (predicted_flat == labels_flat).int()

        # Use subgraph_data.batch to group predictions by subgraph (make sure it's on the same device)
        num_subgraphs = len(subgraph_data.batch.unique())
        nodes_per_subgraph = subgraph_data.batch.bincount(minlength=num_subgraphs)  # Count nodes in each subgraph

        # Sum correct predictions per subgraph
        correct_per_subgraph = torch_scatter.scatter_add(correct_predictions, subgraph_data.batch, dim=0)

        # Calculate accuracy per subgraph
        subgraph_accuracies = correct_per_subgraph / nodes_per_subgraph

        # Append accuracies for each subgraph in the batch
        batch_accuracies.extend(subgraph_accuracies.cpu().tolist())  # Move the results to CPU

        # Compute precision, recall, and F1-score per subgraph
        for i in range(num_subgraphs):
            node_mask = (subgraph_data.batch == i)  # Get the mask for the current subgraph

            # Get the true labels and predictions for this subgraph
            labels_subgraph = labels_flat[node_mask].cpu().numpy()  # Convert to numpy for sklearn
            predicted_subgraph = predicted_flat[node_mask].cpu().numpy()

            if len(set(labels_subgraph)) > 1:  # Avoid cases where only one class exists
                precision = precision_score(labels_subgraph, predicted_subgraph, average='binary', zero_division=0)
                recall = recall_score(labels_subgraph, predicted_subgraph, average='binary', zero_division=0)
                f1 = f1_score(labels_subgraph, predicted_subgraph, average='binary', zero_division=0)
            else:
                # Handle the case when all labels are the same (sklearn can raise a warning or return 0)
                precision, recall, f1 = 0.0, 0.0, 0.0

            # Store the precision, recall, and F1-score
            batch_precisions.append(precision)
            batch_recalls.append(recall)
            batch_f1s.append(f1)

# Compute the average accuracy, precision, recall, and F1-score over all subgraphs in the batch
average_accuracy = sum(batch_accuracies) / len(batch_accuracies)
average_precision = sum(batch_precisions) / len(batch_precisions)
average_recall = sum(batch_recalls) / len(batch_recalls)
average_f1 = sum(batch_f1s) / len(batch_f1s)

print(f"Average Accuracy: {average_accuracy:.8f}")
print(f"Average Precision: {average_precision:.8f}")
print(f"Average Recall: {average_recall:.8f}")
print(f"Average F1-Score: {average_f1:.8f}")


Validation: 100%|██████████| 20/20 [00:36<00:00,  1.84s/it]

Average Accuracy: 0.99727723
Average Precision: 0.00000000
Average Recall: 0.00000000
Average F1-Score: 0.00000000





In [28]:
test = functions.KGQADataset(path_to_node_embed, path_to_idxes, '../Datasets/MetaQA_dataset/vanilla 3-hop/qa_test.txt')
sub_data3 = torch.utils.data.Subset(test, list(range(1601, 1920)))
dataloader_test = DataLoader(sub_data3, batch_size=16, collate_fn=functions.collate_fn, shuffle=True)

In [None]:
import torch_scatter
from sklearn.metrics import precision_score, recall_score, f1_score

model.eval()  # Set the model to evaluation mode
batch_accuracies = []  # List to store accuracy of each question/subgraph
batch_precisions = []  # List to store precision of each subgraph
batch_recalls = []  # List to store recall of each subgraph
batch_f1s = []  # List to store F1-score of each subgraph

# Add tqdm for batches within the epoch
progress_bar = tqdm(dataloader_test, desc="Testing")

with torch.no_grad():
    for batch in progress_bar:
        subgraph_data, question_embeddings, labels, _ = batch  # Assuming labels are in the batch

        # Move data to the device
        subgraph_data = subgraph_data.to(device)
        question_embeddings = question_embeddings.to(device)
        labels = labels.to(device)  # Move the ground truth labels to the device

        # Forward pass
        out = model(subgraph_data, question_embeddings)

        # Detach the predicted labels
        predicted = torch.argmax(out, dim=1).detach()  # Detach to avoid gradient tracking

        # Keep everything on the same device (GPU)
        predicted_flat = predicted.view(-1)
        labels_flat = labels.view(-1)

        # Compute accuracy for all nodes (1 for correct predictions, 0 for incorrect)
        correct_predictions = (predicted_flat == labels_flat).int()

        # Use subgraph_data.batch to group predictions by subgraph (make sure it's on the same device)
        num_subgraphs = len(subgraph_data.batch.unique())
        nodes_per_subgraph = subgraph_data.batch.bincount(minlength=num_subgraphs)  # Count nodes in each subgraph

        # Sum correct predictions per subgraph
        correct_per_subgraph = torch_scatter.scatter_add(correct_predictions, subgraph_data.batch, dim=0)

        # Calculate accuracy per subgraph
        subgraph_accuracies = correct_per_subgraph / nodes_per_subgraph

        # Append accuracies for each subgraph in the batch
        batch_accuracies.extend(subgraph_accuracies.cpu().tolist())  # Move the results to CPU

        # Compute precision, recall, and F1-score per subgraph
        for i in range(num_subgraphs):
            node_mask = (subgraph_data.batch == i)  # Get the mask for the current subgraph

            # Get the true labels and predictions for this subgraph
            labels_subgraph = labels_flat[node_mask].cpu().numpy()  # Convert to numpy for sklearn
            predicted_subgraph = predicted_flat[node_mask].cpu().numpy()

            if len(set(labels_subgraph)) > 1:  # Avoid cases where only one class exists
                precision = precision_score(labels_subgraph, predicted_subgraph, average='binary', zero_division=0)
                recall = recall_score(labels_subgraph, predicted_subgraph, average='binary', zero_division=0)
                f1 = f1_score(labels_subgraph, predicted_subgraph, average='binary', zero_division=0)
            else:
                # Handle the case when all labels are the same (sklearn can raise a warning or return 0)
                precision, recall, f1 = 0.0, 0.0, 0.0

            # Store the precision, recall, and F1-score
            batch_precisions.append(precision)
            batch_recalls.append(recall)
            batch_f1s.append(f1)

# Compute the average accuracy, precision, recall, and F1-score over all subgraphs in the batch
average_accuracy = sum(batch_accuracies) / len(batch_accuracies)
average_precision = sum(batch_precisions) / len(batch_precisions)
average_recall = sum(batch_recalls) / len(batch_recalls)
average_f1 = sum(batch_f1s) / len(batch_f1s)

print(f"Average Accuracy: {average_accuracy:.8f}")
print(f"Average Precision: {average_precision:.8f}")
print(f"Average Recall: {average_recall:.8f}")
print(f"Average F1-Score: {average_f1:.8f}")
