In [1]:
import torch
import os
import numpy as np
import matplotlib.pyplot as plt
import copy
from tqdm import tqdm
import re


from torch_geometric.data import HeteroData, Batch
from torch_geometric.transforms import ToUndirected
from torch_geometric.loader import LinkNeighborLoader
from sklearn.manifold import TSNE

In [2]:
# =========================
# 1. Build the Heterogeneous Graph
# =========================

def get_graph(dataset_folder):
    # Load edge data (convert lists to tensors if necessary)
    # Recipe -> Ingredient edges
    r_i_src, r_i_dst, r_i_weight = torch.load(os.path.join(dataset_folder, 'edge_r2i_src_dst_weight.pt'))
    r_i_src = torch.tensor(r_i_src, dtype=torch.long)
    r_i_dst = torch.tensor(r_i_dst, dtype=torch.long)
    r_i_weight = torch.tensor(r_i_weight, dtype=torch.float)

    # Recipe -> Recipe edges
    r_r_src, r_r_dst, r_r_weight = torch.load(os.path.join(dataset_folder, 'edge_r2r_src_and_dst_and_weight.pt'))
    r_r_src = torch.tensor(r_r_src, dtype=torch.long)
    r_r_dst = torch.tensor(r_r_dst, dtype=torch.long)
    r_r_weight = torch.tensor(r_r_weight, dtype=torch.float)

    # Ingredient -> Ingredient edges
    i_i_src, i_i_dst, i_i_weight = torch.load(os.path.join(dataset_folder, 'edge_i2i_src_and_dst_and_weight.pt'))
    i_i_src = torch.tensor(i_i_src, dtype=torch.long)
    i_i_dst = torch.tensor(i_i_dst, dtype=torch.long)
    i_i_weight = torch.tensor(i_i_weight, dtype=torch.float)

    # User -> Recipe edges (all, train, val, test splits)
    all_u2r = torch.load(os.path.join(dataset_folder, 'all_train_val_test_edge_u_rate_r_src_and_dst_and_weight.pt'))
    # all_u2r is a tuple: (all, train, val, test)
    all_u2r_src_dst_weight = all_u2r[0]
    train_u2r_src_dst_weight = all_u2r[1]
    val_u2r_src_dst_weight = all_u2r[2]
    test_u2r_src_dst_weight = all_u2r[3]
    
    # For the full graph, we use "all" edges:
    u_rate_r_src, u_rate_r_dst, u_rate_r_weight = all_u2r_src_dst_weight
    u_rate_r_src = torch.tensor(u_rate_r_src, dtype=torch.long)
    u_rate_r_dst = torch.tensor(u_rate_r_dst, dtype=torch.long)
    u_rate_r_weight = torch.tensor(u_rate_r_weight, dtype=torch.float)

    # Number of nodes (from your logs)
    num_recipe = 68794
    num_ingredient = 8847
    num_user = 7959

    data = HeteroData()
    data['recipe'].num_nodes = num_recipe
    data['ingredient'].num_nodes = num_ingredient
    data['user'].num_nodes = num_user

    # Add edges for each relation
    # recipe -> ingredient ("r-i")
    data['recipe', 'r-i', 'ingredient'].edge_index = torch.stack([r_i_src, r_i_dst], dim=0)
    data['recipe', 'r-i', 'ingredient'].edge_weight = r_i_weight
    # ingredient -> recipe ("i-r")
    data['ingredient', 'i-r', 'recipe'].edge_index = torch.stack([r_i_dst, r_i_src], dim=0)
    data['ingredient', 'i-r', 'recipe'].edge_weight = r_i_weight

    # recipe -> recipe ("r-r")
    data['recipe', 'r-r', 'recipe'].edge_index = torch.stack([r_r_src, r_r_dst], dim=0)
    data['recipe', 'r-r', 'recipe'].edge_weight = r_r_weight

    # ingredient -> ingredient ("i-i")
    data['ingredient', 'i-i', 'ingredient'].edge_index = torch.stack([i_i_src, i_i_dst], dim=0)
    data['ingredient', 'i-i', 'ingredient'].edge_weight = i_i_weight

    # user -> recipe ("u-r")
    data['user', 'u-r', 'recipe'].edge_index = torch.stack([u_rate_r_src, u_rate_r_dst], dim=0)
    data['user', 'u-r', 'recipe'].edge_weight = u_rate_r_weight

    # recipe -> user ("r-u")
    data['recipe', 'r-u', 'user'].edge_index = torch.stack([u_rate_r_dst, u_rate_r_src], dim=0)
    data['recipe', 'r-u', 'user'].edge_weight = u_rate_r_weight

    # (Optional) Make the graph undirected if desired.
    # data = ToUndirected()(data)

    # Load node features
    recipe_instr_features = torch.load(os.path.join(dataset_folder, 'recipe_nodes_avg_instruction_features.pt'))
    ingredient_nutrient_features = torch.load(os.path.join(dataset_folder, 'ingredient_nodes_nutrient_features.pt'))
    data['recipe'].x = recipe_instr_features   # shape: [68794, feature_dim_recipe]
    data['ingredient'].x = ingredient_nutrient_features  # shape: [8847, feature_dim_ingredient]

    return data, all_u2r, train_u2r_src_dst_weight, val_u2r_src_dst_weight, test_u2r_src_dst_weight

dataset_folder = "data/"
data, all_u2r, train_u2r_src_dst_weight, val_u2r_src_dst_weight, test_u2r_src_dst_weight = get_graph(dataset_folder)
print(data)

HeteroData(
  recipe={
    num_nodes=68794,
    x=[68794, 1024],
  },
  ingredient={
    num_nodes=8847,
    x=[8847, 46],
  },
  user={ num_nodes=7959 },
  (recipe, r-i, ingredient)={
    edge_index=[2, 463485],
    edge_weight=[463485],
  },
  (ingredient, i-r, recipe)={
    edge_index=[2, 463485],
    edge_weight=[463485],
  },
  (recipe, r-r, recipe)={
    edge_index=[2, 647146],
    edge_weight=[647146],
  },
  (ingredient, i-i, ingredient)={
    edge_index=[2, 146188],
    edge_weight=[146188],
  },
  (user, u-r, recipe)={
    edge_index=[2, 135353],
    edge_weight=[135353],
  },
  (recipe, r-u, user)={
    edge_index=[2, 135353],
    edge_weight=[135353],
  }
)


In [3]:
for node_type in data.node_types:
    print(f"{node_type} has {data[node_type].num_nodes} nodes, x shape = {data[node_type].x.shape if 'x' in data[node_type] else 'no features'}")

for edge_type in data.edge_types:
    edge_idx = data[edge_type].edge_index
    print(f"Edge type {edge_type} has {edge_idx.size(1)} edges, edge_weight shape = {data[edge_type].edge_weight.shape if 'edge_weight' in data[edge_type] else 'no weights'}")

recipe has 68794 nodes, x shape = torch.Size([68794, 1024])
ingredient has 8847 nodes, x shape = torch.Size([8847, 46])
user has 7959 nodes, x shape = no features
Edge type ('recipe', 'r-i', 'ingredient') has 463485 edges, edge_weight shape = torch.Size([463485])
Edge type ('ingredient', 'i-r', 'recipe') has 463485 edges, edge_weight shape = torch.Size([463485])
Edge type ('recipe', 'r-r', 'recipe') has 647146 edges, edge_weight shape = torch.Size([647146])
Edge type ('ingredient', 'i-i', 'ingredient') has 146188 edges, edge_weight shape = torch.Size([146188])
Edge type ('user', 'u-r', 'recipe') has 135353 edges, edge_weight shape = torch.Size([135353])
Edge type ('recipe', 'r-u', 'user') has 135353 edges, edge_weight shape = torch.Size([135353])


In [4]:
def create_split_data(data, split_u2r):
    """
    Given a HeteroData object and a tuple split_u2r = (src, dst, weight),
    update the "u-r" and "r-u" relations with the provided split.
    """
    split_src, split_dst, split_weight = split_u2r
    split_src = torch.tensor(split_src, dtype=torch.long)
    split_dst = torch.tensor(split_dst, dtype=torch.long)
    split_weight = torch.tensor(split_weight, dtype=torch.float)
    
    data['user', 'u-r', 'recipe'].edge_index = torch.stack([split_src, split_dst], dim=0)
    data['user', 'u-r', 'recipe'].edge_weight = split_weight
    
    data['recipe', 'r-u', 'user'].edge_index = torch.stack([split_dst, split_src], dim=0)
    data['recipe', 'r-u', 'user'].edge_weight = split_weight
    return data

train_data = create_split_data(copy.deepcopy(data), train_u2r_src_dst_weight)
val_data   = create_split_data(copy.deepcopy(data), val_u2r_src_dst_weight)
test_data  = create_split_data(copy.deepcopy(data), test_u2r_src_dst_weight)


In [5]:
from torch_geometric.loader import LinkNeighborLoader
def unify_train_val_collate(data_list):
    """
    A collate function that performs the default PyG collation, 
    then checks if PyG assigned an attribute `edge_label`.
    If so, we copy it into `edge_attr`.
    """
    batch = Batch.from_data_list(data_list)
    edge_store = batch['user','u-r','recipe']
    # If PyG's negative sampler attached `.edge_label`, unify it into `edge_attr`.
    if hasattr(edge_store, 'edge_label'):
        edge_store.edge_attr = getattr(edge_store, 'edge_label')
    return batch

# For training, we use train_data
train_loader = LinkNeighborLoader(
    data=train_data,
    num_neighbors=[20, 20],
    edge_label_index=('user','u-r','recipe'),
    batch_size=1024,
    shuffle=True,
    neg_sampling_ratio=5,   # built-in negative sampling
    collate_fn=unify_train_val_collate
)

val_loader = LinkNeighborLoader(
    data=val_data,
    num_neighbors=[20, 20],
    edge_label_index=('user','u-r','recipe'),
    batch_size=128,
    shuffle=False,
    neg_sampling_ratio=5,   # built-in negative sampling
    collate_fn=unify_train_val_collate
)

print('# of batches in train_loader:', len(train_loader))
print('# of batches in val_loader:', len(val_loader))

# of batches in train_loader: 117
# of batches in val_loader: 63


In [13]:
n_test_negs = 100  # number of negatives per positive edge for testing
dataset_folder = "data/"

def load_test_negatives(dataset_folder):
    """
    Loads a dictionary mapping each user ID to a list of negative recipe IDs.
    Expects the file 'test_negatives_100.txt' in dataset_folder.
    """
    user2negs = {}
    filename = os.path.join(dataset_folder, 'test_negatives_100.txt')
    with open(filename, "r") as f:
        lines = f.readlines()
        for line in tqdm(lines, desc="Loading test negatives"):
            line = line.strip()
            if not line:
                continue
            # Assume the first token looks like "(123)" or "u123"
            user_str = line.split('\t')[0]
            user = int(re.sub(r'\D', '', user_str))
            negs = [int(neg) for neg in line.split('\t')[1:]]
            user2negs[user] = negs
    return user2negs

user2negs = load_test_negatives(dataset_folder)

test_edge_store = test_data['user','u-r','recipe']
test_pos_edges = test_edge_store.edge_index  # shape [2, num_test_pos]
# Row 0 = user IDs, row 1 = recipe IDs

# If your test split is a one-edge-per-user scenario, you might gather a dict:
# user2pos = { user_id: recipe_id }
# or if a user can have multiple test recipes, store them in a list.

user2pos = {}
src = test_pos_edges[0].tolist()
dst = test_pos_edges[1].tolist()
for u,r in zip(src, dst):
    user2pos.setdefault(u, []).append(r)  # handle multi-positives if needed

print("Number of test users in user2pos:", len(user2pos))
# Build test_loader with neg_sampling_ratio=0 and our custom collate function.
# `merged_edge_index` is your shape [2, E] tensor
test_loader = LinkNeighborLoader(
    data=test_data,
    num_neighbors=[20, 20],
    edge_label_index=( ('user','u-r','recipe'), merged_edge_index ),
    edge_label=merged_labels,  # shape [E]
    batch_size=128,
    shuffle=False,
    neg_sampling_ratio=0,
)

print('# of batches in test_loader:', len(test_loader))

Loading test negatives: 100%|██████████| 7959/7959 [00:00<00:00, 59366.13it/s]


Constructed negative edges: 795900
Total test edges after merging pos+neg: 803859
# of batches in test_loader: 6281


In [16]:
for batch in train_loader:
    print("Batch keys:", batch.keys())
    print("Edge index shape:", batch['user', 'u-r', 'recipe'].edge_index.shape)
    print("Edge label shape:", batch['user', 'u-r', 'recipe'].edge_label.shape)
    break

Batch keys: ['edge_label_index', 'edge_weight', 'edge_index', 'num_nodes', 'x', 'input_id', 'e_id', 'edge_label', 'n_id']
Edge index shape: torch.Size([2, 73813])
Edge label shape: torch.Size([6144])


In [None]:
import torch
import time
import numpy as np
from src.models.baseRecommender import Recommender

# 1) Define a Trivial Model with Random Scores
class TrivialModel(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, batch):
        # This model returns random scores for each edge between 0 and 1
        edge_index = batch['user', 'u-r', 'recipe'].edge_index
        num_edges = edge_index.size(1)
        return torch.rand(num_edges, device=edge_index.device)  # Random scores in [0, 1]

# 2) Define a Trivial Recommender using the Random Model
class TrivialRecommender(Recommender):
    def __init__(self, device='cpu'):
        super().__init__(device=device)
        self.model = TrivialModel().to(device)

    def fit(self, data):
        # No training needed
        pass

# 3) Instantiate & Evaluate
trivial_recommender = TrivialRecommender(device='cpu')

print("Evaluating trivial recommender on validation split...")

val_start = time.time()
val_metrics = trivial_recommender.evaluate(val_loader, topk=[1,2,3,5,10], verbose=True)
val_end = time.time()

print(f"Trivial model evaluation on validation set took {val_end - val_start:.2f} seconds.")
print("Validation Metrics:", val_metrics)

Evaluating trivial recommender on validation split...
Batch has 128 positives, 0 negatives
Batch has 128 positives, 0 negatives
Batch has 128 positives, 0 negatives
Batch has 128 positives, 0 negatives
Batch has 128 positives, 0 negatives
Batch has 128 positives, 0 negatives
Batch has 128 positives, 0 negatives
Batch has 128 positives, 0 negatives
Batch has 128 positives, 0 negatives
Batch has 128 positives, 0 negatives
Batch has 128 positives, 0 negatives
Batch has 128 positives, 0 negatives
Batch has 128 positives, 0 negatives
Batch has 128 positives, 0 negatives
Batch has 128 positives, 0 negatives
Batch has 128 positives, 0 negatives
Batch has 128 positives, 0 negatives
Batch has 128 positives, 0 negatives
Batch has 128 positives, 0 negatives
Batch has 128 positives, 0 negatives
Batch has 128 positives, 0 negatives
Batch has 128 positives, 0 negatives
Batch has 128 positives, 0 negatives
Batch has 128 positives, 0 negatives
Batch has 128 positives, 0 negatives
Batch has 128 positiv

In [9]:
for batch in val_loader:
    edge_scores = trivial_recommender.model(batch)  # Model prediction scores
    print(edge_scores[:10])  # Check the first few scores
    break

tensor([0.7438, 0.2415, 0.2863, 0.7740, 0.9380, 0.7662, 0.0797, 0.5724, 0.7723,
        0.3471])


In [10]:
%%time
num_sample = 10000
indices = torch.randperm(num_recipes)[:num_sample]
sample_emb = data["recipe"].x[indices].detach().cpu().numpy()

# Perform TSNE in 2D
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
emb_2d = tsne.fit_transform(sample_emb)

# Plot
plt.figure(figsize=(7,6))
plt.scatter(emb_2d[:,0], emb_2d[:,1], s=5, alpha=0.7, c="blue")
plt.title(f"t-SNE of Recipe Embeddings (sample of {num_sample})")
plt.xlabel("Dimension 1")
plt.ylabel("Dimension 2")
plt.show()

NameError: name 'num_recipes' is not defined