In [101]:
import torch
import os
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize


from torch_geometric.data import HeteroData
from torch_geometric.transforms import ToUndirected
from sklearn.manifold import TSNE

# Install faiss
%pip install faiss-cpu
import faiss


Note: you may need to restart the kernel to use updated packages.


In [102]:
dataset_folder = "data/"

# 1. Load edge data (Recipe–Ingredient)
r_i_src, r_i_dst, r_i_weight = torch.load(
    os.path.join(dataset_folder, "edge_r2i_src_dst_weight.pt")
)

# 2. Load edge data (Recipe–Recipe)
r_r_src, r_r_dst, r_r_weight = torch.load(
    os.path.join(dataset_folder, "edge_r2r_src_and_dst_and_weight.pt")
)

# 3. Load edge data (Ingredient–Ingredient)
i_i_src, i_i_dst, i_i_weight = torch.load(
    os.path.join(dataset_folder, "edge_i2i_src_and_dst_and_weight.pt")
)

# 4. Load edge data (User–Recipe)
all_u2r_src_dst_weight = torch.load(
    os.path.join(dataset_folder, "all_train_val_test_edge_u_rate_r_src_and_dst_and_weight.pt")
)
all_u2r_src, all_u2r_dst, all_u2r_weight = all_u2r_src_dst_weight[0]

# 5. Load node data (Recipe/Ingredient)
recipe_instr_features = torch.load(
    os.path.join(dataset_folder, "recipe_nodes_avg_instruction_features.pt")
)
ingredient_nutrient_features = torch.load(
    os.path.join(dataset_folder, "ingredient_nodes_nutrient_features.pt")
)

data = HeteroData()

# Convert lists to tensors if necessary
all_u2r_src = torch.tensor(all_u2r_src, dtype=torch.long)
all_u2r_dst = torch.tensor(all_u2r_dst, dtype=torch.long)
all_u2r_weight = torch.tensor(all_u2r_weight, dtype=torch.float)

r_i_src = torch.tensor(r_i_src, dtype=torch.long)
r_i_dst = torch.tensor(r_i_dst, dtype=torch.long)
r_i_weight = torch.tensor(r_i_weight, dtype=torch.float)

r_r_src = torch.tensor(r_r_src, dtype=torch.long)
r_r_dst = torch.tensor(r_r_dst, dtype=torch.long)
r_r_weight = torch.tensor(r_r_weight, dtype=torch.float)

i_i_src = torch.tensor(i_i_src, dtype=torch.long)
i_i_dst = torch.tensor(i_i_dst, dtype=torch.long)
i_i_weight = torch.tensor(i_i_weight, dtype=torch.float)

# NODES:
# We often need to define the number of nodes per type.
# For example, from your logs:
num_users = 7959
num_recipes = 68794
num_ingredients = 8847

data["user"].num_nodes = num_users
data["recipe"].num_nodes = num_recipes
data["ingredient"].num_nodes = num_ingredients

# EDGES:
# user -> recipe
data["user", "u-r", "recipe"].edge_index = torch.stack([all_u2r_src, all_u2r_dst], dim=0)
data["user", "u-r", "recipe"].edge_weight = all_u2r_weight

# recipe -> ingredient
data["recipe", "r-i", "ingredient"].edge_index = torch.stack([r_i_src, r_i_dst], dim=0)
data["recipe", "r-i", "ingredient"].edge_weight = r_i_weight

# recipe -> recipe
data["recipe", "r-r", "recipe"].edge_index = torch.stack([r_r_src, r_r_dst], dim=0)
data["recipe", "r-r", "recipe"].edge_weight = r_r_weight

# ingredient -> ingredient
data["ingredient", "i-i", "ingredient"].edge_index = torch.stack([i_i_src, i_i_dst], dim=0)
data["ingredient", "i-i", "ingredient"].edge_weight = i_i_weight

# If you want to make the graph fully bidirectional:
data = ToUndirected()(data)

# recipe_instr_features: [68794, feature_dim_recipe]
# ingredient_nutrient_features: [8847, feature_dim_ingr]
data["recipe"].x = recipe_instr_features
data["ingredient"].x = ingredient_nutrient_features

# Suppose we use random user features, e.g., shape [7959, 300]:
#user_feat = torch.rand(num_users, 300)
#data["user"].x = user_feat

In [103]:
# Load the HeteroData graph (assumed to be preloaded as 'data')
num_users = data["user"].num_nodes
num_recipes = data["recipe"].num_nodes

# Extract user-recipe interaction data
user_recipe_src = data["user", "u-r", "recipe"].edge_index[0]
user_recipe_dst = data["user", "u-r", "recipe"].edge_index[1]
user_recipe_weight = data["user", "u-r", "recipe"].edge_weight



In [104]:

# Create a sparse user-recipe interaction matrix
interaction_matrix = csr_matrix((user_recipe_weight.numpy(), (user_recipe_src.numpy(), user_recipe_dst.numpy())),
                                shape=(num_users, num_recipes))

# Normalize interaction matrix
interaction_matrix = normalize(interaction_matrix, norm='l2', axis=1)

# Convert to dense numpy array (only if memory allows)
user_embeddings = interaction_matrix.toarray().astype(np.float32)

# Build FAISS index for fast k-NN search
index = faiss.IndexFlatIP(num_recipes)  # Inner product for cosine similarity
index.add(user_embeddings)



## Version #1

In [105]:
# Function to get top-k recommended recipes for a given user
def recommend_recipes(user_id, k=5):
    if user_id >= num_users:
        raise ValueError("User ID out of range.")
    
    # Get top-k similar users
    _, nearest_users = index.search(user_embeddings[user_id].reshape(1, -1), k+1)
    nearest_users = nearest_users.flatten()[1:]  # Exclude self
    
    # Aggregate recipe interactions from similar users
    recommended_recipes = set()
    for similar_user in nearest_users:
        user_interactions = interaction_matrix[similar_user].nonzero()[1]  # Get recipe indices
        recommended_recipes.update(user_interactions)
    
    return list(recommended_recipes)[:k]

In [106]:
# Example: Get top 5 recommendations for user 0
user_id = 0
recommended_recipes = recommend_recipes(user_id, k=5)
print(f"Recommended recipes for User {user_id}: {recommended_recipes}")

Recommended recipes for User 0: [np.int32(41856), np.int32(2), np.int32(260), np.int32(261), np.int32(262)]


Testing

In [107]:
max_recipe_id = num_recipes - 1
print(all(idx < num_recipes for idx in recommended_recipes))  # Should print True

True


In [108]:
_, nearest_users = index.search(user_embeddings[0].reshape(1, -1), 6)  # Get top 5 similar users
print(f"Top similar users to User 0: {nearest_users.flatten()[1:]}")  # Exclude itself

Top similar users to User 0: [7174 6037 5411 3273    5]


In [109]:
for u in nearest_users.flatten()[1:]:
    print(f"User {u} interacted with recipes: {interaction_matrix[u].nonzero()[1]}")

User 7174 interacted with recipes: [    2 34715 51795]
User 6037 interacted with recipes: [    2  9560 51268 68168]
User 5411 interacted with recipes: [    2  4068 24428 67297]
User 3273 interacted with recipes: [    2  2037  5240  8173  8231 15801 17444 22565 25057 28952 29470 32007
 36728 37047 41188 41230 41856 42845 48877 54705 61792 61793 61794 61795
 61796 61797]
User 5 interacted with recipes: [260 261 262 263]


In [110]:
print(recommend_recipes(0, k=10))  # Should return more results
print(recommend_recipes(0, k=3))  # Should return fewer results

[np.int32(2), np.int32(22565), np.int32(8231), np.int32(51268), np.int32(37047), np.int32(41188), np.int32(498), np.int32(260), np.int32(261), np.int32(262)]
[np.int32(67297), np.int32(2), np.int32(51268)]


In [111]:
user_1_recommendations = recommend_recipes(1, k=5)
print(f"Recommended recipes for User 1: {user_1_recommendations}")

Recommended recipes for User 1: [np.int32(11), np.int32(37909), np.int32(22), np.int32(23), np.int32(8226)]


In [112]:
user_0_seen_recipes = set(interaction_matrix[0].nonzero()[1])
print("Already seen by user 0:", user_0_seen_recipes)
print("Recommended:", recommended_recipes)
print("Overlap:", set(recommended_recipes) & user_0_seen_recipes)  # Should be small or empty

Already seen by user 0: {np.int32(0), np.int32(1), np.int32(2), np.int32(3)}
Recommended: [np.int32(41856), np.int32(2), np.int32(260), np.int32(261), np.int32(262)]
Overlap: {np.int32(2)}


## Version #2

In [113]:
def get_popular_recipes(k):
    recipe_popularity = interaction_matrix.sum(axis=0).A1
    top_recipes = np.argsort(recipe_popularity)[-k*2:][::-1]  # Select more than needed
    return np.random.choice(top_recipes, k, replace=False).tolist()  # Random subset

def recommend_recipes(user_id, k=5):
    if user_id >= num_users:
        raise ValueError("User ID out of range.")
    
    seen_recipes = set(interaction_matrix[user_id].nonzero()[1])
    if len(seen_recipes) == 0:
        print(f"User {user_id} has no interactions. Recommending popular recipes.")
        return get_popular_recipes(k)

    _, nearest_users = index.search(user_embeddings[user_id].reshape(1, -1), k+1)
    nearest_users = nearest_users.flatten()[1:]  # Exclude self

    # Aggregate recipe interactions from similar users
    recommended_recipes = set()
    for similar_user in nearest_users:
        user_interactions = interaction_matrix[similar_user].nonzero()[1]
        recommended_recipes.update(user_interactions)

    # Remove already seen recipes
    recommended_recipes = list(recommended_recipes - seen_recipes)

    return recommended_recipes[:k]

In [114]:
# Example: Get top 5 recommendations for user 0
user_id = 0
recommended_recipes = recommend_recipes(user_id, k=5)
print(f"Recommended recipes for User {user_id}: {recommended_recipes}")

Recommended recipes for User 0: [np.int32(41856), np.int32(260), np.int32(261), np.int32(262), np.int32(32007)]


Testing

In [115]:
max_recipe_id = num_recipes - 1
print(all(idx < num_recipes for idx in recommended_recipes))  # Should print True

True


In [116]:
_, nearest_users = index.search(user_embeddings[0].reshape(1, -1), 6)  # Get top 5 similar users
print(f"Top similar users to User 0: {nearest_users.flatten()[1:]}")  # Exclude itself

Top similar users to User 0: [7174 6037 5411 3273    5]


In [117]:
for u in nearest_users.flatten()[1:]:
    print(f"User {u} interacted with recipes: {interaction_matrix[u].nonzero()[1]}")

User 7174 interacted with recipes: [    2 34715 51795]
User 6037 interacted with recipes: [    2  9560 51268 68168]
User 5411 interacted with recipes: [    2  4068 24428 67297]
User 3273 interacted with recipes: [    2  2037  5240  8173  8231 15801 17444 22565 25057 28952 29470 32007
 36728 37047 41188 41230 41856 42845 48877 54705 61792 61793 61794 61795
 61796 61797]
User 5 interacted with recipes: [260 261 262 263]


In [118]:
print(recommend_recipes(0, k=10))  # Should return more results
print(recommend_recipes(0, k=3))  # Should return fewer results

[np.int32(8231), np.int32(37047), np.int32(41188), np.int32(260), np.int32(261), np.int32(262), np.int32(263), np.int32(264), np.int32(265), np.int32(266)]
[np.int32(67297), np.int32(4068), np.int32(51268)]


In [119]:
user_1_recommendations = recommend_recipes(1, k=5)
print(f"Recommended recipes for User 1: {user_1_recommendations}")

Recommended recipes for User 1: [np.int32(37909), np.int32(8226), np.int32(21887), np.int32(33321), np.int32(2095)]


In [120]:
user_0_seen_recipes = set(interaction_matrix[0].nonzero()[1])
print("Already seen by user 0:", user_0_seen_recipes)
print("Recommended:", recommended_recipes)
print("Overlap:", set(recommended_recipes) & user_0_seen_recipes)  # Should be small or empty

Already seen by user 0: {np.int32(0), np.int32(1), np.int32(2), np.int32(3)}
Recommended: [np.int32(41856), np.int32(260), np.int32(261), np.int32(262), np.int32(32007)]
Overlap: set()


## Metrics

In [121]:
def leave_one_out_cv(user_id):
    """
    Perform Leave-One-Out Cross-Validation (LOO) by removing one interaction for testing.

    Args:
        user_id (int): User ID.

    Returns:
        tuple: (Train set, test item)
    """
    user_interactions = set(interaction_matrix[user_id].nonzero()[1])
    if len(user_interactions) < 2:
        return list(user_interactions), None  # Not enough data

    test_item = np.random.choice(list(user_interactions))  # Hold out one for testing
    train_set = list(user_interactions - {test_item})

    return train_set, test_item


In [122]:
def generate_negative_samples(user_id, num_samples=100):
    """
    Generate random negative samples (recipes NOT interacted with by the user).

    Args:
        user_id (int): User ID.
        num_samples (int): Number of negative samples.

    Returns:
        list: List of negative sample recipe IDs.
    """
    user_interactions = set(interaction_matrix[user_id].nonzero()[1])
    all_recipes = set(range(num_recipes))
    negative_samples = list(all_recipes - user_interactions)

    return np.random.choice(negative_samples, min(len(negative_samples), num_samples), replace=False)

In [123]:
# Leave in seen recipes for evaluation
def recommend_recipes(user_id, k=10):
    """
    Generate top-K recipe recommendations for a given user.

    Args:
        user_id (int): User ID.
        k (int): Number of recommendations.

    Returns:
        list: List of recommended recipe IDs.
    """
    if user_id >= num_users:
        raise ValueError("User ID out of range.")

    # Get top-K similar users
    _, nearest_users = index.search(user_embeddings[user_id].reshape(1, -1), k + 1)
    nearest_users = nearest_users.flatten()[1:]  # Exclude self

    # Aggregate interactions from similar users
    recommended_recipes = set()
    for similar_user in nearest_users:
        user_interactions = interaction_matrix[similar_user].nonzero()[1]  # Get recipe indices
        recommended_recipes.update(user_interactions)

    # Do NOT exclude seen recipes (since we need rankings)
    return sorted(recommended_recipes, key=lambda x: -interaction_matrix[user_id, x])[:k]

In [124]:
def precision_at_k(recommended, ground_truth, k):
    """
    Compute Precision@K.

    Args:
        recommended (list): Top-K recommended items.
        ground_truth (list): Ground truth items.
        k (int): Rank cutoff.

    Returns:
        float: Precision@K score.
    """
    if not ground_truth:
        return 0.0

    hits = len(set(recommended[:k]) & set(ground_truth))
    return hits / k

In [125]:
def hit_rate_at_k(recommended, ground_truth, k):
    """
    Compute Hit Rate@K.

    Args:
        recommended (list): Top-K recommended items.
        ground_truth (list): Ground truth items.
        k (int): Rank cutoff.

    Returns:
        float: Hit Rate@K score.
    """
    return 1.0 if set(recommended[:k]) & set(ground_truth) else 0.0

In [126]:
def ndcg_at_k(recommended, ground_truth, k):
    """
    Compute Normalized Discounted Cumulative Gain (NDCG@K).

    Args:
        recommended (list): Top-K recommended items.
        ground_truth (list): Ground truth items.
        k (int): Rank cutoff.

    Returns:
        float: NDCG@K score.
    """
    def dcg(recommended, ground_truth, k):
        return sum((1 / np.log2(i + 2)) for i, item in enumerate(recommended[:k]) if item in ground_truth)

    ideal_dcg = dcg(sorted(ground_truth, reverse=True), ground_truth, k)
    return dcg(recommended, ground_truth, k) / ideal_dcg if ideal_dcg > 0 else 0.0

In [127]:
def map_at_k(recommended, ground_truth, k):
    """
    Compute Mean Average Precision@K (MAP@K).

    Args:
        recommended (list): Top-K recommended items.
        ground_truth (list): Ground truth items.
        k (int): Rank cutoff.

    Returns:
        float: MAP@K score.
    """
    precision_values = [precision_at_k(recommended, ground_truth, i + 1) for i in range(k) if recommended[i] in ground_truth]
    return np.mean(precision_values) if precision_values else 0.0

In [128]:
### 🚀 Run Full Evaluation 🚀 ###
user_id = 0
train_set, test_item = leave_one_out_cv(user_id)

if test_item:
    recommended = recommend_recipes(user_id, k=10)
    negative_samples = generate_negative_samples(user_id, num_samples=100)

    print(f"Precision@10: {precision_at_k(recommended, [test_item], 10):.4f}")
    print(f"Hit Rate@10: {hit_rate_at_k(recommended, [test_item], 10):.4f}")
    print(f"NDCG@10: {ndcg_at_k(recommended, [test_item], 10):.4f}")
    print(f"MAP@10: {map_at_k(recommended, [test_item], 10):.4f}")

Precision@10: 0.1000
Hit Rate@10: 1.0000
NDCG@10: 1.0000
MAP@10: 1.0000
