In [101]:
import torch
import os
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize


from torch_geometric.data import HeteroData
from torch_geometric.transforms import ToUndirected
from sklearn.manifold import TSNE

# Install faiss
%pip install faiss-cpu
import faiss


Note: you may need to restart the kernel to use updated packages.


In [184]:
import numpy as np
import faiss
from scipy.sparse import csr_matrix
from data_loader import load_split_data  # Corrected import

# Load and split the data
train_set, valid_set, test_set = load_split_data()

# Ensure train_set is a sparse matrix
train_set_sparse = csr_matrix(train_set)  # Now placed correctly

# Define number of users and recipes
num_users = train_set.shape[0]  # Users are rows
num_recipes = train_set.shape[1]  # Recipes are columns

# Convert sparse matrix to dense for FAISS indexing
train_set_dense = train_set_sparse.toarray()

# Build FAISS index for fast k-NN search
index = faiss.IndexFlatIP(num_recipes)  # Inner product for cosine similarity
index.add(train_set_dense)  # Ensure FAISS uses a dense representation

print("Data loaded and FAISS index built successfully.")


Data loaded and FAISS index built successfully.


## Version #2

In [177]:
def get_popular_recipes(k):
    """Return k popular recipes based on summed interactions."""
    recipe_popularity = train_set_sparse.sum(axis=0).A1  # Convert to dense array
    top_recipes = np.argsort(recipe_popularity)[-k * 2:][::-1]  # Select more than needed
    return np.random.choice(top_recipes, k, replace=False).tolist()  # Random subset

def recommend_recipes(user_id, k=5, filter_seen=True):
    """
    Recommend top-k recipes for a user based on similar user interactions.

    Args:
        user_id (int): User ID.
        k (int): Number of recommendations.
        filter_seen (bool): Whether to remove seen recipes (set to False for training).

    Returns:
        list: Recommended recipe IDs.
    """
    if user_id >= num_users:
        raise ValueError("User ID out of range.")

    # Extract seen recipes
    seen_recipes = set(train_set_sparse[user_id].indices)  # Get column indices (recipe interactions)
    
    if len(seen_recipes) == 0:
        print(f"User {user_id} has no interactions. Recommending popular recipes.")
        return get_popular_recipes(k)

    # Search for similar users using FAISS
    _, nearest_users = index.search(train_set_sparse[user_id].toarray().reshape(1, -1), k + 1)
    nearest_users = nearest_users.flatten()[1:]  # Exclude self

    recommended_recipes = []
    for similar_user in nearest_users:
        user_interactions = train_set_sparse[similar_user].indices  # Get column indices
        recommended_recipes.extend(user_interactions)

    # Deduplicate and sort by occurrence (recipes seen by multiple users rank higher)
    recommended_recipes = sorted(set(recommended_recipes), key=recommended_recipes.count, reverse=True)

    # Apply seen recipe filtering only if filter_seen is True
    if filter_seen:
        recommended_recipes = [r for r in recommended_recipes if r not in seen_recipes]

    # Prevent empty recommendation list
    if len(recommended_recipes) < k:
        recommended_recipes.extend(get_popular_recipes(k - len(recommended_recipes)))

    return recommended_recipes[:k]  # Ensure we return only k recommendations

# Example Usage:
user_id = 0

# Inference (default behavior: removes seen recipes)
recommended_recipes_inference = recommend_recipes(user_id, k=5)
print(f"Inference Recommendations for User {user_id}: {recommended_recipes_inference}")

# Training Mode (keeps seen recipes for ranking)
recommended_recipes_train = recommend_recipes(user_id, k=5, filter_seen=False)
print(f"Training Recommendations for User {user_id}: {recommended_recipes_train}")

Inference Recommendations for User 0: [np.int32(9428), np.int32(4115), np.int32(9773), np.int32(32790), np.int32(9777)]
Training Recommendations for User 0: [np.int32(9428), np.int32(9486), np.int32(9752), np.int32(4115), np.int32(9773)]


In [178]:
## Testing

max_recipe_id = num_recipes - 1

# Ensure all recommended recipes are valid IDs within range
recommended_recipes_train = recommend_recipes(0, k=5, filter_seen=False)
print(all(idx < num_recipes for idx in recommended_recipes_train))  # Should print True

# Get top 5 similar users using FAISS search on UserKNN
_, nearest_users = index.search(train_set_dense[0].reshape(1, -1), 6)  # Get top 5 similar users
print(f"Top similar users to User 0: {nearest_users.flatten()[1:]}")  # Exclude itself

# Check interactions of similar users
for u in nearest_users.flatten()[1:]:
    print(f"User {u} interacted with recipes: {train_set_sparse[u].indices}")

# Ensure different numbers of recommendations work correctly
print(recommend_recipes(0, k=10))  # Should return more results
print(recommend_recipes(0, k=3))  # Should return fewer results

# Check recommendations for another user
user_1_recommendations = recommend_recipes(1, k=5)
print(f"Recommended recipes for User 1: {user_1_recommendations}")

# Verify seen recipes for user 0
user_0_seen_recipes = set(train_set_sparse[0].indices)
print("Already seen by user 0:", user_0_seen_recipes)

# Compare recommended recipes with seen recipes to check filter behavior
print("Recommended:", recommended_recipes_train)
print("Overlap:", set(recommended_recipes_train) & user_0_seen_recipes)  # Should be small or empty

True
Top similar users to User 0: [4358 5772 2502 1774 5241]
User 4358 interacted with recipes: [ 9428  9486 18027 18158 20230 38969]
User 5772 interacted with recipes: [ 7341  7909  9752 13367 14540 15498 15926 16310 18072 18087 18088 18090
 20456 20462 22827 22828 25730 31898 34007 34349 35844 37822 38566 40527
 40591 40592 40593 40594 40595 40596 40597 40598 40599 40600 40601 40602
 40603 40604 40605]
User 2502 interacted with recipes: [ 1790 32790 49330]
User 1774 interacted with recipes: [18045 49147 57764]
User 5241 interacted with recipes: [ 318 1143 1383 1443 1447 1473 1481 1547 1726 1736 1743 1756 1800 1868
 2117 3452 3631 3642 3885 3887 3908 3924 4049 4052 4075 4115 4220 4297
 4360 4362 4404 4505 4524 4631 4644 4683 4741 4803 4864 5614 5624 5681
 5803 5823 5841 5906 5911 5981 6120 6141 6143 6201 6203 6273 6299 6312
 6333 6350 6382 6401 6440 6603 6665 7370 7466 8733 8789 8795 8829 8918
 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088
 9089 9090 9091 9092 

In [179]:
def precision_at_k(recommended, ground_truth, k):
    """
    Compute Precision@K.

    Args:
        recommended (list): Top-K recommended items.
        ground_truth (list): Ground truth items.
        k (int): Rank cutoff.

    Returns:
        float: Precision@K score.
    """
    if not ground_truth:
        return 0.0

    hits = len(set(recommended[:k]) & set(ground_truth))
    return hits / k


def hit_rate_at_k(recommended, ground_truth, k):
    """
    Compute Hit Rate@K.

    Args:
        recommended (list): Top-K recommended items.
        ground_truth (list): Ground truth items.
        k (int): Rank cutoff.

    Returns:
        float: Hit Rate@K score.
    """
    return 1.0 if set(recommended[:k]) & set(ground_truth) else 0.0


def ndcg_at_k(recommended, ground_truth, k):
    """
    Compute Normalized Discounted Cumulative Gain (NDCG@K).

    Args:
        recommended (list): Top-K recommended items.
        ground_truth (list): Ground truth items.
        k (int): Rank cutoff.

    Returns:
        float: NDCG@K score.
    """
    def dcg(recommended, ground_truth, k):
        return sum((1 / np.log2(i + 2)) for i, item in enumerate(recommended[:k]) if item in ground_truth)

    ideal_dcg = dcg(sorted(ground_truth, reverse=True), ground_truth, k)
    return dcg(recommended, ground_truth, k) / ideal_dcg if ideal_dcg > 0 else 0.0


def map_at_k(recommended, ground_truth, k):
    """
    Compute Mean Average Precision@K (MAP@K).

    Args:
        recommended (list): Top-K recommended items.
        ground_truth (list): Ground truth items.
        k (int): Rank cutoff.

    Returns:
        float: MAP@K score.
    """
    precision_values = [precision_at_k(recommended, ground_truth, i + 1) for i in range(k) if recommended[i] in ground_truth]
    return np.mean(precision_values) if precision_values else 0.0



In [183]:
# Generate Negative Samples from Test Set
def generate_negative_samples_test(user_id, num_samples=100):
    """Generate random negative recipe samples from the test set."""
    user_interactions = set(test_set_sparse[user_id].indices)  # Use test set interactions
    all_recipes = set(range(num_recipes))
    negative_samples = list(all_recipes - user_interactions)  # Remove seen interactions

    return np.random.choice(negative_samples, min(len(negative_samples), num_samples), replace=False)


# Evaluate using test set with negative sampling
def leave_one_out_cv_test(user_id):
    """Perform Leave-One-Out Cross-Validation (LOO) using test set."""
    user_interactions = set(test_set_sparse[user_id].indices)  # Use test set interactions
    if len(user_interactions) < 2:
        return list(user_interactions), None  # Not enough data

    test_item = np.random.choice(list(user_interactions))  # Hold out one item
    test_set_user = list(user_interactions - {test_item})  # Remaining interactions

    return test_set_user, test_item

# Run Evaluation on Test Set
user_id = 0
test_set_user, test_item = leave_one_out_cv_test(user_id)

if test_item:
    recommended = recommend_recipes(user_id, k=10)
    negative_samples = generate_negative_samples_test(user_id, num_samples=100)

    print(f"Precision@10: {precision_at_k(recommended, [test_item], 10):.4f}")
    print(f"Hit Rate@10: {hit_rate_at_k(recommended, [test_item], 10):.4f}")
    print(f"NDCG@10: {ndcg_at_k(recommended, [test_item], 10):.4f}")
    print(f"MAP@10: {map_at_k(recommended, [test_item], 10):.4f}")

    print(f"Negative Samples for User {user_id}: {negative_samples}")  # Debugging Output

Precision@10: 0.0000
Hit Rate@10: 0.0000
NDCG@10: 0.0000
MAP@10: 0.0000
Negative Samples for User 0: [18940 30064 36376 60742 50177 64908  6066 47345 63740  4815 31770  4363
 68250   227  8133 11755 26242 53733 31498 25715 34652 19243  5314  7520
 45553 49975 18676 63667   960 58663  1339 11192 60084 48871 55196 10985
 24924 58653  6415 53152 21729 33143 14951 38081  6496 14623  4549 40082
 66104 16422  6149 21848  8362  4548 57950 48915 39580 28692 34284 45600
 55028  8479 23513 68139 11155 22625  6293 64341 67426 43923 42447  7714
 40040  7722 10075 56747  7834 66439 20725  3304 42144 29419 28846 19714
 39593 53626 59527 36885 16189 16791 24992 50104 42148  4063 58610 30019
 37904  4298 17534 39318]


In [186]:
# Generate Negative Samples from Test Set
def generate_negative_samples_test(user_id, num_samples=100):
    """Generate random negative recipe samples from the test set."""
    user_interactions = set(test_set[user_id].indices)
    all_recipes = set(range(num_recipes))
    negative_samples = list(all_recipes - user_interactions)

    return np.random.choice(negative_samples, min(len(negative_samples), num_samples), replace=False)

# Evaluate using test set with negative sampling
def leave_one_out_cv_test(user_id):
    """Perform Leave-One-Out Cross-Validation (LOO) using test set."""
    user_interactions = set(test_set[user_id].indices)
    if len(user_interactions) < 2:
        return list(user_interactions), None

    test_item = np.random.choice(list(user_interactions))
    test_set_user = list(user_interactions - {test_item})

    return test_set_user, test_item

if __name__ == "__main__":
    train_set, valid_set, test_set = load_split_data()
    
    # Ensure train_set is in sparse format
    train_set_sparse = csr_matrix(train_set)
    test_set_sparse = csr_matrix(test_set)

    print("Data loaded and FAISS index built successfully.")

    # Build FAISS index for fast k-NN search
    num_users, num_recipes = train_set.shape
    index = faiss.IndexFlatIP(num_recipes)  # Inner product for cosine similarity
    index.add(train_set_sparse.toarray())  # Convert sparse to dense for FAISS

    # Run Evaluation on Test Set
    user_id = 0
    test_set_user, test_item = leave_one_out_cv_test(user_id)

    if test_item:
        recommended = recommend_recipes(user_id, k=10)
        negative_samples = generate_negative_samples_test(user_id, num_samples=100)

        print(f"Precision@10: {precision_at_k(recommended, [test_item], 10):.4f}")
        print(f"Hit Rate@10: {hit_rate_at_k(recommended, [test_item], 10):.4f}")
        print(f"NDCG@10: {ndcg_at_k(recommended, [test_item], 10):.4f}")
        print(f"MAP@10: {map_at_k(recommended, [test_item], 10):.4f}")
        print(f"Negative Samples for User {user_id}: {negative_samples}")

Data loaded and FAISS index built successfully.


AttributeError: 'numpy.ndarray' object has no attribute 'indices'