In [3]:
# enables auto-reload of files (%...function MUST BE WITHOUT SPACE!)
%load_ext autoreload
%autoreload 2

In [None]:
pip install pandarallel

In [None]:
pip install tqdm

In [None]:
pip install pandas

In [4]:
import warnings
import utils
import score_util
from  exploration import exploration_util
import matplotlib.pyplot as plt
import matplotlib.ticker as pltt
import seaborn as sns
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

In [5]:
# Define data locations
data_location = '../../data/hummus_data/'
graph_location = '../../data/food_kg/'
additional_location = '../../data/hummus_data/' # if not present set recipe_tags=False

-----------------------------------------------------------
# Load & clean Dataset
-----------------------------------------------------------

In [None]:
# Import & clean data -- use k_user = 10, k_recipe=10
recipes_df, reviews_df, users_df, recipes_dict, user_dict, food_locator_dict, food_com_dict, data = utils.load_and_clean_data(data_location, additional_location, k_user=10, k_recipe=10,
                                                                                  add_recipe_columns=['food_kg_locator'],
                                                                                  authorship_relations=6, recipe_tags=True, debug=True)


### Create Sparsity Table (NEW CODE)

In [None]:
#Finalized Sparsity Generation Code (New Code)

from scipy.sparse import csr_matrix
sparsities = pd.DataFrame(index=['(10,10)','(100,2)','(2,50)','(2,100)'], columns=['#recipes','#users','#interac','sparsity'])

k_values = [(10,10),(100,2),(2,50),(2,100)]

for k in k_values:
    # Import & clean data -- use k_user = 10, k_recipe=10
    recipes_df, reviews_df, users_df, recipes_dict, user_dict, food_locator_dict, food_com_dict, data = utils.load_and_clean_data(data_location, additional_location, k_user=k[0], k_recipe=k[1],
                                                                                    add_recipe_columns=['food_kg_locator'],
                                                                                    authorship_relations=6, recipe_tags=True, debug=True)
    
    # Build a sparse matrix (user x recipe x ratings)
    user_recipe_matrix = csr_matrix((reviews_df['rating'], (reviews_df['new_member_id'], reviews_df['new_recipe_id'])))

    # All users and recipes
    users = list(user_dict.keys())
    recipes = list(recipes_dict.keys())

    # Calculate the sparsity [in %]
    A = user_recipe_matrix.toarray()
    sparsity = 1.0 - (np.count_nonzero(A) / float(A.size) )

    index = str(k)
    sparsities.loc[index, '#recipes'] = len(recipes)
    sparsities.loc[index, '#users'] = len(users)
    sparsities.loc[index, '#interac'] = reviews_df.shape[0]
    sparsities.loc[index, 'sparsity'] = (sparsity * 100) 

display(sparsities[4:])

---

In [None]:
# Compute a dataframe consisting of nutrients normalized on 100g (takes ~15min), only necessary for the food score calculation
normalized_ingredients = utils.normalize_ingredients(recipes_df)

In [None]:
# Get all foodKG ingredient links
ingredients_df, ingredients_dict, mapped_recipes = utils.load_ingredient_dict(recipes_df, graph_location)

In [None]:
# Get food product label dict (from our data set)
label_dict = utils.load_ingredient_tags(graph_location)

-----------------------------------------------------------
# Calculate food scores
-----------------------------------------------------------

In [None]:
# Calculate food scores.
recipes_df = score_util.calculate_food_scores(recipes_df, normalized_ingredients, score_names=['who', 'fsa', 'nutri'], normalize=True)
recipes_df.head()

In [None]:
# Plot the score distribution histogram
plt.hist(recipes_df['who_score'], bins=10)
# Add labels and title
plt.xlabel('Healthiness Score')
plt.ylabel('Count')
plt.title('Healthiness Score Distribution')
# Show the plot
plt.show()

In [None]:
# Store preprocessed data
recipes_df.to_csv(data_location + 'pp_recipes.csv')
users_df.to_csv(data_location + 'pp_members.csv')
reviews_df.to_csv(data_location + 'pp_reviews.csv')

-----------------------------------------------------------
# Explore data set
-----------------------------------------------------------

In [1]:
# Merge data (again, because scores were not available before)
data_scores = pd.merge(recipes_df, reviews_df, right_on='new_recipe_id', left_on='new_recipe_id')

# Copy data set for exploration and remove ingredient/duration/direction_size outliers (to compute nicer pictures)
data_vis = exploration_util.remove_outliers(data_scores, ["duration", "direction_size", "totalFat [g]", "calories [cal]", "totalCarbohydrate [g]", "sugars [g]"])

NameError: name 'pd' is not defined

In [None]:
# Plot duration & direction_size distribution
fig, ax = plt.subplots(1, 2, figsize=(15, 8))
sns.distplot(data_vis["duration"], ax=ax[0]).set_xlim(0, 600)
ax[1].xaxis.set_major_formatter(pltt.FuncFormatter(lambda x, _: int(x)))
sns.distplot(data_vis["direction_size"], ax=ax[1]).set_xlim(0, 20)

In [None]:
# Plot ingredient distribution
plt.rcParams.update({'font.size': 18})
fig, ax = plt.subplots(1, 4, figsize=(30, 10))
ax[0].set_ylabel('Density', fontsize = 20)
ax[0].set_xlabel('calories [cal]', fontsize = 20)
sns.distplot(data_vis["calories [cal]"].to_numpy(), ax=ax[0])
ax[1].set_ylabel('', fontsize = 0)
ax[1].set_xlabel('totalCarbohydrate [g]', fontsize = 20)
sns.distplot(data_vis["totalCarbohydrate [g]"].to_numpy(), ax=ax[1])
ax[2].set_ylabel('', fontsize = 0)
ax[2].set_xlabel('sugars [g]', fontsize = 20)
sns.distplot(data_vis["sugars [g]"].to_numpy(), ax=ax[2])
ax[3].set_ylabel('', fontsize = 0)
ax[3].set_xlabel('totalFat [g]', fontsize = 20)
sns.distplot(data_vis["totalFat [g]"].to_numpy(), ax=ax[3])

In [None]:
# Format data for heatmap
data_vis = data_vis.rename(columns={'servingSize [g]':'serSize', 'calories [cal]':'calories', 'caloriesFromFat [cal]':'calFromFat', 'totalFat [g]':'totalFat', 'cholesterol [mg]':'cholesterol', 'sodium [mg]':'sodium', 'dietaryFiber [g]':'dietaryFiber', 'sugars [g]':'sugars', 'protein [g]':'protein', 'saturatedFat [g]': 'satFat', 'totalCarbohydrate [g]': 'totalCarbs', 'direction_size': '#steps', 'ingredients_sizes': '#ingred', 'rating_y': 'rating'})
data_vis.drop(["new_recipe_id"], axis=1, inplace=True)
data_vis.drop(["servingsPerRecipe"], axis=1, inplace=True)
data_vis.drop(["new_member_id"], axis=1, inplace=True)
data_vis.drop(["new_author_id"], axis=1, inplace=True)

In [None]:
data_vis = data_vis[["duration","serSize","calories","calFromFat","totalFat","satFat","cholesterol","sodium","totalCarbs","dietaryFiber","sugars","protein","#steps","#ingred","rating"]]


In [None]:
# Plot heatmap of nutrients, direction_size, ingredient_size and ratings to show correlations
plt.rcParams.update({'font.size': 12})
plt.figure(figsize=(14, 10))
sns.heatmap(data_vis.corr(), annot=True, fmt='.2f')

-----------------------------------------------------------
# Simple example recommender
-----------------------------------------------------------
## Implicit Lib

In [190]:
pip install implicit

Collecting implicit
  Downloading implicit-0.7.2-cp310-cp310-win_amd64.whl.metadata (6.3 kB)
Downloading implicit-0.7.2-cp310-cp310-win_amd64.whl (748 kB)
   ---------------------------------------- 0.0/748.6 kB ? eta -:--:--
   - -------------------------------------- 20.5/748.6 kB ? eta -:--:--
   --- ----------------------------------- 71.7/748.6 kB 787.7 kB/s eta 0:00:01
   ---------------------------------------  747.5/748.6 kB 5.9 MB/s eta 0:00:01
   ---------------------------------------- 748.6/748.6 kB 5.2 MB/s eta 0:00:00
Installing collected packages: implicit
Successfully installed implicit-0.7.2
Note: you may need to restart the kernel to use updated packages.


In [13]:
from recommendation import implicit_util
from scipy.sparse import csr_matrix
import implicit.evaluation
from tqdm.notebook import tqdm

In [14]:
# Build a sparse matrix (user x recipe x ratings)
user_recipe_matrix = csr_matrix((reviews_df['rating'], (reviews_df['new_member_id'], reviews_df['new_recipe_id'])))

# All users and recipes
users = list(user_dict.keys())
recipes = list(recipes_dict.keys())

# Test/train split #Alternatively use implicit.evaluation.leave_k_out_split to force each user being in both sets
train_matrix, test_matrix = implicit.evaluation.train_test_split(user_recipe_matrix.tocsr().tocoo())

# Get users/recipes in the train set (or test set respectively)
train_user, train_recipe = implicit_util.tuple_to_unique(train_matrix.tocsr().nonzero())
test_user, test_recipe = implicit_util.tuple_to_unique(test_matrix.tocsr().nonzero())

In [15]:
# Calculate the sparsity [in %]
A = user_recipe_matrix.toarray()
sparsity = 1.0 - (np.count_nonzero(A) / float(A.size) )
print(sparsity * 100)

99.89351155631748


In [None]:
# Executes all models, exception on Windows/Python3.10: nmslib_als, faiss_als
from tqdm import tqdm_notebook, tnrange
import nmslib

evaluation, recommendations, similar_items, similar_users = implicit_util.train_and_execute_all(train_matrix, test_matrix, train_user, train_recipe, ['bpr', 'faiss_als'], K=10)

## Irec Lib

In [77]:
%pip install cachetools

Collecting cachetools
  Downloading cachetools-5.3.3-py3-none-any.whl.metadata (5.3 kB)
Downloading cachetools-5.3.3-py3-none-any.whl (9.3 kB)
Installing collected packages: cachetools
Successfully installed cachetools-5.3.3
Note: you may need to restart the kernel to use updated packages.


In [6]:
import irec_util
from irec.environment.loader.full_data import FullData
from irec.recommendation.agents.simple_agent import SimpleAgent
from irec.recommendation.agents.action_selection_policies.egreedy import ASPEGreedy
from irec.recommendation.agents.value_functions.e_greedy import EGreedy
from irec.offline_experiments.evaluation_policies.fixed_interaction import FixedInteraction
from irec.offline_experiments.metric_evaluators.user_cumulative_interaction import UserCumulativeInteraction
from irec.recommendation.agents.action_selection_policies.greedy import ASPGreedy
from irec.recommendation.agents.value_functions.best_rated import BestRated
from irec.recommendation.agents.value_functions.entropy0 import Entropy0
from irec.recommendation.agents.value_functions.ictr import ICTRTS
from irec.recommendation.agents.value_functions.knn_bandit import kNNBandit
from irec.recommendation.agents.value_functions.log_pop_ent import LogPopEnt
from irec.recommendation.agents.value_functions.most_popular import MostPopular
from irec.recommendation.agents.value_functions.pts import PTS
from irec.recommendation.agents.value_functions.random import Random
from irec.recommendation.agents.value_functions.thompson_sampling import ThompsonSampling

In [184]:
# Build interaction matrix
pp_interactions = reviews_df[['new_member_id', 'new_recipe_id', 'rating', 'last_modified_date']]
pp_interactions = pp_interactions.rename(columns={'new_member_id': 'user_id', 'new_recipe_id': 'item_id', 'last_modified_date': 'timestamp'})

# Change timestamps and ratings to int
pp_interactions['timestamp'] = pp_interactions.timestamp.values.astype(np.int64)
pp_interactions['rating'] = pp_interactions['rating'].astype(int)


In [185]:
# Store preprocessed data
output_path = './data/irec/'
utils.ensure_dir(output_path)
pp_interactions.to_csv(output_path + 'foodData.csv', sep=',', index = False)

In [None]:
# Dataset
dataset = {
    'path': "./data/irec/foodData.csv",
    'random_seed': 0,
    'file_delimiter': ",",
    'skip_head': True
}

# Data Splitting
splitting = {
    'strategy': "global", # temporal, random, global, user_history
    'train_size': 0.8,
    'test_consumes': 5
}

# Loader
loader = FullData(dataset, splitting)
train_dataset, test_dataset, _, _ = loader.process()

In [180]:
# Define models
## Evaluation Policy
eval_policy = FixedInteraction(num_interactions=100, interaction_size=1, save_info=True)

## Agents: value function & selection policy
agents = []
agents.append(SimpleAgent(Random(),ASPGreedy(),name="Random"))
agents.append(SimpleAgent(EGreedy(), ASPEGreedy(epsilon=0.1), name="EGreedy"))
agents.append(SimpleAgent(Entropy0(),ASPGreedy(),name="Entropy0"))
agents.append(SimpleAgent(LogPopEnt(),ASPGreedy(),name="LogPopEnt"))
agents.append(SimpleAgent(MostPopular(),ASPGreedy(),name="MostPopular"))
agents.append(SimpleAgent(BestRated(),ASPGreedy(),name="BestRated"))
agents.append(SimpleAgent(ThompsonSampling(alpha_0=1,beta_0=100),ASPGreedy(),name="ThompsonSampling"))
agents.append(SimpleAgent(ICTRTS(num_lat=2,num_particles=5),ASPGreedy(),name="ICTRTS"))

## Agents which take long
agents.append(SimpleAgent(PTS(num_lat=20,num_particles=5,var=0.5,var_u=1.0,var_v=1.0),ASPGreedy(),name="PTS"))
agents.append(SimpleAgent(kNNBandit(alpha_0=1,beta_0=100,k=10),ASPGreedy(),name="kNNBandit"))

# Model runner
runner = irec_util.Runner(train_dataset, test_dataset, eval_policy, repetitions=1)

In [None]:
# Running models
results = runner.train_multiple_agents(agents)


In [None]:
# Evaluation
evaluator = UserCumulativeInteraction(
    ground_truth_dataset=test_dataset,
    num_interactions=20,
    interaction_size=1,
    interactions_to_evaluate=[10],
    relevance_evaluator_threshold=3.99
)

evaluations = irec_util.calc_multiple_scores(evaluator, results)

## HyperParameter Tuning (NEW CODE)

### Irec

---

**Random**

In [None]:
# Define models
## Evaluation Policy
import random
import time
eval_policy = FixedInteraction(num_interactions=100, interaction_size=1, save_info=True)

numEpochs = 20
#Hyperparameter(s) to tune
epsilonGrid = list(np.arange(0.01, 1.0, 0.01))

evaluator = UserCumulativeInteraction(
    ground_truth_dataset=test_dataset,
    num_interactions=20,
    interaction_size=1,
    interactions_to_evaluate=[10],
    relevance_evaluator_threshold=3.99
)

agents = []
for i in range(numEpochs):
## Agents: value function & selection policy
    random.seed(time.perf_counter())
    epsilon = random.randint(0, len(epsilonGrid)-1)
    epsilon = round(epsilonGrid[epsilon], 2)
    agents.append(SimpleAgent(EGreedy(), ASPEGreedy(epsilon=epsilon), name=str(epsilon)))

# Model runner
runner = irec_util.Runner(train_dataset, test_dataset, eval_policy, repetitions=1)
results = runner.train_multiple_agents(agents)

evaluations = irec_util.calc_multiple_scores(evaluator, results)
e = evaluations[0][['precision','recall','p&r']]

display(e.sort_values(by=['p&r']))

**ICTR**

In [None]:
# Define models
## Evaluation Policy
import random
import time
eval_policy = FixedInteraction(num_interactions=100, interaction_size=1, save_info=True)

#section off dataset to 100,000 entries
pp_interactions = pp_interactions[:100000]

# Store preprocessed data
output_path = './data/irec/'
utils.ensure_dir(output_path)
pp_interactions.to_csv(output_path + 'foodData.csv', sep=',', index = False)

# Dataset
dataset = {
    'path': "./data/irec/foodData.csv",
    'random_seed': 0,
    'file_delimiter': ",",
    'skip_head': True
}

# Data Splitting
splitting = {
    'strategy': "global", # temporal, random, global, user_history
    'train_size': 0.8,
    'test_consumes': 5
}

# Loader
loader = FullData(dataset, splitting)
train_dataset, test_dataset, _, _ = loader.process()

numEpochs = 20

#Hyperparameter(s) to tune
latGrid = list(np.arange(1, 10, 1))
particlesGrid = list(np.arange(1, 20, 1))

evaluator = UserCumulativeInteraction(
    ground_truth_dataset=test_dataset,
    num_interactions=20,
    interaction_size=1,
    interactions_to_evaluate=[10],
    relevance_evaluator_threshold=3.99
)

agents = []
for i in range(numEpochs):
## Agents: value function & selection policy
    random.seed(time.perf_counter())
    lat = random.randint(0, len(latGrid)-1)
    particle = random.randint(0, len(particlesGrid)-1)

    lat = round(latGrid[lat], 2)
    particle = round(particlesGrid[particle], 2)

    agents.append(SimpleAgent(ICTRTS(num_lat=lat,num_particles=particle),ASPGreedy(),name="(" + str(lat) + ", " + str(particle) + ")"))

# Model runner
runner = irec_util.Runner(train_dataset, test_dataset, eval_policy, repetitions=1)
results = runner.train_multiple_agents(agents)

evaluations = irec_util.calc_multiple_scores(evaluator, results)
e = evaluations[0][['precision','recall','p&r']]

display(e.sort_values(by=['p&r']))

### Implicit

---

**ALS**

In [9]:
# Build a sparse matrix (user x recipe x ratings)
user_recipe_matrix = csr_matrix((reviews_df['rating'], (reviews_df['new_member_id'], reviews_df['new_recipe_id'])))

# All users and recipes
users = list(user_dict.keys())
recipes = list(recipes_dict.keys())

# Test/train split #Alternatively use implicit.evaluation.leave_k_out_split to force each user being in both sets
train_matrix, test_matrix = implicit.evaluation.train_test_split(user_recipe_matrix.tocsr().tocoo())

# Get users/recipes in the train set (or test set respectively)
train_user, train_recipe = implicit_util.tuple_to_unique(train_matrix.tocsr().nonzero())
test_user, test_recipe = implicit_util.tuple_to_unique(test_matrix.tocsr().nonzero())

In [None]:
# Executes all models, exception on Windows/Python3.10: nmslib_als, faiss_als
from tqdm import tqdm_notebook, tnrange
import nmslib
import random
import time

evaluations = pd.DataFrame(columns=['parameters', 'precision', 'map', 'ndcg', 'total'])

numEpochs = 20
factorGrid = list(np.arange(1, 20, 1))
regularizationGrid = list(np.arange(0.01, 1.0, 0.01))

for i in range(numEpochs):
    random.seed(time.perf_counter())
    factors = random.randint(0, len(factorGrid)-1)
    regularization = random.randint(0, len(regularizationGrid)-1)

    factors = round(factorGrid[factors], 2)
    regularization = round(regularizationGrid[regularization], 2)

    evaluation, recommendations, similar_items, similar_users = implicit_util.train_and_execute_all(train_matrix, test_matrix, train_user, train_recipe, ['bpr', 'nmslib_als', 'lmf', 'bm25', 'cosine', 'tfidf', 'ii', 'annoy_als'], factors=factors, regularization = regularization, K=10)
    parameters = '(' + str(factors) + ', ' + str(regularization) + ')'
    precision = evaluation['p@10'].values[0]
    map = evaluation['map@10'].values[0]
    ndcg = evaluation['ndcg@10'].values[0]
    total = precision + map + ndcg
    evaluations.loc[evaluations.shape[0]] = [parameters, precision, map, ndcg, total]

display(evaluations.sort_values(by=['total'],ascending=False))

**LMF**

In [None]:
# Executes all models, exception on Windows/Python3.10: nmslib_als, faiss_als
from tqdm import tqdm_notebook, tnrange
import nmslib
import random
import time

evaluations = pd.DataFrame(columns=['parameters', 'precision', 'map', 'ndcg', 'total'])

numEpochs = 20
factorGrid = list(np.arange(20, 40, 1))
regularizationGrid = list(np.arange(0.5, 2.5, 0.1))

for i in range(numEpochs):
    random.seed(time.perf_counter())
    factors = random.randint(0, len(factorGrid)-1)
    regularization = random.randint(0, len(regularizationGrid)-1)

    factors = round(factorGrid[factors], 2)
    regularization = round(regularizationGrid[regularization], 2)

    evaluation, recommendations, similar_items, similar_users = implicit_util.train_and_execute_all(train_matrix, test_matrix, train_user, train_recipe, ['bpr', 'nmslib_als', 'als', 'bm25', 'cosine', 'tfidf', 'ii', 'annoy_als'], factors=factors, regularization = regularization, K=10)
    parameters = '(' + str(factors) + ', ' + str(regularization) + ')'
    precision = evaluation['p@10'].values[0]
    map = evaluation['map@10'].values[0]
    ndcg = evaluation['ndcg@10'].values[0]
    total = precision + map + ndcg
    evaluations.loc[evaluations.shape[0]] = [parameters, precision, map, ndcg, total]

display(evaluations.sort_values(by=['total'],ascending=False))

**bm25**

In [None]:
# Executes all models, exception on Windows/Python3.10: nmslib_als, faiss_als
from tqdm import tqdm_notebook, tnrange
import nmslib
import random
import time

evaluations = pd.DataFrame(columns=['parameters', 'precision', 'map', 'ndcg', 'total'])

numEpochs = 10
kGrid = list(np.arange(50, 150, 1))
bGrid = list(np.arange(0.01, 1, 0.01))

for i in range(numEpochs):
    random.seed(time.perf_counter())
    K1 = random.randint(0, len(kGrid)-1)
    B = random.randint(0, len(bGrid)-1)

    K1 = round(kGrid[K1], 2)
    B = round(bGrid[B], 2)

    evaluation, recommendations, similar_items, similar_users = implicit_util.train_and_execute_all(train_matrix, test_matrix, train_user, train_recipe, ['bpr', 'nmslib_als', 'als', 'lmf', 'cosine', 'tfidf', 'ii', 'annoy_als'], K1=K1, B=B, K=10)
    parameters = '(' + str(K1) + ', ' + str(B) + ')'
    precision = evaluation['p@10'].values[0]
    map = evaluation['map@10'].values[0]
    ndcg = evaluation['ndcg@10'].values[0]
    total = precision + map + ndcg
    evaluations.loc[evaluations.shape[0]] = [parameters, precision, map, ndcg, total]

display(evaluations.sort_values(by=['total'],ascending=False))

---

## MS Lib

In [None]:
pip install recommenders

In [None]:
import cornac_util
import torch
import cornac
from recommenders.utils.constants import SEED

In [None]:
# Build feature matrix
pp_interactions = reviews_df[['new_member_id', 'new_recipe_id', 'rating']]
pp_interactions = pp_interactions.rename(columns={'new_member_id': 'userID', 'new_recipe_id': 'itemID'})

In [None]:
# Store preprocessed data
output_path = './data/cornac/'
utils.ensure_dir(output_path)
pp_interactions.to_csv(output_path + 'foodData.csv', sep=',', index=False)

In [None]:
# Read & split data
pp_interactions, train, test, train_set = cornac_util.load_and_split()
print(pp_interactions.shape)
pp_interactions.head()

In [None]:
# Set global model parameters
## top k items to recommend
TOP_K = [10]

In [None]:
# BPR
bpr = cornac.models.BPR(
    k=10,  #200
    max_iter= 10,  #100,
    learning_rate=0.01,
    lambda_reg=0.001,
    verbose=True,
    seed=SEED
)

In [None]:
# BiVAE
bivae = cornac.models.BiVAECF(
    k=50,
    encoder_structure=[100],
    act_fn="tanh",
    likelihood="pois",
    n_epochs=10, #500
    batch_size=128,
    learning_rate=0.001,
    seed=SEED,
    use_gpu=torch.cuda.is_available(),
    verbose=True
)

In [None]:
# Train
models = [bivae, bpr]
cornac_util.train_multiple(models, train_set)

In [None]:
# Prediction
model_predictions = cornac_util.predict_multiple(models, train)

In [None]:
#Evaluation
evaluation = cornac_util.calc_scores(test, model_predictions, TOP_K)

In [None]:
# Display results
display(evaluation[0])

In [None]:
utils.ensure_dir('./data/irec/')

In [None]:
display(evaluation)

## Additional Models (Surprise, NEW CODE)

In [16]:
%pip install surprise

Note: you may need to restart the kernel to use updated packages.


In [10]:
# Build feature matrix
pp_interactions = reviews_df[['new_member_id', 'new_recipe_id', 'rating']]
pp_interactions = pp_interactions.rename(columns={'new_member_id': 'userID', 'new_recipe_id': 'itemID'})
pp_interactions.shape

(601887, 3)

In [11]:
#Source: https://surprise.readthedocs.io/en/stable/FAQ.html#how-to-compute-precision-k-and-recall-k
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return precision and recall at k metrics for each user"""

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(
            ((true_r >= threshold) and (est >= threshold))
            for (est, true_r) in user_ratings[:k]
        )

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls


In [12]:
from sklearn.metrics import ndcg_score
def ndcg(surprise_predictions, M=18316, N=30859, k=10):
    """
    Calculates the ndcg (normalized discounted cumulative gain) from surprise predictions, you could use sklearn.metrics.ndcg_score and scipy.sparse

    Parameters:
    surprise_predictions (List of surprise.prediction_algorithms.predictions.Prediction): list of predictions,
    see https://surprise.readthedocs.io/en/stable/predictions_module.html?highlight=prediction#surprise.prediction_algorithms.predictions.Prediction
    M: number of users
    N: number of movies
    k (positive integer): Only consider the highest k scores (items) in each user's recommendation list.

    Returns:
    float in [0., 1.]: The averaged NDCG scores over all users' recommendation lists.

    """
    # Task 3 to do start: return the ndcg score, with the help of sklearn ndcg_score package
    # hint: build 2 M x N matrices, each row representing the predicted/true ratings for one user -- and input that 2 matrices into ndcg_score directly.

    predicted_ratings = np.zeros((M, N))
    true_ratings = np.zeros((M,N))

    #Populate Matrices
    for prediction in surprise_predictions:
      user_id = int(prediction.uid)
      movie_id = int(prediction.iid)
      pred_rating = prediction.est
      true_rating = prediction.r_ui

      predicted_ratings[user_id-1][movie_id-1] = pred_rating
      true_ratings[user_id-1][movie_id-1] = true_rating    

    #array to track ndcg scores 
    ndcg_scores = []
    for user_id in range(M):
      #calculate ndcg score for given user (must convert ratings to numpy array to be fed into ndcg_score)
      pred = np.asarray([predicted_ratings[user_id]])
      truth = np.asarray([true_ratings[user_id]])
      ndcg_scores.append(ndcg_score(truth, pred, k=k))
      
    # Calculate average NDCG score across all users
    avg_ndcg_score = sum(ndcg_scores) / M

    return avg_ndcg_score

    # Task 3 to do end

In [None]:
from surprise import Dataset, Reader, BaselineOnly, KNNBasic, NMF, accuracy, SVD
from surprise.model_selection import train_test_split
from collections import defaultdict

metric_report = pd.DataFrame(index=['Bias', 'UserUser','ItemItem','NMF', 'SVD'], columns=['precision@10','recall@10','ndcg@10'])

# use the built-in funk svd at one parameter
# Task 6 to do start: fill in metric_report

algorithms = {
    'Bias': BaselineOnly(),
    'UserUser': KNNBasic(sim_options={'user_based': True}),
    'ItemItem': KNNBasic(sim_options={'user_based': False}),
    'NMF': NMF(),
    'SVD': SVD(n_factors=10, n_epochs=20),
}

reader = Reader(rating_scale=(1, 6))
for algorithm_name, algorithm in algorithms.items():

    data = Dataset.load_from_df(pp_interactions, reader)

    train, test = train_test_split(data,test_size=0.2)

    algorithm.fit(train)
    predictions = algorithm.test(test)
    precisions, recalls = precision_recall_at_k(predictions, k=10)
    ndcg_value = ndcg(predictions)

    # Calculate Metrics and Update metric_report
    metric_report.loc[algorithm_name, 'precision@10'] = sum(prec for prec in precisions.values()) / len(precisions)
    metric_report.loc[algorithm_name, 'recall@10'] = sum(rec for rec in recalls.values()) / len(recalls)
    metric_report.loc[algorithm_name, 'ndcg@10'] = ndcg_value

display(metric_report)
# Task 6 to do end