In [1]:
import pandas as pd

from collections import defaultdict

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from scipy.sparse.linalg import svds

import matplotlib.pyplot as plt

import ast


In [2]:

df_made_train = pd.read_csv("datasets/recipes_made_train.csv")
df_made_test = pd.read_csv("datasets/recipes_made_test.csv")
df_made_val = pd.read_csv("datasets/recipes_made_val.csv")
df_raw_interactions = pd.read_csv("datasets/RAW_interactions.csv")
df_raw_recipes = pd.read_csv("datasets/RAW_recipes.csv")
df_pp_users = pd.read_csv("datasets/PP_users.csv")
df_pp_recipes = pd.read_csv("datasets/PP_recipes.csv")
df_train = pd.read_csv("datasets/interactions_train.csv")
df_test = pd.read_csv("datasets/interactions_test.csv")
df_val = pd.read_csv("datasets/interactions_validation.csv")
df_ingr_map = pd.read_csv("datasets/ingr_map.csv")

In [3]:
# Cleaning raw recipe to remove recipes that take longer than 1 day, have more than 25 steps, and more than 40 ingredients
df_rr_clean = df_raw_recipes[df_raw_recipes['minutes'] <= 1440]
df_rr_clean = df_rr_clean[df_rr_clean['n_ingredients'] <= 25]
df_rr_clean = df_rr_clean[df_rr_clean['n_steps'] <= 40]

# Dropping date, u, and i columns
df_train_clean = df_train.drop(["date", "u", "i"], axis=1)
df_test_clean = df_test.drop(["date", "u", "i"], axis=1)
df_val_clean = df_val.drop(["date", "u", "i"], axis=1)

# Removing any recipes that were removed from cleaning the raw recipes
df_train_clean = df_train_clean[df_train_clean['recipe_id'].isin(df_rr_clean['id'])]
df_test_clean = df_test_clean[df_test_clean['recipe_id'].isin(df_rr_clean['id'])]
df_val_clean = df_val_clean[df_val_clean['recipe_id'].isin(df_rr_clean['id'])]


In [4]:

df_train_sub = df_train_clean.iloc[:5000, :]

In [5]:


# Create utility matrix
utility_matrix = df_train_sub.pivot(index='user_id', columns='recipe_id', values='rating').fillna(0)

# Perform Singular Value Decomposition
U, sigma, Vt = svds(utility_matrix.to_numpy(), k=50)  # k is the number of latent features
sigma = np.diag(sigma)

# Compute user and recipe latent feature matrices
user_features = np.dot(U, sigma)  # User latent feature matrix
recipe_features_cf = np.dot(sigma, Vt).T  # Recipe latent feature matrix (from CF)


In [6]:
# Normalize content-based features (time as an example numerical feature)
scaler = StandardScaler()
df_raw_recipes['min_normalized'] = scaler.fit_transform(df_raw_recipes[['minutes']])
df_raw_recipes['n_steps_normalized'] = scaler.fit_transform(df_raw_recipes[['n_steps']])
df_raw_recipes['n_ingr_normalized'] = scaler.fit_transform(df_raw_recipes[['n_ingredients']])

# Convert nutrition strings into a list
df_raw_recipes['nutrition'] = df_raw_recipes['nutrition'].apply(ast.literal_eval)
    

# Merge recipe IDs for consistent indexing
recipe_features_cb = df_raw_recipes[['id', 'minutes', 'nutrition', 'n_steps', 'n_ingredients']]



In [7]:
nutr_dict = defaultdict(list)
for nutr_list in recipe_features_cb['nutrition']:
    nutr_dict['calories'].append(nutr_list[0])
    nutr_dict['total_fat'].append(nutr_list[1])
    nutr_dict['sugar'].append(nutr_list[2])
    nutr_dict['sodium'].append(nutr_list[3])
    nutr_dict['protein'].append(nutr_list[4])
    nutr_dict['saturated_fat'].append(nutr_list[5])
    nutr_dict['carbohydrates'].append(nutr_list[6])
nutr_df = pd.DataFrame(nutr_dict)
recipe_features_cb = pd.concat([recipe_features_cb, nutr_df], axis=1)
recipe_features_cb.drop(['nutrition'], axis=1, inplace=True)


In [8]:
recipe_features_cb.head()

Unnamed: 0,id,minutes,n_steps,n_ingredients,calories,total_fat,sugar,sodium,protein,saturated_fat,carbohydrates
0,137739,55,11,7,51.5,0.0,13.0,0.0,2.0,0.0,4.0
1,31490,30,9,6,173.4,18.0,0.0,17.0,22.0,35.0,1.0
2,112140,130,6,13,269.8,22.0,32.0,48.0,39.0,27.0,5.0
3,59389,45,11,11,368.1,17.0,10.0,2.0,14.0,8.0,20.0
4,44061,190,5,8,352.9,1.0,337.0,23.0,3.0,0.0,28.0


In [None]:
# Align CF features with the recipe IDs
cf_features_df = pd.DataFrame(recipe_features_cf, index=utility_matrix.columns, columns=[f'CF_Feature_{i}' for i in range(recipe_features_cf.shape[1])])
cf_features_df.reset_index(inplace=True)
cf_features_df.rename(columns={'index': 'recipe_id'}, inplace=True)

recipe_features_cb.rename({'id': 'recipe_id'}, axis='columns', inplace=True)

# Merge CF and CBF features
hybrid_features = pd.merge(cf_features_df, recipe_features_cb, on='recipe_id', how='inner')
hybrid_features.set_index('recipe_id', inplace=True)

# Create a conversion map from recipe_id to id
hyb_feat_conv = hybrid_features.reset_index()


           CF_Feature_0  CF_Feature_1  CF_Feature_2  CF_Feature_3  \
recipe_id                                                           
46         1.779792e-30  6.131336e-16 -8.441475e-16  1.249282e-15   
62         2.171877e-03 -5.185313e-02 -1.905056e-02 -5.470460e-01   
100        7.756011e-02  1.602059e-02 -2.977601e-02  1.072312e-02   
112        2.487751e-01  8.035991e-01  1.052973e-02  3.473677e-01   
113        1.541912e-03 -3.692103e-02 -1.357407e-02 -3.900430e-01   

           CF_Feature_4  CF_Feature_5  CF_Feature_6  CF_Feature_7  \
recipe_id                                                           
46        -2.311741e-15 -4.124056e-17  5.798848e-17  9.367710e-16   
62        -3.869874e-03  2.292644e-01 -2.823017e-01 -4.065041e-01   
100       -1.902614e-02 -3.821645e-02 -2.961517e-02 -4.309252e-02   
112       -5.228309e-01  1.036287e+00  9.880758e-01 -7.387129e-01   
113       -2.764679e-03  1.638532e-01 -2.019751e-01 -2.912345e-01   

           CF_Feature_8  CF_Feat

In [None]:

similarity_matrix = cosine_similarity(hybrid_features)

def CF_CBF_Hybrid(user_id, num_recommendations=5):
    # Get the user's rated recipes and ratings
    if user_id not in utility_matrix.index:
        return []

    user_rated_recipes = utility_matrix.loc[user_id]
    rated_indices = []
    for recipe_id, rating in user_rated_recipes.items():
        if rating >= 0: rated_indices.append(recipe_id)
    
    if len(rated_indices) == 0:
        print("The user has not rated any recipes!")
        return []
    # Extract the sub-matrix for rated recipes
    rated_similarity_matrix = similarity_matrix[:, hyb_feat_conv.loc[hyb_feat_conv['recipe_id'] == rated_indices].index[0]]
    # Get the user's ratings for the rated recipes
    user_ratings = [user_rated_recipes[int(i)] for i in rated_indices]
    
    # Compute the weighted sum of similarities for all recipes
    recipe_scores = np.dot(rated_similarity_matrix, user_ratings)

    # Rank recipes by score and exclude already-rated recipes
    recommended_recipes = pd.DataFrame({
        'recipeID': hybrid_features.index,
        'score': recipe_scores
    }).sort_values(by='score', ascending=False)
    
    recommended_recipes = recommended_recipes[~recommended_recipes['recipeID'].isin(user_rated_recipes.index)]
    
    return recommended_recipes.head(num_recommendations)



In [None]:
correct = 0
for _, data in df_made_test.iterrows():
    pred = 0
    u = data['user_id']
    r = data['recipe_id']
    if r in CF_CBF_Hybrid(u):
        pred = 1
    made = int(data['made'])
    if(pred == made): correct+=1

In [71]:
correct/len(df_made_test)

0.5