In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
# https://surprise.readthedocs.io/en/stable/getting_started.html

In [2]:
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split, cross_validate

In [3]:
reviews = pd.read_csv('../../Fed_up/data/preprocessed/review_pp.csv')
recipes = pd.read_csv('../../Fed_up/data/preprocessed/recipe_pp.csv')

In [4]:
reviews.head()

Unnamed: 0,recipe_id,user_id,rating,liked,review,date
0,197160,136726,5,1,I used this mix to make meat balls.Very simple...,2006-11-25
1,79222,56680,5,1,"Oh, This was wonderful! Had a soup and salad ...",2006-11-11
2,20930,6258,5,1,"Jan, what an interesting combination of flavor...",2002-07-09
3,20930,102602,5,1,"Jan, we love your pasta salad, it's delicious....",2003-10-26
4,79222,183565,5,1,Wow! My family loves this recipe and it is a ...,2006-02-13


# Rating based

In [5]:
reader = Reader(rating_scale=(1, 5))

In [6]:
rel_data = reviews[['user_id', 'recipe_id', 'rating']]
model_data = Dataset.load_from_df(rel_data, reader)

In [7]:
train, test = train_test_split(model_data, test_size=.3)
algorithm = SVD()
algorithm.fit(train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x11894c310>

In [8]:
predictions = algorithm.test(test)
accuracy.rmse(predictions)

RMSE: 0.6030


0.602975833949082

In [9]:
# cv_results = cross_validate(algorithm, model_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
# cv_results['test_rmse'].mean(), cv_results['test_mae'].mean()

In [11]:
recipe_list = rel_data[rel_data['user_id'] != 235291].recipe_id.unique()
predictions = [algorithm.predict(235291, rec) for rec in recipe_list]
pdf = pd.DataFrame(predictions, columns = ['user_id', 'recipe_id', 'rating', 'rec_rating', 'details']) \
        .drop(columns=['rating', 'details'])
pdf.sort_values('rec_rating', ascending=False, inplace=True)
pdf.reset_index(inplace=True)
pdf['rec_score'] = (pdf['rec_rating'] - 1) / (5 - 1)
pdf.shape

(88881, 5)

In [12]:
rel_data[rel_data['user_id'] == 235291].merge(recipes, on="recipe_id", how="left")[['recipe_id', 'name', 'rating']]

Unnamed: 0,recipe_id,name,rating
0,16741,aunt bee s lentil soup,5
1,10744,delicious chicken pot pie,5
2,16575,just the best barbecue sauce ever,3
3,22782,jo mama s world famous spaghetti,5
4,107997,4 minute spicy garlic shrimp,5
5,212500,mushroom noodle soup,5
6,208930,mary s chiles rellenos,5
7,79563,uncle bill s green split pea with hambone soup,5


In [13]:
final_table = pdf.merge(recipes, on="recipe_id", how="left")[['recipe_id', 'name', 'rec_rating', 'rec_score']]
final_table.head(10)

Unnamed: 0,recipe_id,name,rec_rating,rec_score
0,458273,grilled skewered beef,5.0,1.0
1,78072,rutmus,5.0,1.0
2,397795,nif s 1 ww pt spaghetti sauce,5.0,1.0
3,54517,cucumber sandwich filling,5.0,1.0
4,135149,idaho fries,5.0,1.0
5,365501,connoisseurs casserole,5.0,1.0
6,249974,penne with spicy vodka tomato cream sauce,5.0,1.0
7,217554,potato wedges with chili cream,5.0,1.0
8,320460,chicken breasts with fresh tarragon dijon mus...,5.0,1.0
9,455166,sweet soy glazed chicken wings,5.0,1.0


# Liked based

In [14]:
reader = Reader(rating_scale=(0, 1))

In [15]:
rel_data = reviews[['user_id', 'recipe_id', 'liked']]
model_data = Dataset.load_from_df(rel_data, reader)

In [16]:
train, test = train_test_split(model_data, test_size=.3)
algorithm = SVD()
algorithm.fit(train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x12e1cc310>

In [17]:
predictions = algorithm.test(test)
accuracy.rmse(predictions)

RMSE: 0.2228


0.22275273494413306

In [18]:
# cv_results = cross_validate(algorithm, model_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
# cv_results['test_rmse'].mean(), cv_results['test_mae'].mean()

In [19]:
recipe_list = rel_data[rel_data['user_id'] != 235291].recipe_id.unique()
predictions = [algorithm.predict(235291, rec) for rec in recipe_list]
pdf = pd.DataFrame(predictions, columns = ['user_id', 'recipe_id', 'liked', 'rec_liked', 'details']) \
        .drop(columns=['liked', 'details'])
pdf.sort_values('rec_liked', ascending=False, inplace=True)
pdf.reset_index(inplace=True)
pdf['rec_score'] = pdf['rec_liked'] / 1
pdf.shape

(88881, 5)

In [20]:
final_table = pdf.merge(recipes, on="recipe_id", how="left")[['recipe_id', 'name', 'rec_liked', 'rec_score']]
final_table.head(10)

Unnamed: 0,recipe_id,name,rec_liked,rec_score
0,389212,hazelnut scones w optional lemon honey or liq...,1.0,1.0
1,141668,quick wild rice casserole,1.0,1.0
2,38128,basil walnut cheese spread,1.0,1.0
3,149381,chargrilled oysters,1.0,1.0
4,115446,fire and rice,1.0,1.0
5,101747,banana raspberry bread,1.0,1.0
6,145011,fig bread,1.0,1.0
7,11725,corn puffs,1.0,1.0
8,396191,self crusting mushroom and spinach quiche,1.0,1.0
9,130741,grilled pork tenderloin with gingered bourbon ...,1.0,1.0


# Testing algorithms

In [1]:
import pandas as pd
import time
from Fed_up.surprise import Surprise

In [8]:
algos = ['NormalPredictor', 'BaselineOnly', 'KNNBasic', 'KNNWithMeans', 
         'KNNWithZScore', 'KNNBaseline', 'SVD', 'SVDpp', 'NMF', 'SlopeOne', 'CoClustering']

In [9]:
analysis = pd.DataFrame(columns=['algo', 'time', 'rmse', 'mae'])

for algo in algos:
    start = time.time()
    model = Surprise()
    rmse, mae = model.train(algo=algo, test='cv', local=True)
    end = time.time()
    new_row = {'algo': algo, 'time': end - start, 'rmse': rmse, 'mae': mae}
    analysis = analysis.append(new_row, ignore_index=True)
    
analysis

Evaluating RMSE, MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8120  0.8055  0.8113  0.8078  0.8053  0.8084  0.0028  
MAE (testset)     0.5597  0.5584  0.5610  0.5596  0.5576  0.5593  0.0012  
Fit time          0.48    0.81    0.61    0.63    0.64    0.63    0.11    
Test time         0.78    0.77    0.76    0.80    0.77    0.78    0.01    
0.808 0.559
Evaluating RMSE, MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8096  0.8062  0.8053  0.8104  0.8119  0.8087  0.0025  
MAE (testset)     0.5595  0.5559  0.5582  0.5602  0.5626  0.5593  0.0022  
Fit time          0.48    0.65    0.65    0.65    0.65    0.62    0.07    
Test time         0.79    0.78    0.53    0.55    0.76    0.68    0.12    
0.809 0.559
Evaluating RMSE, MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fo

Unnamed: 0,algo,time,rmse,mae
0,NormalPredictor,16.395426,0.808,0.559
1,BaselineOnly,16.226702,0.809,0.559
2,KNNBasic,16.442944,0.81,0.562
3,KNNWithMeans,15.869862,0.809,0.56
4,KNNWithZScore,16.649998,0.808,0.559
5,KNNBaseline,17.199382,0.808,0.56
6,SVD,16.180317,0.809,0.56
7,SVDpp,16.616012,0.807,0.558
8,NMF,16.305501,0.809,0.56
9,SlopeOne,16.405803,0.808,0.56


In [7]:
analysis_l = pd.DataFrame(columns=['algo', 'time', 'rmse', 'mae'])

for algo in algos:
    start = time.time()
    model = Surprise()
    rmse, mae = model.train(algo=algo, like=True, test='cv', local=True)
    end = time.time()
    new_row = {'algo': algo, 'time': end - start, 'rmse': rmse, 'mae': mae}
    analysis_l = analysis_l.append(new_row, ignore_index=True)
    
analysis_l

Evaluating RMSE, MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.2732  0.2731  0.2742  0.2735  0.2738  0.2736  0.0004  
MAE (testset)     0.1562  0.1558  0.1569  0.1558  0.1564  0.1562  0.0004  
Fit time          0.47    0.66    0.65    0.66    0.65    0.62    0.08    
Test time         0.69    0.53    0.69    0.54    0.69    0.63    0.08    
Evaluating RMSE, MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.2744  0.2738  0.2743  0.2753  0.2749  0.2745  0.0005  
MAE (testset)     0.1574  0.1577  0.1574  0.1575  0.1570  0.1574  0.0002  
Fit time          0.46    0.63    0.66    0.65    0.65    0.61    0.08    
Test time         0.69    0.70    0.68    0.84    0.69    0.72    0.06    
Evaluating RMSE, MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fo

Unnamed: 0,algo,time,rmse,mae
0,NormalPredictor,15.731543,0.274,0.156
1,BaselineOnly,15.824387,0.275,0.157
2,KNNBasic,15.6874,0.274,0.156
3,KNNWithMeans,16.348062,0.274,0.157
4,KNNWithZScore,16.032035,0.274,0.157
5,KNNBaseline,16.061927,0.274,0.157
6,SVD,16.016701,0.274,0.157
7,SVDpp,16.012346,0.274,0.157
8,NMF,16.268278,0.274,0.156
9,SlopeOne,16.042635,0.275,0.157


# Hypertuning SVD

In [5]:
best_model = Surprise()

In [6]:
rmse, mae = best_model.train(algo='SVD', like=True, test='svd_grid', local=True)

{'n_epochs': 20, 'n_factors': 100, 'lr_all': 0.002, 'reg_all': 0.02} {'n_epochs': 20, 'n_factors': 100, 'lr_all': 0.002, 'reg_all': 0.02}


In [7]:
rmse, mae

(0.22394805228515025, 0.10278555909110025)

# Predicting

In [2]:
model = Surprise()
rmse, mae = model.train(like=True, test='cv', local=True)

Evaluating RMSE, MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.2741  0.2744  0.2747  0.2754  0.2737  0.2745  0.0006  
MAE (testset)     0.1571  0.1565  0.1574  0.1574  0.1569  0.1571  0.0003  
Fit time          0.73    0.90    0.86    0.97    0.67    0.83    0.11    
Test time         1.15    1.20    1.08    1.86    0.80    1.22    0.35    


In [3]:
rmse, mae

(0.274, 0.157)

In [4]:
output = model.predict(235291)

Unnamed: 0,recipe_id,name,liked
0,16741,aunt bee s lentil soup,1
1,10744,delicious chicken pot pie,1
2,16575,just the best barbecue sauce ever,0
3,22782,jo mama s world famous spaghetti,1
4,107997,4 minute spicy garlic shrimp,1
5,212500,mushroom noodle soup,1
6,208930,mary s chiles rellenos,1
7,79563,uncle bill s green split pea with hambone soup,1


Unnamed: 0,recipe_id,name,rec_liked,rec_score
0,141820,yummy yummy macaroni and cheese,1.0,1.0
1,203538,hsun tan or smoked eggs,1.0,1.0
2,356146,asiago dip,1.0,1.0
3,33400,hungarian cucumber salad with sour cream dressing,1.0,1.0
4,79489,philly style andouille po boys,1.0,1.0
5,137260,chicken green beans alfredo,1.0,1.0
6,80672,crispy potato galette,1.0,1.0
7,178472,tofu steak,1.0,1.0
8,32834,mini meatloafs,1.0,1.0
9,488968,syrian manoushi bread,1.0,1.0


In [5]:
model2 = Surprise()
rmse, mae = model2.train(like=False, test='cv', local=True)

Evaluating RMSE, MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8098  0.8077  0.8075  0.8062  0.8078  0.8078  0.0012  
MAE (testset)     0.5587  0.5587  0.5587  0.5578  0.5596  0.5587  0.0006  
Fit time          0.50    0.65    0.77    0.69    0.67    0.66    0.09    
Test time         0.83    0.78    0.56    0.82    0.87    0.77    0.11    


In [6]:
rmse, mae

(0.808, 0.559)

In [7]:
output = model2.predict(235291)

Unnamed: 0,recipe_id,name,rating
0,16741,aunt bee s lentil soup,5
1,10744,delicious chicken pot pie,5
2,16575,just the best barbecue sauce ever,3
3,22782,jo mama s world famous spaghetti,5
4,107997,4 minute spicy garlic shrimp,5
5,212500,mushroom noodle soup,5
6,208930,mary s chiles rellenos,5
7,79563,uncle bill s green split pea with hambone soup,5


Unnamed: 0,recipe_id,name,rec_rating,rec_score
0,258045,creamy chicken and ravioli with bacon,5.0,1.0
1,315752,pork and bean salad,5.0,1.0
2,336249,sweet potato casserole for thanksgiving,5.0,1.0
3,147829,cherry tomato meatballs,5.0,1.0
4,21039,soft chicken,5.0,1.0
5,54121,kasha with vegetables,5.0,1.0
6,170301,wheat sweet salad,5.0,1.0
7,97931,moms simple swiss steak,5.0,1.0
8,417625,frans s prizewinning bobotie,5.0,1.0
9,89969,cuban style picadillo,5.0,1.0
