In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import itertools
import ast
import pickle
from surprise import Dataset, Reader, KNNWithMeans, KNNWithZScore, SVD, SVDpp, SlopeOne, NMF, CoClustering, KNNBaseline
from surprise.model_selection import GridSearchCV, cross_validate

In [30]:
recipes = pd.read_csv("Data/Recipes.csv")
user_interactions = pd.read_csv("Data/Interactions.csv")
recipe_ratings = pd.read_csv("Data/Recipe_Bayesian_Ratings.csv")

In [31]:
recipes.head()

Unnamed: 0,name,id,minutes,tags,n_steps,steps,description,ingredients,n_ingredients,calories,total fat (PDV),sugar (PDV),sodium (PDV),protein (PDV),saturated fat (PDV),carbohydrates (PDV)
0,arriba baked winter squash mexican style,1,55,"['60-minutes-or-less', 'time-to-make', 'course...",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7,51.5,0.0,13.0,0.0,2.0,0.0,4.0
1,a bit different breakfast pizza,2,30,"['30-minutes-or-less', 'time-to-make', 'course...",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6,173.4,18.0,0.0,17.0,22.0,35.0,1.0
2,all in the kitchen chili,3,130,"['time-to-make', 'course', 'preparation', 'mai...",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13,269.8,22.0,32.0,48.0,39.0,27.0,5.0
3,alouette potatoes,4,45,"['60-minutes-or-less', 'time-to-make', 'course...",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11,368.1,17.0,10.0,2.0,14.0,8.0,20.0
4,amish tomato ketchup for canning,5,190,"['weeknight', 'time-to-make', 'course', 'main-...",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8,352.9,1.0,337.0,23.0,3.0,0.0,28.0


In [32]:
recipes.describe()

Unnamed: 0,id,minutes,n_steps,n_ingredients,calories,total fat (PDV),sugar (PDV),sodium (PDV),protein (PDV),saturated fat (PDV),carbohydrates (PDV)
count,231637.0,231637.0,231637.0,231637.0,231637.0,231637.0,231637.0,231637.0,231637.0,231637.0,231637.0
mean,115819.0,9398.546,9.765499,9.051153,473.942425,36.0807,84.296865,30.147485,34.68186,45.58915,15.560403
std,66867.98649,4461963.0,5.995128,3.734796,1189.711374,77.79884,800.080897,131.961589,58.47248,98.235758,81.82456
min,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,57910.0,20.0,6.0,6.0,174.4,8.0,9.0,5.0,7.0,7.0,4.0
50%,115819.0,40.0,9.0,9.0,313.4,20.0,25.0,14.0,18.0,23.0,9.0
75%,173728.0,65.0,12.0,11.0,519.7,41.0,68.0,33.0,51.0,52.0,16.0
max,231637.0,2147484000.0,145.0,43.0,434360.2,17183.0,362729.0,29338.0,6552.0,10395.0,36098.0


In [33]:
unique_tags = sorted(list(set([tag for tag_list in recipes['tags'] for tag in ast.literal_eval(tag_list)])))

In [34]:
print(len(unique_tags))
print(unique_tags)

552
['', '1-day-or-more', '15-minutes-or-less', '3-steps-or-less', '30-minutes-or-less', '4-hours-or-less', '5-ingredients-or-less', '60-minutes-or-less', 'Throw the ultimate fiesta with this sopaipillas recipe from Food.com.', 'a1-sauce', 'african', 'american', 'amish-mennonite', 'angolan', 'appetizers', 'apples', 'april-fools-day', 'argentine', 'artichoke', 'asian', 'asparagus', 'australian', 'austrian', 'avocado', 'bacon', 'baja', 'baked-beans', 'baking', 'bananas', 'bar-cookies', 'barbecue', 'bass', 'bean-soup', 'beans', 'beans-side-dishes', 'bear', 'beef', 'beef-barley-soup', 'beef-crock-pot', 'beef-kidney', 'beef-liver', 'beef-organ-meats', 'beef-ribs', 'beef-sauces', 'beef-sausage', 'beginner-cook', 'beijing', 'belgian', 'berries', 'beverages', 'birthday', 'biscotti', 'bisques-cream-soups', 'black-bean-soup', 'black-beans', 'blueberries', 'bok-choys', 'brazilian', 'bread-machine', 'bread-pudding', 'breads', 'breakfast', 'breakfast-casseroles', 'breakfast-eggs', 'breakfast-potato

In [36]:
recipes = pd.merge(recipes, recipe_ratings, how='left', left_on='id', right_on='id', suffixes=(False, False))

In [37]:
recipes.head()

Unnamed: 0,name,id,minutes,tags,n_steps,steps,description,ingredients,n_ingredients,calories,total fat (PDV),sugar (PDV),sodium (PDV),protein (PDV),saturated fat (PDV),carbohydrates (PDV),bayesian_avg
0,arriba baked winter squash mexican style,1,55,"['60-minutes-or-less', 'time-to-make', 'course...",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7,51.5,0.0,13.0,0.0,2.0,0.0,4.0,4.637702
1,a bit different breakfast pizza,2,30,"['30-minutes-or-less', 'time-to-make', 'course...",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6,173.4,18.0,0.0,17.0,22.0,35.0,1.0,4.176164
2,all in the kitchen chili,3,130,"['time-to-make', 'course', 'preparation', 'mai...",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13,269.8,22.0,32.0,48.0,39.0,27.0,5.0,
3,alouette potatoes,4,45,"['60-minutes-or-less', 'time-to-make', 'course...",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11,368.1,17.0,10.0,2.0,14.0,8.0,20.0,4.480921
4,amish tomato ketchup for canning,5,190,"['weeknight', 'time-to-make', 'course', 'main-...",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8,352.9,1.0,337.0,23.0,3.0,0.0,28.0,4.57183


In [43]:
DVP_HIGH = 40.0
DVP_MED = 25.0
DVP_LOW = 10.0

def parse_pdv(dvp, multiplier):
    low = 0.0
    high = float("inf")
    match dvp:
        case "high":
            low = DVP_HIGH * multiplier
        case "med":
            high = DVP_HIGH * multiplier
            low = DVP_LOW * multiplier
        case "low":
            high = DVP_MED * multiplier
    return low, high

def getRecipesWithConfiguration(calories=None, daily=2000, fat="NULL", sat_fat="NULL", sugar="NULL", sodium="NULL", protein="NULL", carbs="NULL", tags=[]):
    
    high_calorie_lim = float("inf")
    low_calorie_lim = 0

    multiplier = 2000 / daily

    if calories != None:
        high_calorie_lim = max(calories+100, calories * 1.2)
        low_calorie_lim = min(calories-100, calories * 0.8)        
    
    low_fat_lim, high_fat_lim = parse_pdv(fat, multiplier)
    low_sat_fat_lim, high_sat_fat_lim = parse_pdv(sat_fat, multiplier)
    low_sugar_lim, high_sugar_lim = parse_pdv(sugar, multiplier)
    low_sodium_lim, high_sodium_lim = parse_pdv(sodium, multiplier)
    low_protein_lim, high_protein_lim = parse_pdv(protein, multiplier)
    low_carbs_lim, high_carbs_lim = parse_pdv(carbs, multiplier)

    recipes_filter = ((low_calorie_lim <= recipes['calories']) & (recipes['calories'] <= high_calorie_lim) &
                      (low_fat_lim <= recipes['total fat (PDV)']) & (recipes['total fat (PDV)'] <= high_fat_lim) &
                      (low_sat_fat_lim <= recipes['saturated fat (PDV)']) & (recipes['saturated fat (PDV)'] <= high_sat_fat_lim) &
                      (low_sugar_lim <= recipes['sugar (PDV)']) & (recipes['sugar (PDV)'] <= high_sugar_lim) &
                      (low_sodium_lim <= recipes['sodium (PDV)']) & (recipes['sodium (PDV)'] <= high_sodium_lim) &
                      (low_protein_lim <= recipes['protein (PDV)']) & (recipes['protein (PDV)'] <= high_protein_lim) &
                      (low_carbs_lim <= recipes['carbohydrates (PDV)']) & (recipes['carbohydrates (PDV)'] <= high_carbs_lim))

    tags_filter = pd.Series(True, index=recipes.index)

    if tags:
        for tag in tags:
            if type(tag) != str:
                continue
            tags_filter = tags_filter & recipes['tags'].str.contains(tag, case=False)

        recipes_filter = recipes_filter & tags_filter
    
    recipes_found = recipes[recipes_filter]

    recipes_found_sorted = recipes_found.sort_values(by='bayesian_avg', ascending=False)

    return recipes_found_sorted

In [45]:
results = getRecipesWithConfiguration(calories=1000, fat="mid", sugar="low", sodium="mid", protein="high", carbs="low", tags=['vegan'])
print(len(results))
results[:10]

8


Unnamed: 0,name,id,minutes,tags,n_steps,steps,description,ingredients,n_ingredients,calories,total fat (PDV),sugar (PDV),sodium (PDV),protein (PDV),saturated fat (PDV),carbohydrates (PDV),bayesian_avg
222493,veggie lunch meat vegan dad,222494,145,"['time-to-make', 'course', 'main-ingredient', ...",17,"['method', 'get water steaming in your steamer...",from vegan dad blog -- excellent for sack lunc...,"['white beans', 'water', 'oil', 'salt', 'papri...",13,834.4,86.0,6.0,209.0,43.0,37.0,22.0,4.637702
109758,huh huh huh huhmmos,109759,17,"['30-minutes-or-less', 'time-to-make', 'course...",14,"['combine the beans , tahini , 2 tablespoons o...","yep, another one. this time i replaced regular...","['garbanzo beans', 'tahini', 'olive oil', 'gar...",9,937.8,103.0,4.0,31.0,41.0,45.0,23.0,4.607511
213956,toasted nori with sesame seeds,213957,20,"['30-minutes-or-less', 'time-to-make', 'course...",8,['toast the nori by passing each sheet repeate...,you can use these just like gomashio. mix it w...,"['nori', 'white sesame seeds', 'sea salt', 're...",4,836.6,111.0,3.0,388.0,51.0,50.0,11.0,4.57183
214013,toasted sunflower seeds,214014,13,"['15-minutes-or-less', 'time-to-make', 'course...",5,"['preheat oven to 325 degrees', 'mix sunflower...",a great little snack for everyone.,"['sunflower seeds', 'oil', 'salt']",3,857.4,117.0,14.0,24.0,58.0,34.0,9.0,4.57183
220351,valencia peanut and flax seed butter,220352,25,"['30-minutes-or-less', 'time-to-make', 'course...",9,['roast the peanuts either on the stove top or...,a trader joes' copycat recipe. their version i...,"['peanuts', 'flax seed', 'salt', 'oil', 'sugar']",5,1052.1,137.0,25.0,1.0,90.0,57.0,11.0,4.57183
45917,chicken style seitan,45918,120,"['time-to-make', 'course', 'main-ingredient', ...",20,"['preheat oven to 350f', 'line a 8 inch square...",this recipe was derived from a larger food net...,"['canola oil', 'onion', 'garlic', 'gluten flou...",10,892.3,53.0,7.0,94.0,203.0,13.0,17.0,4.524177
100077,grilled tempeh steak,100078,30,"['30-minutes-or-less', 'time-to-make', 'course...",8,"['score tempeh on both sides , making shallow ...",by marinating the tempeh the day before you pl...,"['tempeh', 'marinade', 'rice vinegar', 'soy sa...",12,814.7,78.0,2.0,76.0,119.0,46.0,14.0,4.480921
153101,parmesan style cheese vegan,153102,5,"['weeknight', '15-minutes-or-less', 'time-to-m...",3,['place all ingredients in food processor and ...,a great recipe from melody pretttyman. this ta...,"['blanched almond', 'nutritional yeast flakes'...",5,953.7,107.0,24.0,34.0,102.0,36.0,18.0,


In [40]:
results = getRecipesWithConfiguration(calories=600, protein="high", fat="mid", tags=['breakfast', '15-minutes-or-less'])
print(len(results))
results.head()

136


Unnamed: 0,name,id,minutes,tags,n_steps,steps,description,ingredients,n_ingredients,calories,total fat (PDV),sugar (PDV),sodium (PDV),protein (PDV),saturated fat (PDV),carbohydrates (PDV),bayesian_avg
2126,a yummy twist on sausage rolls,2127,15,"['15-minutes-or-less', 'time-to-make', 'course...",11,['unroll crescent rolls and pinch together to ...,i got this recipe from a friend of mine whose ...,"['sausage meat', 'crescent rolls', 'prepared m...",4,519.1,51.0,9.0,42.0,45.0,54.0,9.0,4.607511
2273,abs diet super food oatmeal,2274,7,"['15-minutes-or-less', 'time-to-make', 'course...",2,['mix the first seven ingeredients and microwa...,this recipe is from the abs diet,"['oatmeal', 'milk', 'egg', 'raspberries', 'alm...",9,613.6,36.0,49.0,9.0,53.0,41.0,25.0,4.607511
2512,adam eve sandwich,2513,15,"['ham', '15-minutes-or-less', 'time-to-make', ...",6,['toast english muffins and butter immediately...,"my family likes these for breakfast, lunch or ...","['english muffins', 'ham', 'hard-boiled eggs',...",7,639.0,67.0,12.0,34.0,66.0,129.0,9.0,4.518133
6554,apple cinnamon oats for surelim,6555,15,"['15-minutes-or-less', 'time-to-make', 'course...",1,['mix all ingredients and cook on a low heat s...,i was doing the sureslim diet and needed to ma...,"['rolled oats', 'milk', 'granny smith apple', ...",5,539.7,22.0,62.0,5.0,42.0,33.0,28.0,4.57183
8484,ari s eggs and onions and salmon and cream cheese,8485,7,"['15-minutes-or-less', 'time-to-make', 'course...",13,"['break eggs into mixing bowl', 'add cream che...",i wanted to make my son a quick delicious brea...,"['eggs', 'butter', 'onion', 'smoked salmon', '...",6,648.5,84.0,12.0,32.0,62.0,146.0,2.0,4.57183


In [14]:
user_interactions.head()

Unnamed: 0,user_id,recipe_id,rating
0,1,225878,4
1,2,225878,5
2,3,71600,4
3,4,14907,5
4,5,14907,5


In [15]:
user_interactions.describe()

Unnamed: 0,user_id,recipe_id,rating
count,966111.0,966111.0,966111.0
mean,9391.486762,114691.657311,4.529013
std,13530.586557,66961.314904,1.058641
min,1.0,1.0,0.0
25%,610.0,58218.0,4.0
50%,2908.0,114819.0,5.0
75%,12233.0,171831.0,5.0
max,60314.0,231635.0,5.0


In [12]:
interactions_reader = Reader(line_format="user item rating", sep=',', rating_scale=(0, 5), skip_lines=1)
interactions_data = Dataset.load_from_file("Data/Interactions.csv", interactions_reader)

In [13]:
interactions_trainset = interactions_data.build_full_trainset()

In [33]:
svd_sim_options = {
    "n_factors": [20, 50, 100, 200, 500],
    "n_epochs": [10, 20, 40, 100],
    "biased": [True, False],
    "lr_all": [0.0005, 0.005, 0.05, 0.5],
    "reg_all": [0.002, 0.02, 0.2]
}

In [35]:
svd_gs = GridSearchCV(SVD, svd_sim_options, measures=["rmse", "mae"], cv=5, n_jobs=-1, joblib_verbose=1)

In [36]:
svd_gs.fit(interactions_data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   24.5s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed: 21.9min
[Parallel(n_jobs=-1)]: Done 1218 tasks      | elapsed: 42.0min
[Parallel(n_jobs=-1)]: Done 1768 tasks      | elapsed: 76.4min
[Parallel(n_jobs=-1)]: Done 2400 out of 2400 | elapsed: 191.8min finished


In [38]:
print(svd_gs.best_score)

{'rmse': 1.0088828446129101, 'mae': 0.47107160727416597}


In [39]:
print(svd_gs.best_params)

{'rmse': {'n_factors': 20, 'n_epochs': 20, 'biased': True, 'lr_all': 0.005, 'reg_all': 0.2}, 'mae': {'n_factors': 500, 'n_epochs': 20, 'biased': True, 'lr_all': 0.5, 'reg_all': 0.002}}


In [61]:
cross_validate(CoClustering(), interactions_data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm CoClustering on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.1277  1.1262  1.1288  1.1315  1.1286  1.1286  0.0017  
MAE (testset)     0.6320  0.6306  0.6296  0.6328  0.6322  0.6314  0.0012  
Fit time          28.20   27.80   27.87   28.21   28.70   28.16   0.32    
Test time         2.08    1.98    1.98    2.07    0.77    1.78    0.51    


{'test_rmse': array([1.12772181, 1.12616034, 1.12875994, 1.13152073, 1.12862757]),
 'test_mae': array([0.63196315, 0.63060332, 0.62961786, 0.63284676, 0.63216222]),
 'fit_time': (28.19971752166748,
  27.797827005386353,
  27.869712352752686,
  28.20687246322632,
  28.701749324798584),
 'test_time': (2.0848894119262695,
  1.979933500289917,
  1.9849200248718262,
  2.071107864379883,
  0.770427942276001)}

In [58]:
cross_validate(NMF(), interactions_data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.2128  1.2100  1.2060  1.2079  1.2089  1.2091  0.0023  
MAE (testset)     0.8190  0.8198  0.8213  0.8177  0.8195  0.8195  0.0012  
Fit time          31.07   31.54   31.69   30.76   31.28   31.27   0.33    
Test time         1.81    1.90    1.74    0.73    0.71    1.38    0.54    


{'test_rmse': array([1.21278714, 1.21002853, 1.20597892, 1.20788336, 1.20886785]),
 'test_mae': array([0.81901635, 0.81980505, 0.82130037, 0.81768266, 0.81952913]),
 'fit_time': (31.069397926330566,
  31.544697523117065,
  31.691863536834717,
  30.758241176605225,
  31.28497004508972),
 'test_time': (1.80690336227417,
  1.9009323120117188,
  1.7432210445404053,
  0.7293896675109863,
  0.7133259773254395)}

In [40]:
svdpp = SVDpp()

In [42]:
svdpp.fit(interactions_trainset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x29613c9bf50>

In [45]:
cross_validate(svdpp, interactions_data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0242  1.0215  1.0203  1.0170  1.0239  1.0214  0.0026  
MAE (testset)     0.6017  0.5998  0.6007  0.5978  0.5997  0.5999  0.0013  
Fit time          538.21  531.07  533.73  533.60  542.21  535.76  3.96    
Test time         63.88   62.46   63.48   62.01   61.01   62.57   1.03    


{'test_rmse': array([1.02420445, 1.02152263, 1.02027472, 1.01698547, 1.02386378]),
 'test_mae': array([0.60167718, 0.59975385, 0.60069562, 0.59784337, 0.5996951 ]),
 'fit_time': (538.2060167789459,
  531.0698781013489,
  533.7280797958374,
  533.6034462451935,
  542.2100763320923),
 'test_time': (63.88419532775879,
  62.46167182922363,
  63.479918479919434,
  62.01494216918945,
  61.00875973701477)}

In [48]:
svd = SVD(n_factors=20, n_epochs=20, biased=True, lr_all=0.005, reg_all=0.2)

In [49]:
cross_validate(svd, interactions_data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0111  1.0100  1.0094  1.0015  1.0139  1.0092  0.0041  
MAE (testset)     0.6042  0.6051  0.6038  0.6016  0.6052  0.6040  0.0013  
Fit time          6.19    6.67    6.97    6.74    6.71    6.66    0.26    
Test time         1.49    0.86    1.57    0.86    1.53    1.26    0.33    


{'test_rmse': array([1.01110033, 1.00997953, 1.00937236, 1.00152769, 1.01391235]),
 'test_mae': array([0.60421846, 0.60513131, 0.60377468, 0.60158843, 0.60522776]),
 'fit_time': (6.1899402141571045,
  6.674145698547363,
  6.972915887832642,
  6.736252546310425,
  6.710755109786987),
 'test_time': (1.4884147644042969,
  0.8562014102935791,
  1.565445899963379,
  0.8630378246307373,
  1.5334908962249756)}

In [47]:
cross_validate(svd, interactions_data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.1574  1.1550  1.1638  1.1640  1.1514  1.1583  0.0049  
MAE (testset)     0.4695  0.4706  0.4731  0.4733  0.4690  0.4711  0.0018  
Fit time          20.87   21.52   21.02   21.53   21.54   21.30   0.29    
Test time         1.53    1.59    0.89    1.57    0.93    1.30    0.32    


{'test_rmse': array([1.15737772, 1.15499432, 1.16376819, 1.16398416, 1.15137381]),
 'test_mae': array([0.46950155, 0.47058683, 0.47309676, 0.47328702, 0.4689518 ]),
 'fit_time': (20.86619520187378,
  21.524942636489868,
  21.015730142593384,
  21.532662630081177,
  21.54482936859131),
 'test_time': (1.5342495441436768,
  1.5948662757873535,
  0.8851385116577148,
  1.5733850002288818,
  0.9324884414672852)}