In [235]:
import pandas as pd

import re

import math
import scipy.stats as st

from time import time
import copy
import pickle

<h4><b>What is the Bayesian average?</b></h4>
The Bayesian average uses two constants to offset the arithmetic average of an individual product. This is important as otherwise products with only one five star rating are affoarding the same quality ranking as products with thousands of five star reviews. To account for differences in certainty, the rating for recipes with less than a critical number of reviews are adjusted, whlile recipes above this threshold are only very slightly adjusted. The critical value (C) is the number of reviews of the 25% quartile, which for our dataset is 1. The formula for the Bayesian average (r bar) is shown below where r and c are the rating and rating count for an individual recipe and C and R are critical threshold and average rating.

<h3></h3>
<h3>Importing data</h3>

In [236]:
def import_data():
    #cleaned recipes
    recipes = pd.read_csv('PP_recipes.csv')
    del recipes['i']
    del recipes['name_tokens']
    del recipes['ingredient_tokens']
    del recipes['steps_tokens']
    recipes = recipes.set_index('id')

    #ratings
    ratings = pd.read_csv('RAW_interactions.csv')
    del ratings['user_id']
    del ratings['date']
    del ratings['review']
    ratings = ratings.set_index('recipe_id')

    #raw recipe info
    raw_recipes = pd.read_csv('RAW_Recipes.csv')
    del raw_recipes['contributor_id']
    del raw_recipes['submitted']
    del raw_recipes['tags']
    del raw_recipes['steps']
    del raw_recipes['description']
    raw_recipes = raw_recipes.set_index('id')

    return (recipes, raw_recipes, ratings)

In [237]:
recipes, raw_recipes, ratings = import_data()

In [238]:
ratings

Unnamed: 0_level_0,rating
recipe_id,Unnamed: 1_level_1
40893,4
40893,5
44394,4
85009,5
85009,5
...,...
72730,0
386618,5
78003,5
78003,4


In [239]:
raw_recipes

Unnamed: 0_level_0,name,minutes,nutrition,n_steps,ingredients,n_ingredients
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
137739,arriba baked winter squash mexican style,55,"[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['winter squash', 'mexican seasoning', 'mixed ...",7
31490,a bit different breakfast pizza,30,"[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['prepared pizza crust', 'sausage patty', 'egg...",6
112140,all in the kitchen chili,130,"[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['ground beef', 'yellow onions', 'diced tomato...",13
59389,alouette potatoes,45,"[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,"['spreadable cheese with garlic and herbs', 'n...",11
44061,amish tomato ketchup for canning,190,"[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,"['tomato juice', 'apple cider vinegar', 'sugar...",8
...,...,...,...,...,...,...
486161,zydeco soup,60,"[415.2, 26.0, 34.0, 26.0, 44.0, 21.0, 15.0]",7,"['celery', 'onion', 'green sweet pepper', 'gar...",22
493372,zydeco spice mix,5,"[14.8, 0.0, 2.0, 58.0, 1.0, 0.0, 1.0]",1,"['paprika', 'salt', 'garlic powder', 'onion po...",13
308080,zydeco ya ya deviled eggs,40,"[59.2, 6.0, 2.0, 3.0, 6.0, 5.0, 0.0]",7,"['hard-cooked eggs', 'mayonnaise', 'dijon must...",8
298512,cookies by design cookies on a stick,29,"[188.0, 11.0, 57.0, 11.0, 7.0, 21.0, 9.0]",9,"['butter', 'eagle brand condensed milk', 'ligh...",10


In [240]:
recipes

Unnamed: 0_level_0,techniques,calorie_level,ingredient_ids
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
424415,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[389, 7655, 6270, 1527, 3406]"
146223,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[2683, 4969, 800, 5298, 840, 2499, 6632, 7022,..."
312329,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",1,"[1257, 7655, 6270, 590, 5024, 1119, 4883, 6696..."
74301,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[7940, 3609, 7060, 6265, 1170, 6654, 5003, 3561]"
76272,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[3484, 6324, 7594, 243]"
...,...,...,...
323143,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,"[840, 208, 2499, 2683, 1925, 335, 1511]"
149114,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[2499, 4717, 1168, 6270, 6324, 7040]"
34200,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",2,"[2378, 7655, 3219, 2320, 5168, 5319, 4189, 268..."
30618,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[5627, 2807, 5412, 3399, 7979, 1093, 1257, 780..."


<h3>Ratings</h3>

In [241]:
#checking for outliers
print(f'Count of ratings > 5: {(ratings > 5).sum()[0]}')
print(f'Count of ratings < 0: {(ratings < 0).sum()[0]}')

Count of ratings > 5: 0
Count of ratings < 0: 0


In [242]:
#two methods for re-scaling reviews based on review number 

#bayesian ci
from scipy.stats import bayes_mvs
mean, _, _, = bayes_mvs(test, alpha=.95)
mean.statistic

def bayes_ci(n,rating,alpha=0.95):
    mean,_,_ = bayes_mvs(n,alpha)
    return mean.statistic

#bayesian average
avg = ratings.mean()
count = ratings.count()
C = (ratings.reset_index().groupby('recipe_id').agg('size').sort_values().quantile(.25))
m = ratings.mean().rating

def bayes_avg(n):
    avg = n.mean()
    count = n.count()
    return (avg*count+C*m)/(count+C)

In [243]:
ratings_ = copy.copy(ratings)

In [244]:
q = ratings.quantile(0.95)
C = ratings.mean()

def weighted_rating(s, m=q, c=C):
    R = s.mean()
    v = s.count()
    return ((v/(v+m) * R) + (m/(m+v) * C))[0]

In [245]:
#scale ratings
def scale_ratings(ratings):
    ratings = ratings.reset_index()
    ratings = ratings.groupby('recipe_id').agg({'rating':bayes_avg})
    #ratings = ratings.reset_index()
    #df = df.drop_duplicates(subset='recipe_id')
    return ratings #df.set_index('recipe_id')

<h3>Creating reference dataset</h3>

In [246]:
df_ref = raw_recipes[['name','ingredients']]
df_ref

Unnamed: 0_level_0,name,ingredients
id,Unnamed: 1_level_1,Unnamed: 2_level_1
137739,arriba baked winter squash mexican style,"['winter squash', 'mexican seasoning', 'mixed ..."
31490,a bit different breakfast pizza,"['prepared pizza crust', 'sausage patty', 'egg..."
112140,all in the kitchen chili,"['ground beef', 'yellow onions', 'diced tomato..."
59389,alouette potatoes,"['spreadable cheese with garlic and herbs', 'n..."
44061,amish tomato ketchup for canning,"['tomato juice', 'apple cider vinegar', 'sugar..."
...,...,...
486161,zydeco soup,"['celery', 'onion', 'green sweet pepper', 'gar..."
493372,zydeco spice mix,"['paprika', 'salt', 'garlic powder', 'onion po..."
308080,zydeco ya ya deviled eggs,"['hard-cooked eggs', 'mayonnaise', 'dijon must..."
298512,cookies by design cookies on a stick,"['butter', 'eagle brand condensed milk', 'ligh..."


<h3>Creating clustering dataset</h3>

In [247]:
#function to time fits
def timer_func(func):
    def wrap_func(*args, **kwargs):
        t1 = time()
        result = func(*args, **kwargs)
        t2 = time()
        print(f'Function {func.__name__!r} executed in {(t2-t1):.4f}s')
        return result
    return wrap_func

In [248]:
#merging data into single usable df
def merge_frames(raw_recipes, ratings):
    return recipes.merge(raw_recipes[['minutes','nutrition','n_steps','n_ingredients']],how='inner',left_index=True,right_index=True).merge(ratings,how='inner',left_index=True,right_index=True)

In [249]:
#convert string that looks lke a list of numbers to a list of 
def string_to_list(s):
    l = re.findall(r'\d+',s)
    l = [int(num) for num in l]
    return l

In [250]:
#convert one list column to encoded columns
def transform_to_sparse(df,key,size,encoded=False):
    df[key]=df[key].apply(string_to_list)
    t_names = [str(n)+' '+key for n in range(size)]
    data = pd.DataFrame(df[key].to_list(), columns=t_names,index=df.index.values)
    if encoded:
        return data 
    else: 
        data = pd.get_dummies(data.stack(),sparse=True)
        data = data.groupby(level=0).sum()
    return data

In [251]:
#convert all keys to encoded columns
def transform_all_to_sparse(df,keys,sizes,encoded):
    #encode
    for n,key in enumerate(keys):
        data = transform_to_sparse(df,key,sizes[n],encoded[n])
        yield data 
        #delete original columns
        try:
            df.pop(key)
        except:
            pass

In [252]:
#combine all encoded columns into one matricies with other data
def concat_df(g,df):
    df_concat = pd.concat(g,axis=1)
    df = pd.concat([df_concat,df],axis=1)

    with open('concat_df.pkl','wb') as f:
        pickle.dump(df,f)
    
    return df


In [253]:
#class to do all 
class CleanAndTransform():
    def __init__(self,keys=['default'],sizes=(100),encoded=(False)):
        self.recipes, self.raw_recipes, self.ratings = import_data()   
        self.ratings = scale_ratings(self.ratings)
        self.df = merge_frames(self.raw_recipes, self.ratings)
        self.sizes = sizes
        self.encoded = encoded
        self.keys = keys

    def __call__(self,length=None):
        self.df = self.df[:length]
        self.g = transform_all_to_sparse(self.df,self.keys,self.sizes,self.encoded)
        df = concat_df(self.g,self.df)
        return df

In [254]:
keys  = ['techniques','ingredient_ids','nutrition']
sizes = (58,20,14)
encoded = (True,False,False)

transformer = CleanAndTransform(keys,sizes,encoded)
df = transformer(length=100)
df

Unnamed: 0,0 techniques,1 techniques,2 techniques,3 techniques,4 techniques,5 techniques,6 techniques,7 techniques,8 techniques,9 techniques,...,2271,2356,3177,3732,4060,calorie_level,minutes,n_steps,n_ingredients,rating
424415,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,61,6,5,4.654403
146223,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,55,10,12,4.691117
312329,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,25,6,15,4.335057
74301,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,15,8,8,4.753931
76272,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,20,5,4,4.649438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190651,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,120,8,8,4.438776
338407,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,40,14,7,4.617680
454323,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,12,13,4,4.519865
160330,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,28,8,4,4.335057


In [255]:
df

Unnamed: 0,0 techniques,1 techniques,2 techniques,3 techniques,4 techniques,5 techniques,6 techniques,7 techniques,8 techniques,9 techniques,...,2271,2356,3177,3732,4060,calorie_level,minutes,n_steps,n_ingredients,rating
424415,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,61,6,5,4.654403
146223,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,55,10,12,4.691117
312329,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,25,6,15,4.335057
74301,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,15,8,8,4.753931
76272,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,20,5,4,4.649438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190651,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,120,8,8,4.438776
338407,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,40,14,7,4.617680
454323,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,12,13,4,4.519865
160330,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,28,8,4,4.335057


In [256]:
#check pickled df 
'''
# with open('concat_df.pkl','rb') as f:
    y = pickle.load(f)
y
'''

"\n# with open('concat_df.pkl','rb') as f:\n    y = pickle.load(f)\ny\n"