In [None]:
import copy
import functools
import itertools
import math
import pickle
import re
from collections import Counter
from collections import deque
from time import perf_counter
from time import time

import missingno as msno
import numpy as np
import pandas as pd
import scipy.stats as st
from scipy.sparse import csr_matrix
from scipy.stats import bayes_mvs
from sklearn.preprocessing import StandardScaler

In [None]:
!ls

<h4><b>What is the Bayesian average?</b></h4>
The Bayesian average uses two constants to offset the arithmetic average of an individual product. This is important because otherwise a product with a single five-star rating would be afforded the same quality ranking as a product with thousands of five-star reviews. To account for these differences in certainty, the rating of a recipe with fewer than a critical number of reviews is adjusted noticeably, while recipes well above this threshold are adjusted only very slightly. The critical value ($C$) is the first quartile (25th percentile) of the number of reviews per recipe, which for our dataset is 1. The formula for the Bayesian average ($\bar{r}$) is shown below, where $r$ and $c$ are the average rating and the rating count of an individual recipe, and $C$ and $R$ are the critical threshold and the average rating over all reviews.

$$\bar{r} = \frac{r\,c + R\,C}{c + C}$$

<h3></h3>
<h3>Importing data</h3>

In [None]:
def import_data():
    """Read the three recipe CSV files from the working directory and trim them.

    Returns:
        tuple: ``(recipes, raw_recipes, ratings)`` where

        * ``recipes`` is ``PP_recipes.csv`` without the ``i`` column and the
          three ``*_tokens`` columns, indexed by ``id``;
        * ``raw_recipes`` is ``RAW_recipes.csv`` without ``contributor_id``,
          ``submitted``, ``tags``, ``steps`` and ``description``, indexed by
          ``id``;
        * ``ratings`` is ``RAW_interactions.csv`` without ``user_id``,
          ``date`` and ``review``, indexed by ``recipe_id``.
    """
    def _read_trimmed(path, unwanted, index_col):
        # load, discard the columns that are not needed, then index the frame
        return pd.read_csv(path).drop(columns=unwanted).set_index(index_col)

    #cleaned recipes
    recipes = _read_trimmed(
        'PP_recipes.csv',
        ['i', 'name_tokens', 'ingredient_tokens', 'steps_tokens'],
        'id',
    )

    #ratings
    ratings = _read_trimmed(
        'RAW_interactions.csv',
        ['user_id', 'date', 'review'],
        'recipe_id',
    )

    #raw recipe info
    raw_recipes = _read_trimmed(
        'RAW_recipes.csv',
        ['contributor_id', 'submitted', 'tags', 'steps', 'description'],
        'id',
    )

    return (recipes, raw_recipes, ratings)

In [None]:
# Load once. The underscore names hold the frames exactly as loaded; the plain
# names (`recipes`, `raw_recipes`, `ratings`) are the working copies that the
# cells below read, so they must be bound here for a fresh-kernel run to work.
recipes_, raw_recipes_, ratings_ = import_data()
recipes, raw_recipes, ratings = recipes_.copy(), raw_recipes_.copy(), ratings_.copy()

In [None]:
# Show the ratings frame as this cell's output.
ratings

In [None]:
# missingno matrix plot for the ratings frame (visual check for missing values).
msno.matrix(ratings)

In [None]:
# Show the raw recipe frame as this cell's output.
raw_recipes

In [None]:
# missingno matrix plot for the raw recipe frame; the trailing ';' hides the returned object's repr.
msno.matrix(raw_recipes);

In [None]:
# Show the pre-processed recipe frame as this cell's output.
recipes

In [None]:
# missingno matrix plot for the pre-processed recipe frame (visual check for missing values).
msno.matrix(recipes)

<h3>Ratings</h3>

In [None]:
#checking for outliers: count ratings that fall outside the 0-5 range
# `.sum()` returns a Series labelled by column name, so the first entry is
# taken positionally with `.iloc[0]` (plain `[0]` on a label-indexed Series
# relies on a deprecated positional fallback).
print(f'Count of ratings > 5: {(ratings > 5).sum().iloc[0]}')
print(f'Count of ratings < 0: {(ratings < 0).sum().iloc[0]}')

In [None]:
#two methods for re-scaling reviews based on review number 

#bayesian ci
def bayes_ci(n,rating,alpha=0.95):
    """Return the Bayesian point estimate of the mean of the samples ``n``.

    Args:
        n: sample values handed to ``scipy.stats.bayes_mvs``.
        rating: not used by the body; kept in the signature for callers.
        alpha: probability forwarded to ``bayes_mvs`` for its interval.

    Returns:
        The ``statistic`` field of the mean estimate; the variance and
        standard-deviation estimates (and all interval bounds) are discarded.
    """
    mean_estimate = bayes_mvs(n, alpha)[0]
    return mean_estimate.statistic

#bayesian average
avg = ratings.mean()
count = ratings.count()
# C: prior weight = 25th percentile of the number of reviews per recipe
C = ratings.reset_index().groupby('recipe_id').size().quantile(.25)
# m: prior mean = mean of all individual ratings
m = ratings.mean().rating

def bayes_avg(n, prior_weight=C, prior_mean=m):
    """Bayesian average of one recipe's ratings.

    Computes ``(mean*count + prior_weight*prior_mean) / (count + prior_weight)``.

    The priors are bound as defaults when this cell runs instead of being read
    from the globals ``C``/``m`` at call time: another cell in this notebook
    rebinds ``C``, which would otherwise silently change the result here.

    Args:
        n: the ratings of a single recipe (anything with ``mean()``/``count()``).
        prior_weight: number of "virtual" reviews given to the prior (``C``).
        prior_mean: rating assigned to those virtual reviews (``m``).

    Returns:
        The shrunken average rating for the recipe.
    """
    group_mean = n.mean()
    group_count = n.count()
    return (group_mean*group_count + prior_weight*prior_mean)/(group_count + prior_weight)

In [None]:
# Keep a copy of the current ratings frame under `ratings_`
# (this rebinds `ratings_`, which the import cell above also assigns).
ratings_ = copy.copy(ratings)

In [None]:
# Alternative shrinkage: R*v/(v+m) + c*m/(v+m).
# NOTE(review): `q` is the 95th percentile of the rating *values*; if `m` is
# meant to be a review-count threshold this should be computed from the
# per-recipe review counts instead — confirm.
q = ratings.quantile(0.95)
# Stored under its own name: rebinding the global `C` here would change the
# prior weight that `bayes_avg` relies on.
mean_rating = ratings.mean()

def weighted_rating(s, m=q, c=mean_rating):
    """Blend a recipe's mean rating with the overall mean rating.

    Args:
        s: the ratings of a single recipe (anything with ``mean()``/``count()``).
        m: weight given to the overall mean (defaults to ``q``).
        c: overall mean rating the result is pulled towards.

    Returns:
        A scalar: ``(v/(v+m))*R + (m/(m+v))*c`` with ``R = s.mean()`` and
        ``v = s.count()``. When the operands are one-element Series/arrays the
        single value is extracted.
    """
    R = s.mean()
    v = s.count()
    # use the `c` argument (the original body ignored it and read a global)
    blended = (v/(v+m) * R) + (m/(m+v) * c)
    # works for both scalar and one-element Series results, without relying
    # on positional `[0]` indexing of a label-indexed Series
    return np.ravel(blended)[0]

In [None]:
#scale ratings
def scale_ratings(ratings):
    """Collapse per-review ratings into one Bayesian-averaged rating per recipe.

    Args:
        ratings: frame with a ``rating`` column whose ``recipe_id`` becomes a
            regular column after ``reset_index()``.

    Returns:
        DataFrame indexed by ``recipe_id`` with a single ``rating`` column
        computed by ``bayes_avg`` for each recipe.
    """
    per_recipe = ratings.reset_index().groupby('recipe_id')
    return per_recipe.agg({'rating': bayes_avg})

<h3>Creating reference dataset</h3>

In [None]:
# Reference table: only the `name` and `ingredients` columns of the raw recipes.
df_ref = raw_recipes[['name','ingredients']]
# show it as the cell output
df_ref

<h4>Encoding and normalizing features</h4>

In [None]:
def scale_column(df,key):
    """Standardise one column of ``df`` with ``StandardScaler``.

    Args:
        df: source frame.
        key: name of the column to scale.

    Returns:
        A new single-column DataFrame named ``key``. It is built from the
        scaled array only, so it carries a fresh default index rather than
        ``df``'s index.
    """
    # StandardScaler expects a 2-D array: one row per sample, one feature column
    column_2d = np.array(df[key]).reshape(-1, 1)
    standardized = StandardScaler().fit_transform(column_2d)
    return pd.DataFrame(standardized, columns=[key])

In [None]:
cols_to_normalize = ['calorie_level','minutes','n_steps','n_ingredients','rating']

def normalize_select_cols(df,cols_to_normalize):
    """Standardise the listed columns of ``df`` in place.

    Args:
        df: frame to modify; each listed column is overwritten.
        cols_to_normalize: names of the columns to pass through ``scale_column``.

    Returns:
        The same ``df`` object, after modification.
    """
    for name in cols_to_normalize:
        scaled = scale_column(df, name)
        # assign the raw values so the scaled frame's default index is ignored
        df[name] = scaled.values
    return df

<h3>Creating clustering dataset</h3>

In [None]:
#function to time fits
def timer_func(func):
    """Decorator that prints how long each call to ``func`` takes.

    The wrapper forwards all positional and keyword arguments, returns
    ``func``'s result unchanged, and keeps ``func``'s name and docstring
    (via ``functools.wraps``) so decorated callables stay introspectable.
    Nothing is printed when ``func`` raises.
    """
    @functools.wraps(func)
    def wrap_func(*args, **kwargs):
        # perf_counter is the clock intended for measuring elapsed intervals;
        # unlike time() it is not affected by system clock adjustments
        t1 = perf_counter()
        result = func(*args, **kwargs)
        t2 = perf_counter()
        print(f'Function {func.__name__!r} executed in {(t2-t1):.4f}s')
        return result
    return wrap_func

In [None]:
#merging data into single usable df
def merge_frames(recipes,raw_recipes, ratings):
    return recipes.merge(raw_recipes[['minutes','nutrition','n_steps','n_ingredients']],how='inner',left_index=True,right_index=True).merge(ratings,how='inner',left_index=True,right_index=True)

In [None]:
#convert a string that looks like a list of numbers to a list of ints
def string_to_list(s):
    """Extract every run of digits in ``s`` as an ``int``.

    Only digit runs are matched: signs and decimal points act as separators,
    so ``'51.5'`` yields ``[51, 5]`` and ``'-3'`` yields ``[3]``.

    Args:
        s: text such as ``'[1, 22, 333]'``.

    Returns:
        list[int]: the numbers in order of appearance (empty if none).
    """
    return [int(token) for token in re.findall(r'\d+', s)]

In [None]:
#get list of top ingredients 
def get_top(recipes=None,n_ingred=1000,show=False):
    """Count ingredient ids across recipes and return the most frequent ones.

    Args:
        recipes: frame with an ``ingredient_ids`` column holding either lists
            of ids or their string form. When omitted, the notebook-level
            ``recipes`` frame is looked up at call time (a default bound at
            definition time would fail if ``recipes`` did not exist yet, and
            would otherwise keep pointing at a stale frame).
        n_ingred: how many of the most common ids to return.
        show: when true, print the number of distinct ids.

    Returns:
        tuple: ``(number of distinct ingredient ids, list of the n_ingred most
        common ids, most frequent first)``. An empty frame gives ``(0, [])``.

    Raises:
        NameError: if ``recipes`` is omitted and no global ``recipes`` exists.
    """
    if recipes is None:
        try:
            recipes = globals()['recipes']
        except KeyError:
            raise NameError(
                "get_top() needs a recipes frame: pass one or define `recipes` first"
            ) from None

    ids = recipes['ingredient_ids']
    # string-encoded lists are parsed; real lists are used as they are
    if len(ids) and isinstance(ids.iloc[0], str):
        ids = ids.apply(string_to_list)

    # count once and reuse for both the total and the ranking
    counts = Counter(itertools.chain.from_iterable(ids))
    total_num_ingred = len(counts)
    if show:
        print(f'Total ingredient number: {total_num_ingred}')
    top_ingredients = [ingredient for ingredient, _ in counts.most_common(n_ingred)]
    return total_num_ingred, top_ingredients

In [None]:
# Keep the ranked id list too: it is displayed below, and the original call
# discarded it into `_`, leaving `top_ingredients` undefined.
total_num_ingred, top_ingredients = get_top(recipes_,show=True)
top_ingredients

In [None]:
#filtering ingredients that aren't in the top ingredient list
def trim_ingredients(recipes,size=None,top_ingredients=None):
    """Keep the first ``size`` recipes and drop ingredient ids outside ``top_ingredients``.

    Args:
        recipes: frame with an ``ingredient_ids`` column holding lists of ids
            or their string form.
        size: number of leading rows to keep; a falsy value keeps every row,
            and a value larger than the frame is clamped to its length.
        top_ingredients: ids to keep. When omitted, the notebook-level
            ``top_ingredients`` list is looked up at call time.

    Returns:
        A new frame (the input is not modified) whose ``ingredient_ids``
        column holds the filtered lists.

    Raises:
        NameError: if ``top_ingredients`` is omitted and no global of that
            name exists.
    """
    if top_ingredients is None:
        try:
            top_ingredients = globals()['top_ingredients']
        except KeyError:
            raise NameError(
                "trim_ingredients() needs top_ingredients: pass it or define "
                "`top_ingredients` first"
            ) from None

    if not size:
        size = recipes['ingredient_ids'].shape[0]

    # set membership is O(1); the list version rescanned the list for every id
    allowed = set(top_ingredients)

    # explicit copy so the column assignment below never targets a view
    trimmed = recipes.iloc[:size, :].copy()

    def _as_ids(value):
        # accept both the CSV string form and already-parsed lists
        return string_to_list(value) if isinstance(value, str) else value

    filtered = [
        [i for i in _as_ids(value) if i in allowed]
        for value in trimmed['ingredient_ids']
    ]
    # object Series keeps one Python list per row, aligned on the row index
    trimmed['ingredient_ids'] = pd.Series(filtered, index=trimmed.index, dtype=object)
    return trimmed

In [None]:
#convert one list column to encoded columns
def transform_to_sparse(df,key,size,encoded=False,sparse=True):
    """Expand the list column ``df[key]`` into ``size`` positional columns, optionally one-hot encoded.

    Args:
        df: source frame; its index is reused for the result.
        key: name of the column holding lists (or their string form).
        size: number of positional columns to create, named ``'<n> <key>'``.
            Rows shorter than ``size`` are padded with missing values, so
            ``size`` only has to be an upper bound on the list length; a row
            longer than ``size`` still raises (from the DataFrame constructor).
        encoded: when true the values are already encoded and the positional
            frame is returned as is.
        sparse: forwarded to ``pd.get_dummies``.

    Returns:
        With ``encoded=True``: the ``size``-column positional frame.
        Otherwise: one indicator column per distinct value, summed per
        original row.
    """
    data = df[key]
    if len(data) and isinstance(data.iloc[0],str):
        data = data.apply(string_to_list)
    t_names = [f'{n} {key}' for n in range(size)]
    # Pad every row to exactly `size` entries. Without this the constructor
    # only accepts the data when the longest list happens to equal `size`.
    rows = [list(row) + [None] * (size - len(row)) for row in data]
    data = pd.DataFrame(rows, columns=t_names, index=df.index.values)
    if encoded:
        return data
    # one indicator column per distinct value, then collapse back to one row
    # per original index label
    # NOTE(review): a row with no values at all may be missing from the
    # result depending on how `stack()` treats missing entries — confirm.
    data = pd.get_dummies(data.stack(),sparse=sparse)
    return data.groupby(level=0).sum()

In [None]:
#convert all keys to encoded columns
def transform_all_to_sparse(df,keys,sizes,encoded=False,sparse=True):
    """Lazily encode several list columns, removing each source column from ``df``.

    This is a generator: for every key it yields the frame produced by
    ``transform_to_sparse`` and, once the consumer asks for the next item (or
    exhausts the generator), pops that key's original column from ``df`` in
    place.

    Args:
        df: frame holding the columns; it is modified in place.
        keys: column names to encode.
        sizes: positional-column count for each key (same order as ``keys``).
        encoded: one flag per key, or a single bool applied to every key.
        sparse: one flag per key, or a single bool applied to every key.

    Yields:
        One encoded DataFrame per key, in ``keys`` order.
    """
    keys = list(keys)

    def _per_key(flag):
        # the scalar defaults (False/True) are broadcast to one flag per key;
        # indexing a bare bool would raise a TypeError
        if isinstance(flag, (bool, np.bool_)):
            return [flag] * len(keys)
        return flag

    encoded = _per_key(encoded)
    sparse = _per_key(sparse)

    #encode
    for n,key in enumerate(keys):
        data = transform_to_sparse(df,key,sizes[n],encoded[n],sparse[n])
        yield data
        #delete original columns
        try:
            df.pop(key)
        except KeyError:
            # column already removed by the consumer; nothing left to delete
            pass

In [None]:
#combine all encoded columns into one matrix with the other data
def concat_df(g,df):
    """Join the encoded frames column-wise and append the columns of ``df``.

    Args:
        g: iterable of encoded frames (a generator is fine; it is consumed
            before ``df`` is concatenated).
        df: frame whose columns are placed to the right of the encoded ones.

    Returns:
        A single frame: encoded columns first, then ``df``'s columns, aligned
        on the index.
    """
    frames = [pd.concat(g, axis=1), df]
    return pd.concat(frames, axis=1)

<h3>Data cleaning pipeline</h3>

In [None]:
#class to do all 
class CleanAndTransform():
    """End-to-end cleaning pipeline: load, trim, rate, merge, normalise, then encode.

    Constructing an instance runs the cleaning steps and stores the merged
    frame on ``self.df``; calling the instance returns the encoded frame.

    The timing decorator is applied to the two methods rather than to the
    class: decorating the class would rebind ``CleanAndTransform`` to a plain
    function (breaking ``isinstance``/subclassing) and would time construction
    only.
    """

    @timer_func
    def __init__(self,n_ingred=1000,test_size=None):
        """Load the CSV files and build the cleaned, merged frame.

        Args:
            n_ingred: number of most common ingredient ids to keep.
            test_size: optional row cap, applied when trimming the recipes and
                again when encoding; ``None`` keeps every row.
        """
        #data attributes
        self.keys  = ['techniques','ingredient_ids','nutrition']
        # number of positional columns created for each key
        self.sizes = (58,19,14)
        # per key: True = values are already encoded, skip get_dummies
        self.encoded = (True,False,False)
        # per key: `sparse` flag forwarded to get_dummies
        self.sparse = (False, False, False)
        self.cols_to_normalize = ['calorie_level','minutes','n_steps','n_ingredients','rating']
        
        #control output dimensions
        self.n_ingred = n_ingred
        self.test_size = test_size
        
        #clean data
        self.recipes, self.raw_recipes, self.ratings = import_data()   
        _,self.top_ingredients = get_top(recipes=self.recipes,n_ingred=n_ingred,show=False)
        self.recipes = trim_ingredients(recipes=self.recipes,size=self.test_size,top_ingredients=self.top_ingredients)
        self.ratings = scale_ratings(self.ratings)
        self.df = merge_frames(self.recipes, self.raw_recipes, self.ratings)
        self.df = normalize_select_cols(self.df,self.cols_to_normalize)
        self.df = self.df.fillna(0)


    @timer_func
    def __call__(self):
        """Encode the list columns and return the final feature frame.

        Returns:
            The encoded columns followed by the remaining columns of the
            cleaned frame (the original list columns are removed).
        """
        #encode data
        # Work on a copy: transform_all_to_sparse pops the source columns from
        # the frame it is given, so encoding `self.df` directly would make a
        # second call fail on the missing columns.
        working = self.df[:self.test_size].copy()
        self.g = transform_all_to_sparse(working,self.keys,self.sizes,self.encoded,self.sparse)
        df = concat_df(self.g,working)
        return df

In [None]:
# Build the pipeline limited to 100 rows (test_size=100) ...
transform = CleanAndTransform(test_size=100)
# ... and call it to obtain the encoded feature frame.
df = transform()

In [None]:
# Persist the encoded frame to 'cleaned_data.pkl' in the working directory.
with open('cleaned_data.pkl','wb') as f:
    pickle.dump(df,f)

In [None]:
# Shape of the cleaned/encoded frame that was just pickled
# (`cleaned` is not defined anywhere in this notebook; `df` is that frame).
df.shape

In [None]:
# Print the column/dtype/memory summary of the encoded frame.
df.info()

In [None]:
@timer_func
def to_sparse(df):
    """Convert the values of ``df`` to a SciPy CSR sparse matrix (timed)."""
    return csr_matrix(df.values)

# `df_full` is never created in this notebook, so the conversion is run on the
# encoded frame `df` produced by the pipeline above.
to_sparse(df)

In [None]:
# `df_full` is never created in this notebook; summarise the encoded frame `df` instead.
df.info()

<h2> Dimensionality Reduction </h2>

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [None]:
# Show the encoded frame that is fed to PCA below.
df

In [None]:
# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share of the variance; random_state fixes the seed.
pca = PCA(n_components = 0.95,random_state=10)
# Column labels are cast to str before fitting (this modifies `df` in place).
# NOTE(review): presumably because scikit-learn rejects mixed-type column labels — confirm.
df.columns = df.columns.astype(str)
X_pca = pca.fit_transform(df)
# Report the column count before and after the projection.
print('PCA reduced dimensions from ', df.shape[1],' to ',X_pca.shape[1] ,' and preserved 95% of variance.')

In [None]:
# Bar chart of the variance share explained by each retained component.
# Uses an explicit figure/axes pair and fixes the x-axis label typo
# ("Principle" -> "Principal").
fig, ax = plt.subplots()
ax.bar(range(pca.n_components_), pca.explained_variance_ratio_, color='mediumseagreen')
ax.set_xlabel('Principal component')
ax.set_ylabel('Explained variance ratio')
ax.set_title('Explained variance per principal component')
ax.set_xticks(range(pca.n_components_));