In [5]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sklearn
import seaborn as sns
import itertools
from sklearn.decomposition import PCA

from sklearn.model_selection             import train_test_split
from recommendation_evaluation_functions import recommend_ingredients, held_out_recommendation, calculate_metrics

%matplotlib inline

In [6]:
# Load data.
X        = pd.read_csv('train_dataset.csv', sep = '\t', index_col = 0).drop('cuisine', axis = 1)
cuisines = pd.read_csv('train_dataset.csv', sep = '\t', usecols = ['cuisine'])
cuisines.set_index(X.index, inplace = True)

## Evaluating a sample of recommendations

In [7]:
# Apply CF to original recipes.
recommendations = recommend_ingredients(X.head(50), X.head(50), similarity_measure = "jaccard", k = 5)

Some ingredients were not present in any recipe.


In [8]:
# Print some recipe + recommendation samples.
recipe1 = 7
recipe2 = 10

print('Recipe {} ({}): \n{}\n'.format(X.index[recipe1], 
                                      cuisines.iloc[recipe1, 0],
                                      X.columns[np.where(X.iloc[recipe1,] != 0)]))
print('Recommended ingredients: \n{}\n\n'.format(recommendations[5][recipe1,]))

print('Recipe {} ({}): \n{}\n'.format(X.index[recipe2], 
                                      cuisines.iloc[recipe2, 0],
                                      X.columns[np.where(X.iloc[recipe2,] != 0)]))
print('Recommended ingredients: \n{}\n\n'.format(recommendations[5][recipe2,]))

Recipe 16903 (mexican): 
Index(['cheddar_cheese', 'jalapeno_chilies', 'lettuce', 'lime', 'pork',
       'purple_onion', 'peppers', 'olive', 'cilantro_chopped_fresh',
       'pepper_ground_black', 'tortillas_corn'],
      dtype='object')

Recommended ingredients: 
Index(['cilantro_chopped', 'beans', 'olives_black', 'vinegar_white',
       'mushrooms', 'garlic', 'tomatoes', 'chicken_broth', 'sauce_fish',
       'pepper_green'],
      dtype='object')


Recipe 45887 (chinese): 
Index(['canola', 'garlic', 'mustard', 'scallions', 'sesame', 'sugar', 'wine',
       'starch_corn', 'crushed_pepper_flakes', 'ginger_fresh', 'beans_green',
       'sodium_sauce', 'pepper_white'],
      dtype='object')

Recommended ingredients: 
Index(['vegetable', 'pepper', 'paste_curry', 'sesame_seeds', 'olive',
       'carrots', 'ground_pork', 'pepper_green_bell', 'olives', 'capers'],
      dtype='object')




## Model evaluation

**Split the data into train and test set**

In [10]:
X_train, X_test = train_test_split(X, test_size = 0.9, random_state = 1)

### Tune alpha parameter cosine similarity

In [13]:
# Initialise dataframe of results
results_alpha_PCA = pd.DataFrame(columns = ["k", "similarity_measure", "top10_presence", "mean_rank", 
                                            "median_rank"])

alpha_grid        = np.linspace(0.05,0.35,7)
user_item_matrix  = X_train

# Fix number of neighbours to 50 and use PCA to evaluate (roughly) the optimal value of alpha
model_config = [50, "asymmetric_cosine", len(X.columns)]

for alpha in alpha_grid:   
    
    missing_ingredients, recommendations = held_out_recommendation(user_item_matrix, model_config, usePCA = True,
                                                                   alpha = alpha)  
    for k, recs in recommendations.items():
        results_alpha_PCA = results_alpha_PCA.append(calculate_metrics(missing_ingredients, recs, 
                                                                       model_config[0],  
                                                                       model_config[1]),ignore_index = True)

In [14]:
results_alpha_PCA

Unnamed: 0,k,similarity_measure,top10_presence,mean_rank,median_rank
0,[50],asymmetric_cosine,0.446438,34.5629,12
1,[50],asymmetric_cosine,0.448581,33.9783,12
2,[50],asymmetric_cosine,0.448313,33.7159,13
3,[50],asymmetric_cosine,0.441885,33.5102,13
4,[50],asymmetric_cosine,0.439743,33.3313,13
5,[50],asymmetric_cosine,0.442421,33.3803,13
6,[50],asymmetric_cosine,0.439743,33.2134,13


In [15]:
results_alpha_PCA.to_csv('results/parameter_settings_train_alpha.csv', sep = '\t')

### Without PCA

In [16]:
results = pd.DataFrame(columns = ["k", "similarity_measure", "top10_presence", "mean_rank", "median_rank"])

k_grid              = [5, 10, 20, 50, 100]
similarity_measures = ["cosine", "asymmetric_cosine", "jaccard", "pmi"]
n_recommendations   = len(X.columns)

for sim in similarity_measures:
    
    model_config = [k_grid, sim, n_recommendations]
    missing_ingredients, recommendations = held_out_recommendation(user_item_matrix, model_config)
    
    for k, recs in recommendations.items():
        results = results.append(calculate_metrics(missing_ingredients, recs, k, sim), 
                                 ignore_index = True)    

In [17]:
results

Unnamed: 0,k,similarity_measure,top10_presence,mean_rank,median_rank
0,5,cosine,0.255222,71.8642,39.0
1,10,cosine,0.289502,61.4191,30.0
2,20,cosine,0.316015,52.7769,26.0
3,50,cosine,0.314676,45.5766,23.0
4,100,cosine,0.287627,44.9799,25.0
5,5,asymmetric_cosine,0.254687,88.5402,77.0
6,10,asymmetric_cosine,0.280129,72.7287,32.0
7,20,asymmetric_cosine,0.301018,60.0378,27.0
8,50,asymmetric_cosine,0.316015,49.7852,25.0
9,100,asymmetric_cosine,0.28602,46.6403,25.0


In [18]:
results.to_csv('results/parameter_settings_train.csv', sep = '\t')

### Same settings, with PCA

In [19]:
results_PCA = pd.DataFrame(columns = ["k", "similarity_measure", "top10_presence", "mean_rank", "median_rank"])

for sim in similarity_measures:
    
    model_config = [k_grid, sim, n_recommendations]
    missing_ingredients, recommendations = held_out_recommendation(user_item_matrix, model_config, usePCA = True)
    
    for k, recs in recommendations.items():
        results_PCA = results_PCA.append(calculate_metrics(missing_ingredients, recs, k, sim), 
                            ignore_index = True)

In [20]:
results_PCA

Unnamed: 0,k,similarity_measure,top10_presence,mean_rank,median_rank
0,5,cosine,0.33744,68.3093,25.0
1,10,cosine,0.380557,54.1326,18.0
2,20,cosine,0.408945,41.6553,15.0
3,50,cosine,0.43278,33.2903,13.0
4,100,cosine,0.427424,32.0056,13.0
5,5,asymmetric_cosine,0.340118,77.6915,40.0
6,10,asymmetric_cosine,0.386181,59.0196,18.0
7,20,asymmetric_cosine,0.4218,44.2616,14.0
8,50,asymmetric_cosine,0.441885,33.5102,13.0
9,100,asymmetric_cosine,0.441618,31.545,12.0


In [21]:
results_PCA.to_csv('results/parameter_settings_train_PCA.csv', sep = '\t')