In [1]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sklearn
import seaborn as sns
import itertools
from sklearn.decomposition import PCA

from sklearn.model_selection             import train_test_split
from recommendation_evaluation_functions import recommend_ingredients, held_out_recommendation, calculate_metrics

np.seterr(divide='ignore', invalid='ignore')
%matplotlib inline

In [2]:
# Load data.
X        = pd.read_csv('train_dataset.csv', sep = '\t', index_col = 0).drop('cuisine', axis = 1)
cuisines = pd.read_csv('train_dataset.csv', sep = '\t', usecols = ['cuisine'])
cuisines.set_index(X.index, inplace = True)

In [4]:
# Apply CF to original recipes.
recommendations = recommend_ingredients(X.head(1050), X.head(1050), similarity_measure = "jaccard")

In [5]:
# Print some recipe + recommendation samples.
recipe1 = 7
recipe2 = 10

print('Recipe {} ({}): \n{}\n'.format(X.index[recipe1], 
                                      cuisines.iloc[recipe1, 0],
                                      X.columns[np.where(X.iloc[recipe1,] != 0)]))
print('Recommended ingredients: \n{}\n\n'.format(recommendations[recipe1,]))

print('Recipe {} ({}): \n{}\n'.format(X.index[recipe2], 
                                      cuisines.iloc[recipe2, 0],
                                      X.columns[np.where(X.iloc[recipe2,] != 0)]))
print('Recommended ingredients: \n{}\n\n'.format(recommendations[recipe2,]))

Recipe 16903 (mexican): 
Index(['cheddar_cheese', 'jalapeno_chilies', 'lettuce', 'lime', 'pork',
       'purple_onion', 'peppers', 'olive', 'cilantro_chopped_fresh',
       'pepper_ground_black', 'tortillas_corn'],
      dtype='object')

Recommended ingredients: 
Index(['tomatoes', 'garlic', 'cheese', 'tortilla', 'kosher_salt',
       'garlic_cloves', 'avocado', 'cilantro_chopped', 'onions', 'chile'],
      dtype='object')


Recipe 45887 (chinese): 
Index(['canola', 'garlic', 'mustard', 'scallions', 'sesame', 'sugar', 'wine',
       'starch_corn', 'crushed_pepper_flakes', 'ginger_fresh', 'beans_green',
       'sodium_sauce', 'pepper_white'],
      dtype='object')

Recommended ingredients: 
Index(['vinegar_rice', 'onions_green', 'rice_wine', 'ginger', 'dry_sherry',
       'pork', 'vegetable', 'sesame_seeds', 'mayonaise', 'juice_lemon'],
      dtype='object')




### Evaluation

**Split the data into train and test set**

In [6]:
X_train, X_test = train_test_split(X, test_size = 0.5, random_state = 1)

### Tune alpha parameter cosine similarity

In [12]:
alpha_grid = np.linspace(0.05,0.35,7)

results_alpha_PCA = pd.DataFrame(columns = ["k", "similarity_measure", "top10_presence", "mean_rank", 
                                            "median_rank"])

user_item_matrix  = X.head(1400)
# Fix number of neighbours to 50 and use PCA to evaluate (roughly) the optimal value of alpha
model_config = [50, "asymmetric_cosine", len(X.columns) - 1]

for alpha in alpha_grid:   
    
    missing_ingredients, recommendations = held_out_recommendation(user_item_matrix, model_config, usePCA = True,
                                                                   alpha = alpha)
    
    results_alpha_PCA = results_alpha_PCA.append(calculate_metrics(missing_ingredients, recommendations, 
                                                                   model_config), ignore_index = True)

In [13]:
results_alpha_PCA

Unnamed: 0,k,similarity_measure,top10_presence,mean_rank,median_rank
0,50,asymmetric_cosine,0.542857,27.2043,8


### Without PCA

In [None]:
results = pd.DataFrame(columns = ["k", "similarity_measure", "top10_presence", "mean_rank", "median_rank"])

k_grid = [5, 10, 20, 50, 100]
#k_grid = [5]
similarity_measures = ["cosine"]#, "asymmetric_cosine", "jaccard", "pmi"]
n_recommendations = len(X.columns) - 1

user_item_matrix = X.head(1400)
#user_item_matrix = X_train

for k, sim in itertools.product(k_grid, similarity_measures):
    
    model_config = [k, sim, n_recommendations]
    missing_ingredients, recommendations = held_out_recommendation(user_item_matrix, model_config)
    
    results = results.append(calculate_metrics(missing_ingredients, recommendations, model_config), 
                             ignore_index = True)

In [None]:
results

In [None]:
results.to_csv('results/parameter_settings_train.csv', sep = '\t')

### Same settings, with PCA

In [None]:
results_PCA = pd.DataFrame(columns = ["k", "similarity_measure", "top10_presence", "mean_rank", "median_rank"])#

#k_grid = [5, 10, 20, 50, 100]
k_grid = [50]
similarity_measures = ["cosine"]
n_recommendations = len(X.columns) - 1
user_item_matrix = X.head(1400)

for k, sim in itertools.product(k_grid, similarity_measures):
    
    model_config = [k, sim, n_recommendations]
    missing_ingredients, recommendations = held_out_recommendation(user_item_matrix, model_config, usePCA = True)
    
    results_PCA = results_PCA.append(calculate_metrics(missing_ingredients, recommendations, model_config), 
                             ignore_index = True)

In [None]:
results_PCA

In [None]:
results_PCA.to_csv('results/parameter_settings_train_PCA.csv', sep = '\t')