In [15]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sklearn
import seaborn as sns
import statistics
import itertools

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection  import train_test_split
from similarity_functions     import asymmetric_cosine, jaccard 

np.seterr(divide='ignore', invalid='ignore')
%matplotlib inline

In [16]:
# Load data.
X = pd.read_csv('train_dataset.csv', sep = '\t', index_col = 0).drop('cuisine', axis = 1)
cuisines = pd.read_csv('train_dataset.csv', sep = '\t', usecols = ['cuisine'])
cuisines.set_index(X.index, inplace = True)

In [17]:
def recommend_ingredients(partial_recipes, user_item_matrix, k = 10, similarity_measure = "cosine", 
                          similarity_matrix = None, n_recommendations = 10, alpha = 0.2):
    """Recommend ingredients to (partial) recipes based on the similarity between ingredients.
    
    Inputs:
        - partial_recipes:    pandas dataframe of recipes that ingredient recommendations are produced for. Should be
                              of the shape recipes x ingredients.
                           
        - user_item_matrix:   pandas dataframe of training recipes. Should be of the shape recipes x ingredients.
        
        - k:                  number of neighbours (ingredients) used when calculating the ingredient 
                              recommendation scores.
        
        - similarity_measure: the measure used for calculating the similarity between ingredients. One of
                              'cosine', 'asymmetric_cosine', 'jaccard', 'pmi'.
                              
        - similarity_matrix:  the precomputed matrix of ingredient similarities. If not given, this will be
                              computed by the function.
                              
        - n_recommendations:  the desired number of recommended ingredients per recipe.
        
    Outputs a matrix of the recommended ingredients (columns) for the given partial recipes (rows).
        
    """
    
    # Calculate the similarity matrix if none was given as input.
    if np.all(similarity_matrix == None):
        
        if similarity_measure == "cosine":
            similarity_matrix = cosine_similarity(user_item_matrix.T)
            
        elif similarity_measure == "asymmetric_cosine":
            similarity_matrix = asymmetric_cosine(user_item_matrix, alpha)
            
        elif similarity_measure == "jaccard":
            similarity_matrix = jaccard(user_item_matrix)
            
        elif similarity_measure == "pmi":
            pass
        
        else: 
            raise ValueError("The similarity measure must be one of: 'cosine', 'asymmetric_cosine', 'jaccard', 'pmi'.")
    
    # Set similarity to self to zero.
    np.fill_diagonal(similarity_matrix, 0)     
    
    # For each ingredient, find the ingredients that are not among the k most similar and set similarity to zero.
    for i in range(np.shape(similarity_matrix)[0]):
        not_kNN = similarity_matrix[i, ] < similarity_matrix[i, np.argpartition(similarity_matrix[i, ], -k)[-k]]
        similarity_matrix[i, not_kNN] = 0

    # Calculate the ingredient scores.
    ingredient_scores = np.matmul(similarity_matrix, partial_recipes.T) / np.sum(abs(similarity_matrix), axis = 1)[:, None]
    ingredient_scores = ingredient_scores.T
    
    # Set ingredient scores of already present ingredients to zero.
    ingredient_scores[partial_recipes == 1] = 0
   
    # For each recipe, get the indices of the *n_recommendations* highest-scoring ingredients in order.
    recommendations_idx = np.argsort(-ingredient_scores, axis = 1)[:, :n_recommendations]
    
    # Convert recommendation indices to ingredient names.
    recommendations = user_item_matrix.columns[recommendations_idx]
    
    return recommendations

In [18]:
# Apply CF with cosine similarity to original recipes.
recommendations = recommend_ingredients(X.head(1050), X.head(1050), similarity_measure = "asymmetric_cosine")

In [19]:
# Print some recipe + recommendation samples.
recipe1 = 7
recipe2 = 6

print('Recipe {} ({}): \n{}\n'.format(X.index[recipe1], 
                                      cuisines.iloc[recipe1, 0],
                                      X.columns[np.where(X.iloc[recipe1,] != 0)]))
print('Recommended ingredients: \n{}\n\n'.format(recommendations[recipe1,]))

print('Recipe {} ({}): \n{}\n'.format(X.index[recipe2], 
                                      cuisines.iloc[recipe2, 0],
                                      X.columns[np.where(X.iloc[recipe2,] != 0)]))
print('Recommended ingredients: \n{}\n\n'.format(recommendations[recipe2,]))

Recipe 16903 (mexican): 
Index(['cheddar_cheese', 'jalapeno_chilies', 'lettuce', 'lime', 'pork',
       'purple_onion', 'peppers', 'olive', 'cilantro_chopped_fresh',
       'pepper_ground_black', 'tortillas_corn'],
      dtype='object')

Recommended ingredients: 
Index(['tomatoes', 'tortilla', 'beans', 'salsa', 'chile', 'avocado',
       'cilantro_chopped', 'beans_black', 'cream_sour', 'mango'],
      dtype='object')


Recipe 3735 (italian): 
Index(['eggs', 'flour', 'nuts', 'sugar', 'powder_baking', 'olive',
       'extract_vanilla'],
      dtype='object')

Recommended ingredients: 
Index(['milk', 'peaches', 'sugar_white', 'allpurpose_flour', 'cream',
       'baking_soda', 'butter', 'large_eggs', 'cornmeal', 'frozen_peas'],
      dtype='object')




### Evaluation

In [21]:
def held_out_recommendation(user_item_matrix, model_config=[10, "cosine", None, 10]):
    """
    Returns a list of held out ingredients and a list of corresponding recommendations
    """
    held_out_ingredients = []
    recommendations = []

    for index, row in user_item_matrix.iterrows():
        # Current training data: exclude the recipe tested
        X_curr = user_item_matrix.copy()
        X_curr.drop(index, inplace=True)
        
        # Current testing example: remove one ingredient
        recipe = row.copy()
        ing = recipe[recipe==1].sample(axis=0, random_state = 1).index.values[0]
        recipe[ing] = 0

        # Model tested
        k = model_config[0]
        similarity_measure = model_config[1]
        similarity_matrix = model_config[2]
        n_recommendations = model_config[3]
        
        # Get recommendations
        recommendation = recommend_ingredients(pd.DataFrame(recipe).T, X_curr, k, similarity_measure, similarity_matrix, n_recommendations)[0]
        
        # Store the removed ingredient and corresponding recommendations
        held_out_ingredients.append(ing)
        recommendations.append(recommendation)
        
    return (held_out_ingredients, recommendations)

# Example
held_out_recommendation(X.head())[1]
    

[array(['yellow_corn', 'vegetable', 'pepper_ground_black', 'pepper_ground',
        'thyme', 'powder_garlic', 'mayonaise', 'chilies_green',
        'yellow_onion', 'chicken'], dtype=object),
 array(['ground_ginger', 'ginger_fresh', 'cinnamon_ground', 'sugar',
        'powder_baking', 'extract_vanilla', 'butter', 'olives_black',
        'garlic', 'purple_onion'], dtype=object),
 array(['ground_ginger', 'ginger_fresh', 'cinnamon_ground', 'sugar',
        'powder_baking', 'extract_vanilla', 'olives_black', 'garlic',
        'purple_onion', 'seasoning'], dtype=object),
 array(['ginger_fresh', 'extract_vanilla', 'ground_ginger', 'sugar',
        'powder_baking', 'cinnamon_ground', 'eggs', 'vegetable', 'thyme',
        'yellow_corn'], dtype=object),
 array(['chilies_green', 'mayonaise', 'chicken_breasts', 'chicken',
        'powder_garlic', 'yellow_onion', 'yellow_corn', 'flour',
        'pepper_ground', 'thyme'], dtype=object)]

In [22]:
def metric_1(missing_ingredients, recommendations):
    """
    Returns the percentage of recipes for which the missing ingredient
    is among the top-10 recommended ingredients. (Mean Precision @ 10)
    """    
    matches = [1 for i in range(len(missing_ingredients)) if missing_ingredients[i] in recommendations[i][:10]]
    
    return len(matches)/len(missing_ingredients)


def metric_2(missing_ingredients, recommendations):
    """
    Mean rank of the missing ingredients in the list of recommended ingredients.
    """
    ranks = [np.where(missing_ingredients[i] == recommendations[i])[0][0]
             for i in range(len(missing_ingredients)) if missing_ingredients[i] in recommendations[i]]

    return sum(ranks)/len(ranks)


def metric_3(missing_ingredients, recommendations):
    """
    Median rank of the missing ingredients in the list of recommended ingredients.
    """
    ranks = [np.where(missing_ingredients[i] == recommendations[i])[0][0]
             for i in range(len(missing_ingredients)) if missing_ingredients[i] in recommendations[i]]

    return statistics.median(sorted(ranks))

In [23]:
def calculate_metrics(missing_ingredients, recommendations, model_config):
    """Calculate three evaluation metrics of recommendations made.
    
    Inputs:
        - missing_ingredients: list of the held-out ingredients.
        - recommendations: list of arrays with corresponding recommendations.
        - model_config: model settings used to make the recommendations.
        
    Outputs a dataframe with:
        - crucial model settings.
        - percentage of recipes for which the missing ingredient is among the top-10 recommended ingredients.
        - mean rank of the missing ingredients in the list of recommended ingredients.
        - median rank of the missing ingredients in the list of recommended ingredients.    
    """
    
    metrics = pd.DataFrame(columns = ["k", "similarity_measure", "top10_presence", "mean_rank", "median_rank"])
    metrics.loc[0, "k"]                  = model_config[0]
    metrics.loc[0, "similarity_measure"] = model_config[1]
    metrics.loc[0, "top10_presence"]     = metric_1(missing_ingredients, recommendations) 
    metrics.loc[0, "mean_rank"]          = metric_2(missing_ingredients, recommendations)
    metrics.loc[0, "median_rank"]        = metric_3(missing_ingredients, recommendations)
    
    return metrics

**Split the data into train and test set**

In [25]:
X_train, X_test = train_test_split(X, test_size = 0.5, random_state = 1)

In [26]:
n_recommendations = len(X.columns) - 1
results = pd.DataFrame(columns = ["k", "similarity_measure", "top10_presence", "mean_rank", "median_rank"])

k_grid = [5, 10, 20, 50, 100]
similarity_measures = ["cosine"] #, "asymmetric_cosine", "jaccard"]

user_item_matrix = X.head(1400)
# user_item_matrix = X_train

for k, sim in itertools.product(k_grid, similarity_measures):
    
    model_config = [k, sim, None, n_recommendations]
    missing_ingredients, recommendations = held_out_recommendation(user_item_matrix, model_config)
    
    results = results.append(calculate_metrics(missing_ingredients, recommendations, model_config), ignore_index = True)

In [27]:
results

Unnamed: 0,k,similarity_measure,top10_presence,mean_rank,median_rank
0,5,cosine,0.261429,77.9349,41
1,10,cosine,0.297857,65.4399,30
2,20,cosine,0.317143,56.9964,26
3,50,cosine,0.309286,52.4807,26
4,100,cosine,0.256429,51.8986,29


In [None]:
results.to_csv('results/parameter_settings_head1400.csv', sep = '\t')

### Parameter setting