In [83]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sklearn
import seaborn as sns
import statistics


from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection  import train_test_split
from similarity_functions     import asymmetric_cosine, jaccard 

#from sklearn.metrics import jaccard_similarity_score
# from composes.transformation.scaling.ppmi_weighting import PpmiWeighting

%matplotlib inline

In [48]:
# Load data.
X_train = pd.read_csv('train_dataset.csv', sep = '\t', index_col = 0).drop('cuisine', axis = 1)
cuisines = pd.read_csv('train_dataset.csv', sep = '\t', usecols = ['cuisine'])
cuisines.set_index(X_train.index, inplace = True)

In [49]:
# Calculate cosine similarities.
sims = cosine_similarity(X_train.T)
np.fill_diagonal(sims, 0)

In [50]:
def recommend_ingredients(partial_recipes, user_item_matrix, k = 10, similarity_measure = "cosine", 
                          similarity_matrix = None, n_recommendations = 10, alpha = 0.2):
    """Recommend ingredients to (partial) recipes based on the similarity between ingredients.
    
    Inputs:
        - partial_recipes: pandas dataframe of recipes that ingredient recommendations are produced for. Should be
                           of the shape recipes x ingredients.
        - user_item_matrix: pandas dataframe of training recipes. Should be of the shape recipes x ingredients.
        - k: number of neighbours (ingredients) used when calculating the ingredient recommendation scores.
        - similarity_measure: the measure used for calculating the similarity between ingredients. One of
                              'cosine', 'asymmetric_cosine', 'jaccard', 'pmi'.
        - similarity_matrix: the precomputed matrix of ingredient similarities. If not given, this will be
                             computed by the function.
        - n_recommendations: the desired number of recommended ingredients per recipe.
        
    Outputs a matrix of the recommended ingredients (columns) for the given partial recipes (rows).
        
    """
    
    # Calculate the similarity matrix if none was given as input.
    if similarity_matrix == None:
        
        if similarity_measure == "cosine":
            similarity_matrix = cosine_similarity(user_item_matrix.T)
            
        elif similarity_measure == "asymmetric_cosine":
            similarity_matrix = asymmetric_cosine(user_item_matrix, alpha)
            
        elif similarity_measure == "jaccard":
            similarity_matrix = jaccard(user_item_matrix)
            
        elif similarity_measure == "pmi":
            pass
        
        else: 
            raise ValueError("The similarity measure must be one of: 'cosine', 
                             'asymmetric_cosine', 'jaccard', 'pmi'.")
    
    # Set similarity to self to zero.
    np.fill_diagonal(similarity_matrix, 0) 
    
    # For each ingredient, find the ingredients that are not among the k most similar and set similarity to zero.
    for i in range(np.shape(similarity_matrix)[0]):
        not_kNN = similarity_matrix[i, ] < sims[i, np.argpartition(sims[i, ], -k)[-k]]
        similarity_matrix[i, not_kNN] = 0

    # Calculate the ingredient scores.
    ingredient_scores = np.matmul(similarity_matrix, partial_recipes.T) / 
                             np.sum(abs(similarity_matrix), axis = 1)[:, None]
    ingredient_scores = ingredient_scores.T
    
    # Set ingredient scores of already present ingredients to zero.
    ingredient_scores[partial_recipes == 1] = 0
    
    # For each recipe, get the indices of the *n_recommendations* highest-scoring ingredients.
    recommendations_idx = np.argpartition(ingredient_scores, -n_recommendations).T[-n_recommendations:].T
    
    # Convert recommendation indices to ingredient names.
    recommendations = user_item_matrix.columns[recommendations_idx]
    
    return recommendations

In [51]:
# Apply CF with cosine similarity to original recipes.
recommendations = recommend_ingredients(X_train, X_train)

In [6]:
# Print some recipe + recommendation samples.
recipe1 = 7
recipe2 = 4
recipe3 = 6

print('Recipe {} ({}): \n{}\n'.format(X_train.index[recipe1], 
                                      cuisines.iloc[recipe1, 0],
                                      X_train.columns[np.where(X_train.iloc[recipe1,] != 0)]))
print('Recommended ingredients: \n{}\n\n'.format(recommendations[recipe1,]))

print('Recipe {} ({}): \n{}\n'.format(X_train.index[recipe2], 
                                      cuisines.iloc[recipe2, 0],
                                      X_train.columns[np.where(X_train.iloc[recipe2,] != 0)]))
print('Recommended ingredients: \n{}\n\n'.format(recommendations[recipe2,]))

print('Recipe {} ({}): \n{}\n'.format(X_train.index[recipe3],
                                      cuisines.iloc[recipe3, 0],
                                      X_train.columns[np.where(X_train.iloc[recipe3,] != 0)]))
print('Recommended ingredients: \n{}\n\n'.format(recommendations[recipe3,]))

Recipe 3735 (italian): 
Index(['bakingpowder', 'eggs', 'flour', 'oliveoil', 'sugar', 'vanillaextract'], dtype='object')

Recommended ingredients: 
Index(['buttermilk', 'purposeflour', 'unsaltedbutter', 'largeeggs', 'butter',
       'bakingsoda', 'granulatedsugar', 'whitesugar', 'milk',
       'groundcinnamon'],
      dtype='object')


Recipe 13162 (indian): 
Index(['blackpepper', 'butter', 'cayennepepper', 'chicken', 'chilipowder',
       'flour', 'garammasala', 'garlic', 'groundcumin', 'lemonjuice', 'milk',
       'oil', 'onions', 'salt', 'shallots', 'water'],
      dtype='object')

Recommended ingredients: 
Index(['cumin', 'carrots', 'vegetableoil', 'garlicpowder', 'groundturmeric',
       'chickenbroth', 'bayleaves', 'paprika', 'pepper', 'potatoes'],
      dtype='object')


Recipe 42779 (spanish): 
Index(['cilantro', 'flatleafparsley', 'garlic', 'jalapenochilies', 'oliveoil',
       'pepper', 'salt', 'seasalt', 'shrimp'],
      dtype='object')

Recommended ingredients: 
Index(['shal

### Evaluation

In [70]:
def held_out_recommendation(user_item_matrix, model_config=[10,"cosine", None, 10]):
    """
    Returns a list of held out ingredients and a list of corresponding recommendations
    """
    held_out_ingredients = []
    recommendations = []

    for index, row in user_item_matrix.iterrows():
        # Current training data: exclude the recipe tested
        X_curr = user_item_matrix.copy()
        X_curr.drop(index, inplace=True)
        
        # Current testing example: remove one ingredient
        recipe = row.copy()
        ing = recipe[recipe==1].sample(axis=0).index.values[0]
        recipe[ing] = 0
        
        # Model tested
        k = model_config[0]
        similarity_measure = model_config[1]
        similarity_matrix = model_config[2]
        n_recommendations = model_config[3]
        
        # Get recommendations
        recommendation = recommend_ingredients(recipe, X_curr, k, similarity_measure, similarity_matrix, n_recommendations)[0]
        
        # Store the removed ingredient and corresponding recommendations
        held_out_ingredients.append(ing)
        recommendations.append(recommendation)
        
    return (held_out_ingredients, recommendations)

# Example
held_out_recommendation(X_train.head())
    



(['lettuce', 'thyme', 'powder_garlic', 'leaf', 'ginger_fresh'],
 [array(['mustard_seeds', 'orange', 'linguine', 'mozzarella_cheese',
         'monterey_jack', 'mirin', 'mint', 'margarine', 'mango',
         'pepper_yellow_bell'], dtype=object),
  array(['noodles', 'orange_juice', 'mango', 'mushrooms',
         'mozzarella_cheese', 'monterey_jack', 'mirin', 'mint', 'margarine',
         'pepper_yellow_bell'], dtype=object),
  array(['mustard', 'olives', 'linguine', 'monterey_jack', 'mirin', 'mint',
         'mayonaise', 'margarine', 'mango', 'pepper_yellow_bell'],
        dtype=object),
  array(['mustard', 'okra', 'linguine', 'monterey_jack', 'mirin', 'mint',
         'masala', 'margarine', 'mango', 'pepper_yellow_bell'], dtype=object),
  array(['noodles', 'orange_juice', 'mango', 'mushrooms',
         'mozzarella_cheese', 'monterey_jack', 'mirin', 'mint', 'margarine',
         'pepper_yellow_bell'], dtype=object)])

In [95]:
def metric_1(user_item_matrix, model_config=[10,"cosine", None, 10]):
    """
    Returns the percentage of recipes for which the missing ingredient
    is among the top-10 recommended ingredients. (Mean Precision @ 10)
    """
    
    missing_ingredients, recommendations = held_out_recommendation(user_item_matrix, model_config)
    
    matches = [1 for i in range(len(missing_ingredients)) if missing_ingredients[i] in recommendations[i][:10]]

    return len(matches)/len(missing_ingredients)

def metric_2(user_item_matrix, model_config=[10,"cosine", None, 10]):
    """
    Mean rank of the missing ingredients in the list of recommended ingredients.
    """
    
    missing_ingredients, recommendations = held_out_recommendation(user_item_matrix, model_config)

    ranks = [np.where(missing_ingredients[i] == recommendations[i])[0][0]
             for i in range(len(missing_ingredients)) if missing_ingredients[i] in recommendations[i]]

    return sum(ranks)/len(ranks)

def metric_3(user_item_matrix, model_config=[10,"cosine", None, 10]):
    """
    Median rank of the missing ingredients in the list of recommended ingredients.
    """
    
    missing_ingredients, recommendations = held_out_recommendation(user_item_matrix, model_config)

    ranks = [np.where(missing_ingredients[i] == recommendations[i])[0][0]
             for i in range(len(missing_ingredients)) if missing_ingredients[i] in recommendations[i]]

    return statistics.median(sorted(ranks))


# Examples        
#print(metric_1(X_train.head(1000)))
#print(metric_2(X_train.head(1000)))
#print(metric_3(X_train.head(1000)))



4.5
3.5
