## Compute different similarity matrices

In [9]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sklearn
import seaborn as sns
from similarity_functions import asymmetric_cosine, jaccard 
from sklearn.metrics.pairwise import cosine_similarity

%matplotlib inline

**Load data**

In [4]:
# Load data.
# Parse the file once; the first column becomes the recipe index.
train_raw = pd.read_csv('train_dataset.csv', sep = '\t', index_col = 0)

# Feature matrix: every column except the 'cuisine' label.
X_train = train_raw.drop('cuisine', axis = 1)

# Labels as a one-column DataFrame that shares X_train's index (copied so that later edits
# to the labels never touch train_raw).
cuisines = train_raw[['cuisine']].copy()

## Cosine similarity

In [5]:
# Cosine similarity between the columns of X_train (one row/column per ingredient).
sims = cosine_similarity(X_train.transpose())

# Zero the diagonal in place so that no ingredient counts as similar to itself.
np.fill_diagonal(sims, val = 0)

# Preview the top rows of the similarity matrix.
pd.DataFrame(data = sims).head(n = 5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,301,302,303,304,305,306,307,308,309,310
0,0.0,0.165953,0.0,0.002949,0.003139,0.0,0.050012,0.0,0.0,0.004092,...,0.003395,0.047309,0.004942,0.013176,0.004458,0.004036,0.0,0.0,0.041097,0.006201
1,0.165953,0.0,0.012783,0.006616,0.079227,0.009064,0.285872,0.041299,0.00549,0.018747,...,0.026663,0.182676,0.049205,0.054512,0.032513,0.020944,0.025429,0.010258,0.084882,0.013913
2,0.0,0.012783,0.0,0.005582,0.007922,0.004079,0.0,0.0,0.021616,0.013772,...,0.017139,0.018908,0.0,0.008315,0.002813,0.005094,0.0,0.010652,0.004715,0.015652
3,0.002949,0.006616,0.005582,0.0,0.015825,0.008576,0.001831,0.0,0.012986,0.081082,...,0.003604,0.050218,0.023605,0.01049,0.024846,0.095331,0.00401,0.013438,0.003966,0.023036
4,0.003139,0.079227,0.007922,0.015825,0.0,0.014605,0.036047,0.006239,0.011058,0.054711,...,0.021097,0.039199,0.006979,0.007444,0.021409,0.015962,0.014227,0.007946,0.002111,0.007006


## Jaccard similarity

In [10]:
# Compute the Jaccard similarities with the project helper, then preview the top rows.
jacs = jaccard(X_train)
pd.DataFrame(data = jacs).head(n = 5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,301,302,303,304,305,306,307,308,309,310
0,0.0,0.045098,0.0,0.001222,0.001346,0.0,0.022925,0.0,0.0,0.001622,...,0.001667,0.012584,0.00246,0.006536,0.002181,0.001923,0.0,0.0,0.020073,0.003091
1,0.045098,0.0,0.002715,0.002741,0.032585,0.00207,0.120146,0.008437,0.00159,0.00814,...,0.005847,0.100511,0.014315,0.012361,0.01023,0.007087,0.007232,0.002646,0.017256,0.0033
2,0.0,0.002715,0.0,0.001999,0.002963,0.002037,0.0,0.0,0.010249,0.004676,...,0.008639,0.004064,0.0,0.004167,0.001276,0.002208,0.0,0.005226,0.002358,0.007828
3,0.001222,0.002741,0.001999,0.0,0.00796,0.003249,0.000905,0.0,0.005774,0.042121,...,0.001318,0.02133,0.010496,0.003927,0.011564,0.047568,0.001751,0.005556,0.001356,0.009015
4,0.001346,0.032585,0.002963,0.00796,0.0,0.005772,0.018272,0.002256,0.005051,0.027843,...,0.0081,0.015977,0.003169,0.002899,0.01018,0.007795,0.006427,0.003392,0.000754,0.002831


## Asymmetric cosine similarity

In [11]:
# Compute the asymmetric cosine similarities with the project helper, then preview the top rows.
asims = asymmetric_cosine(X_train)
pd.DataFrame(data = asims).head(n = 5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,301,302,303,304,305,306,307,308,309,310
0,0.0,0.077398,0.0,0.002016,0.002228,0.0,0.037572,0.0,0.0,0.002671,...,0.003829,0.022223,0.004606,0.014595,0.003907,0.003332,0.0,0.0,0.049096,0.006623
1,0.355828,0.0,0.031518,0.009701,0.120595,0.021281,0.460482,0.104848,0.010909,0.026233,...,0.064483,0.183989,0.098345,0.129468,0.061091,0.037075,0.051407,0.022162,0.217423,0.031864
2,0.0,0.005185,0.0,0.00332,0.004891,0.003884,0.0,0.0,0.017421,0.007817,...,0.016812,0.007724,0.0,0.00801,0.002144,0.003657,0.0,0.009334,0.004899,0.014539
3,0.004312,0.004513,0.009387,0.0,0.016429,0.013733,0.002011,0.0,0.017599,0.077386,...,0.005944,0.034497,0.032178,0.016992,0.031841,0.115095,0.005529,0.019801,0.006929,0.035984
4,0.004421,0.05205,0.012833,0.015243,0.0,0.022528,0.038147,0.010405,0.014435,0.050298,...,0.033521,0.025938,0.009164,0.011615,0.026428,0.018563,0.018896,0.011279,0.003552,0.010541


## Pointwise mutual information (PMI)

*TODO: the PMI similarity matrix is not computed in this notebook yet.*

## Create recommendations

In [None]:
def recommend_ingredients(partial_recipes, user_item_matrix, k = 10, similarity_measure = "cosine", 
                          similarity_matrix = None, n_recommendations = 10):
    """Recommend ingredients to (partial) recipes based on the similarity between ingredients.
    
    Inputs:
        - partial_recipes: pandas dataframe (or 2-D array) of recipes that ingredient recommendations are
                           produced for. Should be of the shape recipes x ingredients, with the ingredients
                           in the same order as the columns of user_item_matrix.
        - user_item_matrix: pandas dataframe of training recipes. Should be of the shape recipes x ingredients.
                            Its column labels are used as the ingredient names.
        - k: number of neighbours (ingredients) used when calculating the ingredient recommendation scores.
             Values larger than the number of ingredients are treated as "use all ingredients".
        - similarity_measure: the measure used for calculating the similarity between ingredients. One of
                              'cosine', 'asymmetric_cosine', 'jaccard', 'pmi'. Only used when
                              similarity_matrix is not given.
        - similarity_matrix: the precomputed matrix of ingredient similarities (ingredients x ingredients).
                             If not given, this will be computed by the function. The given matrix is never
                             modified; the function works on a copy.
        - n_recommendations: the desired number of recommended ingredients per recipe. Capped at the number
                             of ingredients.
        
    Outputs a numpy array of the recommended ingredient names (columns) for the given partial recipes (rows),
    ordered from the highest to the lowest score within each row.
    
    Raises:
        - ValueError: if k or n_recommendations is smaller than 1, if similarity_measure is unknown, or if
                      the similarity matrix is not of the shape ingredients x ingredients.
        - NotImplementedError: if similarity_measure is 'pmi' and no similarity matrix was given.
        
    """
    
    if k < 1:
        raise ValueError("k must be at least 1.")
    if n_recommendations < 1:
        raise ValueError("n_recommendations must be at least 1.")
    
    # Calculate the similarity matrix if none was given as input.
    # `is None` is required here: `== None` on an array compares element-wise and cannot be used in an `if`.
    if similarity_matrix is None:
        if similarity_measure == "cosine":
            similarity_matrix = cosine_similarity(user_item_matrix.T)
        elif similarity_measure == "asymmetric_cosine":
            # Called the same way as elsewhere in this notebook.
            # NOTE(review): assumes the helper returns an ingredients x ingredients matrix whose row i holds
            # the similarities used to score ingredient i - confirm the orientation.
            similarity_matrix = asymmetric_cosine(user_item_matrix)
        elif similarity_measure == "jaccard":
            # Called the same way as elsewhere in this notebook.
            similarity_matrix = jaccard(user_item_matrix)
        elif similarity_measure == "pmi":
            raise NotImplementedError("The 'pmi' similarity measure is not implemented yet; "
                                      "pass a precomputed similarity_matrix instead.")
        else: 
            raise ValueError("The similarity measure must be one of: 'cosine', 'asymmetric_cosine', 'jaccard', 'pmi'.")
    
    # Work on a float copy so that the caller's matrix is left untouched by the in-place steps below.
    similarity_matrix = np.array(similarity_matrix, dtype = float)
    
    n_ingredients = len(user_item_matrix.columns)
    if similarity_matrix.shape != (n_ingredients, n_ingredients):
        raise ValueError("The similarity matrix must be of the shape ingredients x ingredients.")
    
    # Set similarity to self to zero.
    np.fill_diagonal(similarity_matrix, 0) 
    
    # For each ingredient, find the ingredients that are not among the k most similar and set similarity to zero.
    # The threshold of a row is its k-th largest value; ties with the threshold are kept.
    n_neighbours = min(k, n_ingredients)
    thresholds = np.partition(similarity_matrix, -n_neighbours, axis = 1)[:, -n_neighbours]
    similarity_matrix[similarity_matrix < thresholds[:, None]] = 0

    # Calculate the ingredient scores: similarity-weighted sum over the ingredients present in each recipe,
    # normalised by the total absolute similarity of the scored ingredient.
    recipes = np.asarray(partial_recipes, dtype = float)
    normalisers = np.abs(similarity_matrix).sum(axis = 1)
    # An ingredient without any neighbour has an all-zero row; divide by 1 there so that its score is 0
    # instead of NaN (NaN would be ranked as the highest score by the sorting below).
    normalisers = np.where(normalisers > 0, normalisers, 1.0)
    ingredient_scores = (np.matmul(similarity_matrix, recipes.T) / normalisers[:, None]).T
    
    # Push already present ingredients to the very bottom of the ranking. Using -inf rather than 0 keeps
    # them below absent ingredients even when some scores are zero or negative.
    ingredient_scores[recipes != 0] = -np.inf
    
    # For each recipe, get the indices of the *n_recommendations* highest-scoring ingredients, best first.
    # The stable sort breaks ties by column order, which makes the output deterministic.
    n_top = min(n_recommendations, n_ingredients)
    recommendations_idx = np.argsort(-ingredient_scores, axis = 1, kind = "stable")[:, :n_top]
    
    # Convert recommendation indices to ingredient names. The column labels are converted to an array first
    # because a pandas Index cannot be indexed with a 2-D array in recent pandas versions.
    recommendations = np.asarray(user_item_matrix.columns)[recommendations_idx]
    
    return recommendations

In [None]:
# Run the collaborative filtering with its defaults (cosine similarity), using the training
# recipes both as the recipes to complete and as the training matrix.
recommendations = recommend_ingredients(partial_recipes = X_train, user_item_matrix = X_train)

In [None]:
# Show a few sample recipes next to the ingredients recommended for them.
recipe1 = 7
recipe2 = 4
recipe3 = 6

for sample_idx in (recipe1, recipe2, recipe3):
    # Column labels of the ingredients that are present (non-zero) in this recipe.
    present_ingredients = X_train.columns[np.where(X_train.iloc[sample_idx,] != 0)]
    recipe_header = 'Recipe {} ({}): \n{}\n'.format(X_train.index[sample_idx],
                                                    cuisines.iloc[sample_idx, 0],
                                                    present_ingredients)
    print(recipe_header)
    print('Recommended ingredients: \n{}\n\n'.format(recommendations[sample_idx,]))