In [31]:
import numpy as np
import pandas as pd
import itertools
from collections import Counter
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

In [32]:
def most_similar(factors, ingredient, topn=10):
    if ingredient not in key_to_row:
        print("Unknown ingredient.")
    factor = factors[key_to_row[ingredient]]
    cosines = cosine_similarity([factor], factors)[0]
    indices = cosines.argsort()[::-1][:topn + 1]
    keys = [row_keys[idx] for idx in indices if idx != key_to_row[ingredient]]
    return keys, cosines[indices]

def display_most_similar(factors, ingredient, topn=10):
    print("- Most similar to '{}'".format(ingredient))
    for similar_ing, score in zip(*most_similar(ingredient, topn)):
        print("  . {} : {:.2f}".format(similar_ing, score))    

In [33]:
def metric_1(user_item_matrix, model_config=[10,"cosine", None, 10]):
    """
    Returns the percentage of recipes for which the missing ingredient
    is among the top-10 recommended ingredients. (Mean Precision @ 10)
    """
    
    missing_ingredients, recommendations = held_out_recommendation(user_item_matrix, model_config)
    
    matches = [1 for i in range(len(missing_ingredients)) if missing_ingredients[i] in recommendations[i][:10]]

    return len(matches)/len(missing_ingredients)

def metric_2(user_item_matrix, model_config=[10,"cosine", None, 10]):
    """
    Mean rank of the missing ingredients in the list of recommended ingredients.
    """
    
    missing_ingredients, recommendations = held_out_recommendation(user_item_matrix, model_config)

    ranks = [np.where(missing_ingredients[i] == recommendations[i])[0][0]
             for i in range(len(missing_ingredients)) if missing_ingredients[i] in recommendations[i]]

    return sum(ranks)/len(ranks)

def metric_3(user_item_matrix, model_config=[10,"cosine", None, 10]):
    """
    Median rank of the missing ingredients in the list of recommended ingredients.
    """
    
    missing_ingredients, recommendations = held_out_recommendation(user_item_matrix, model_config)

    ranks = [np.where(missing_ingredients[i] == recommendations[i])[0][0]
             for i in range(len(missing_ingredients)) if missing_ingredients[i] in recommendations[i]]

    return statistics.median(sorted(ranks))


# Examples        
#print(metric_1(X_train.head(1000)))
#print(metric_2(X_train.head(1000)))
#print(metric_3(X_train.head(1000)))

In [34]:
# Load the string data
# data = pd.concat([pd.read_json('data/train.json'), pd.read_json('data/test.json')]).reset_index()

data = pd.read_json('data/train.json')

## Ingredient co-occurrences.

The number of recipes in which two ingredients occurred together is an intuitive measure of how common it is to see two ingredients mixed up. It is possible to factorize this matrix directly, but it has limitations (see below).

In [35]:
# Calculating ingredient counts and co-occurrences in recipes.
cooc_counts = Counter()
ing_count  = Counter()
for ingredients in data.ingredients:
    for ing in ingredients:
        ing_count[ing] += 1 # Count the number of occurences of any ingredient
    for (ing_a, ing_b) in itertools.combinations(set(ingredients), 2):
        if ing_a > ing_b: # Make sure we added pairs in a consistent order (a < b)
            ing_a, ing_b = ing_b, ing_a
        cooc_counts[(ing_a, ing_b)] += 1 # Count the co-occurences

In [36]:
# Example
cooc = pd.DataFrame(((ing_a, ing_b, ing_count[ing_a], ing_count[ing_b], cooc) for (ing_a, ing_b), cooc in cooc_counts.items()), columns=['a', 'b', 'a_count', 'b_count', 'cooc'])
cooc.sample(10)

Unnamed: 0,a,b,a_count,b_count,cooc
425210,halloumi cheese,salt,3,18049,1
323085,king prawns,olive oil,9,7972,1
37697,blue corn tortilla chips,jalapeno chilies,3,1730,2
426632,fresh basil,monterey jack,1137,235,1
368310,blanched almonds,plums,93,66,1
189945,dandelion greens,napa cabbage,8,226,1
357458,noodles,soba,225,31,1
405584,cauliflower,medium shrimp,221,461,1
218675,chopped walnuts,stilton cheese,128,18,1
441418,honey,old-fashioned oats,1299,14,1


In [37]:
cooc[cooc.a == 'chillies'].sort_values('cooc', ascending=False).head(10) # Ingredients with which chillies occur most often.
# Elements with the highest co-occurrence count are not necessarily very similar, just overall very popular ingredients.

Unnamed: 0,a,b,a_count,b_count,cooc
3166,chillies,salt,121,18049,54
3169,chillies,garlic,121,7380,50
3172,chillies,onions,121,7972,48
3168,chillies,ginger,121,1755,40
19403,chillies,vegetable oil,121,4385,28
3134,chillies,water,121,7457,28
58926,chillies,garam masala,121,925,28
21189,chillies,coriander,121,458,27
81006,chillies,garlic cloves,121,6237,26
3170,chillies,tumeric,121,738,24


## Matrix factorization based on co-occurences

In [38]:
# Since the matrix is symetric, we add the same values for (b,a) as we have for (a,b)
data_df_t = cooc.copy()
cooc.a, cooc.b = cooc.b, cooc.a
cooc = pd.concat([cooc, data_df_t])

rows_idx, row_keys = pd.factorize(cooc.a)
cols_idx, col_keys = pd.factorize(cooc.b)
values = cooc.cooc

matrix_cooc = csr_matrix((values, (rows_idx, cols_idx)))
key_to_row = {key: idx for idx, key in enumerate(row_keys)}

In [39]:
svd = TruncatedSVD(200)
factors_cooc = svd.fit_transform(matrix_cooc)

In [41]:
most_similar(factors_cooc, "chicken")

(['chicken breasts',
  'chicken thighs',
  'chicken legs',
  'chicken pieces',
  'cayenne',
  'bell pepper',
  'boneless skinless chicken breasts',
  'bay leaves',
  'boneless chicken breast',
  'bay leaf'],
 array([1.        , 0.95370661, 0.95324395, 0.93406003, 0.93202486,
        0.93033817, 0.92948365, 0.92761332, 0.92284256, 0.9168282 ,
        0.91502648]))

## Matrix Factorization (SVD)



There's an issue with the approach we used previously: we are only leveraging direct correlations between ingredients (say, the fact that there are 15 recipes with both `sushi rice` and `wasabi`) and not using all the knowledge that can be extracted from more subtle correlations, which is particularly useful for less popular items.
Example: There might not be many recipes between some particular type of Mexican pepper and `corn tortillas`, but since that pepper appears with other ingredients similar to a `tortilla`, we would expect it to be similar to `corn tortillas`.

One solution to this *sparseness* problem (the fact that most pairs of ingredients have little to no co-occurrences) is to use [Matrix Factorization](https://en.wikipedia.org/wiki/Matrix_decomposition)).

- First, we create a matrix where rows and columns represent ingredients, and the values are the PMI of a pair of ingredients. (you might also use a binary co-occurrence signal, e.g 1 if there's any recipe with both ingredients, 0 otherwise; or use the raw number of co-occurrences, but PMI makes more sense in our case)
- We factorize this matrix: You can think of it as "compressing" our matrix from a large but sparse NxN matrix, where N is the number of ingredients, to a smaller but dense NxK matrix, where K is a number that we choose (hereset to 120 as it gave decent results).

Matrix factorization is helpful because it _generalizes_ the knowledge we have about ingredients, and removes noise and redundancies in the data. The output of this step is a vector representing each ingredient, vectors that we can compare to each other using various similarity metrics. Given that in this case, the most popular ingredients will have larger vectors, we prefer [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity) since it is not biased by a vector's norm.

## Pointwise Mutual Information

The issue with raw counts (co-occurences affected by the overall popularity of an ingredient). Solution -> [PMI](https://en.wikipedia.org/wiki/Pointwise_mutual_information):

$$PMI(A, B) = log \frac{P(A, B)}{P(A) \times P(B)}$$

In [42]:
# P(A) is counts(A) / num_recipes
# P(A, B) is coocs(A, B) / sum(coocs)
p_a = cooc.a_count / sum(ing_count.values())
p_b = cooc.b_count / len(ing_count.values())
p_a_b = cooc.cooc / cooc.cooc.sum()
cooc['pmi'] = np.log(p_a_b / (p_a * p_b))

todo: 
- remove rare ingridients (noisy PMI value, we didn't see them in enough contexts to give enough support to the "lift ratio" that PMI is)
- unigrams/bigrams (beacause vinegar/red vinegar etc should be treated similarly)

In [43]:
cooc.sort_values('pmi', ascending=False).head(10)

Unnamed: 0,a,b,a_count,b_count,cooc,pmi
275252,Ragu Classic Alfredo Sauce,Ragu Golden Veggie Fettuccine Pasta,1,1,1,6.361983
88191,soft-shell clams,soft-shell clams,1,1,1,6.361983
221228,white creme de cacao,white creme de cacao,1,1,1,6.361983
124236,Hawaiian salt,raw buckwheat groats,1,1,1,6.361983
120741,old fashioned stone ground grits,old fashioned stone ground grits,1,1,1,6.361983
200557,dumpling dough,red vinegar,1,1,1,6.361983
124242,sliced mango,sliced mango,1,1,1,6.361983
277095,Flora Original,Knorr Chicken Stock Cubes,1,1,1,6.361983
124236,raw buckwheat groats,raw buckwheat groats,1,1,1,6.361983
202774,San Marzano Diced Tomatoes,royal olives,1,1,1,6.361983


For low values of `min_count`, we get some very peculiar pairs which are likely due to the recipes being from the same website that has some advertising partnership with the brands mentionned.

In [21]:
min_count = 5
cooc[(cooc.a_count >= min_count) & (cooc.b_count >= min_count)].sort_values('pmi', ascending=False).head(20)

Unnamed: 0,a,b,a_count,b_count,cooc,pmi
172118,Herdez Salsa Verde,Herdez Salsa Verde,5,6,3,4.059398
172118,Herdez Salsa Casera,Herdez Salsa Verde,5,6,3,4.059398
208021,porridge oats,porridge oats,7,5,3,3.905247
208021,black treacle,porridge oats,7,5,3,3.905247
400930,stone flower,stone flower,6,6,3,3.877076
400930,kewra water,stone flower,6,6,3,3.877076
134551,sazon seasoning,sofrito,5,5,2,3.836254
134551,sofrito,sofrito,5,5,2,3.836254
215356,bertolli vineyard premium collect marinara wit...,"clams, well scrub",5,6,2,3.653933
116920,Red Gold® diced tomatoes,Red Gold® diced tomatoes,6,5,2,3.653933


In [31]:
min_count = 30
cooc[(cooc.a_count >= min_count) & (cooc.b_count >= min_count)].sort_values('pmi', ascending=False).head(20)

# real correlations start to appear from the count 30

Unnamed: 0,a,b,a_count,b_count,cooc,pmi
35259,mexican chocolate,pasilla chiles,41,38,12,2.248946
171967,gari,wasabi,57,34,13,2.110735
69469,juniper berries,sauerkraut,30,40,7,1.971031
14628,brown cardamom,green cardamom,55,107,34,1.961396
217884,cilantro root,shrimp paste,30,67,11,1.907203
62582,bonito flakes,konbu,52,85,22,1.812345
84108,asafoetida powder,fresh curry leaves,45,91,20,1.793408
62629,dried bonito flakes,konbu,49,85,20,1.776459
109258,sushi rice,wasabi,97,34,15,1.722177
70840,Niçoise olives,tuna steaks,30,52,7,1.708667


- filter out pairs with only one co-oc 
- we can also see which recipes are most unlikely to occur together

In [32]:
min_count = 30
cooc[(cooc.a_count >= min_count) & (cooc.b_count >= min_count) & (cooc.cooc > 1)].sort_values('pmi', ascending=True).head(20)

Unnamed: 0,a,b,a_count,b_count,cooc,pmi
410770,green onions,vanilla extract,3817,1626,2,-7.832753
410784,pepper,vanilla extract,5508,1626,3,-7.794025
272217,dried oregano,ginger,2163,2190,2,-7.562563
271843,fresh ginger,grated parmesan cheese,1846,2367,2,-7.48181
311190,garlic,powdered sugar,9171,616,3,-7.333238
124872,chili powder,dry white wine,2519,1492,2,-7.331145
478207,cucumber,unsalted butter,970,3474,2,-7.222013
417842,minced garlic,vanilla extract,2001,1626,2,-7.186936
105289,grated parmesan cheese,soy sauce,2367,4120,6,-7.18603
417630,soy sauce,whipping cream,4120,779,2,-7.173274


## Matrix factorization - PMI

In [48]:
data_df = cooc[cooc.pmi > 0].copy()

# Since the matrix is symetric, we add the same values for (b,a) as we have for (a,b)
data_df_t = data_df.copy()
data_df.a, data_df.b = data_df.b, data_df.a
data_df = pd.concat([data_df, data_df_t])

rows_idx, row_keys = pd.factorize(data_df.a)
cols_idx, col_keys = pd.factorize(data_df.b)
values = data_df.pmi

matrix = csr_matrix((values, (rows_idx, cols_idx)))
key_to_row = {key: idx for idx, key in enumerate(row_keys)}

In [49]:
svd = TruncatedSVD(200)
factors = svd.fit_transform(matrix)

In [None]:
display_most_similar(factors, 'chicken')

In [45]:
display_most_similar('harissa')

- Most similar to 'harissa'
  . couscous : 1.00
  . dried mint flakes : 0.98
  . chapati : 0.98
  . pitas : 0.96
  . bulgur wheat : 0.96
  . quinoa : 0.95
  . chickpea flour : 0.95
  . roti : 0.95
  . gouda : 0.94
  . feta cheese crumbles : 0.92


In [262]:
display_most_similar('rice noodles')

- Most similar to 'rice noodles'
  . organic low sodium chicken broth : 1.00
  . laksa paste : 0.97
  . good seasons italian dressing mix : 0.70
  . fillet red snapper : 0.64
  . egg noodles : 0.64
  . garland chrysanthemum : 0.63
  . hoisin sauce : 0.62
  . cabbage head : 0.62
  . beansprouts : 0.59
  . low sodium parmesan cheese : 0.58


In [263]:
display_most_similar('pork')

- Most similar to 'pork'
  . cabbage head : 1.00
  . kettle chips : 0.93
  . napa cabbage leaves : 0.82
  . pancit : 0.76
  . egg noodles : 0.74
  . good seasons italian dressing mix : 0.73
  . fillet red snapper : 0.69
  . garland chrysanthemum : 0.69
  . low sodium parmesan cheese : 0.67
  . reduced sodium smoked ham : 0.66


In [267]:
display_most_similar('vanilla')

- Most similar to 'vanilla'
  . poppy seed filling : 1.00
  . pandan essence : 0.99
  . raw milk : 0.97
  . sugar pearls : 0.95
  . hot cross buns : 0.93
  . cassis : 0.92
  . instant oats : 0.87
  . oil of orange : 0.68
  . refined sugar : 0.64
  . mint extract : 0.58


In [269]:
display_most_similar('whipped cream')

- Most similar to 'whipped cream'
  . hot fudge topping : 1.00
  . amarena cherries : 1.00
  . fast rising yeast : 1.00
  . gingersnap cookies : 1.00
  . meyer lemon peel : 1.00
  . ibarra : 1.00
  . blackstrap molasses : 0.99
  . chestnut purée : 0.98
  . gingersnap cookie crumbs : 0.98
  . marshmallow vodka : 0.95


In [268]:
display_most_similar('buffalo mozarella')

- Most similar to 'buffalo mozarella'
  . hot pepperoni : 1.00
  . stonefire italian thin pizza crust : 1.00
  . smoked gouda : 0.95
  . new york style panetini® toasts : 0.91
  . jarlsberg : 0.90
  . sweet yellow corn : 0.89
  . crumbled cornbread : 0.85
  . cheese ravioli : 0.82
  . bbq seasoning : 0.78
  . smoked cheddar cheese : 0.75
