In [42]:
import numpy as np
import pandas as pd
import itertools
from collections import Counter
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
data = pd.concat([pd.read_json('data/train.json'), pd.read_json('data/test.json')]).reset_index()

## Ingredient co-occurrences.

Calculate the number of recipes in which two ingredients occurred together. This intuitively gives us a measure of how common it is to see two ingredients mixed up.

In [23]:
# Calculating ingredient counts and co-occurrences in recipes.
cooc_counts = Counter()
ing_count  = Counter()
for ingredients in data.ingredients:
    for ing in ingredients:
        ing_count[ing] += 1
    for (ing_a, ing_b) in itertools.combinations(set(ingredients), 2):
        if ing_a > ing_b:
            ing_a, ing_b = ing_b, ing_a
        cooc_counts[(ing_a, ing_b)] += 1

In [24]:
cooc = pd.DataFrame(((ing_a, ing_b, ing_count[ing_a], ing_count[ing_b], cooc) for (ing_a, ing_b), cooc in cooc_counts.items()), columns=['a', 'b', 'a_count', 'b_count', 'cooc'])
cooc.sample(10)

Unnamed: 0,a,b,a_count,b_count,cooc
94653,chopped fresh chives,extra-virgin olive oil,279,3424,35
258668,cake,decorating sugars,52,13,3
83072,flour tortillas,rocket leaves,1076,58,1
123507,black pepper,rice paper,3291,93,4
468270,full-fat plain yogurt,ginger,3,2190,1
434561,cured pork,freshly ground pepper,4,1183,1
516659,cooking spray,pecans,1854,176,1
151498,chopped onion,star anise,1561,325,3
486659,fresh lemon juice,sesame,2105,33,1
380212,garlic salt,sazon seasoning,302,5,1


In [26]:
cooc[cooc.a == 'chillies'].sort_values('cooc', ascending=False).head(10) # Ingredients with which chillies occur most often.
# Elements with the highest co-occurrence count are not necessarily very similar, just overall very popular ingredients.

Unnamed: 0,a,b,a_count,b_count,cooc
3212,chillies,salt,148,22534,67
3202,chillies,onions,148,10008,63
3216,chillies,garlic,148,9171,59
3215,chillies,ginger,148,2190,47
58929,chillies,garam masala,148,1179,37
19377,chillies,vegetable oil,148,5516,36
3218,chillies,water,148,9293,32
3114,chillies,tumeric,148,933,31
21191,chillies,coriander,148,560,31
3163,chillies,tomatoes,148,3812,31


## Pointwise Mutual Information

The issue with raw counts (co-occurences affected by the overall popularity of an ingredient). Solution -> [PMI](https://en.wikipedia.org/wiki/Pointwise_mutual_information):

$$PMI(A, B) = log \frac{P(A, B)}{P(A) \times P(B)}$$

In [27]:
# P(A) is counts(A) / num_recipes
# P(A, B) is coocs(A, B) / sum(coocs)
p_a = cooc.a_count / sum(ing_count.values())
p_b = cooc.b_count / len(ing_count.values())
p_a_b = cooc.cooc / cooc.cooc.sum()
cooc['pmi'] = np.log(p_a_b / (p_a * p_b))

todo: 
- remove rare ingridients (noisy PMI value, we didn't see them in enough contexts to give enough support to the "lift ratio" that PMI is)
- unigrams/bigrams (beacause vinegar/red vinegar etc should be treated similarly)

In [28]:
cooc.sort_values('pmi', ascending=False).head(10)

Unnamed: 0,a,b,a_count,b_count,cooc,pmi
352549,flaked oats,malt,1,1,1,7.115198
303652,chinese jujubes,sweetened red beans,1,1,1,7.115198
181322,frozen basil,red wine vinaigrette,1,1,1,7.115198
387804,low-fat monterey jack,shredded low-fat cheddar,1,1,1,7.115198
478602,Kraft Slim Cut Mozzarella Cheese Slices,Oscar Mayer Cotto Salami,1,1,1,7.115198
25279,chinese hot mustard,soy marinade,1,1,1,7.115198
99019,low fat reduced sodium pasta sauce,whole wheat submarine loaves,1,1,1,7.115198
431639,elderflower cordial,sweet white wine,1,1,1,7.115198
275249,Ragu Classic Alfredo Sauce,Ragu Golden Veggie Fettuccine Pasta,1,1,1,7.115198
73982,knorr cilantro minicub,knorr parslei minicub,1,1,1,7.115198


For low values of `min_count`, we get some very peculiar pairs which are likely due to the recipes being from the same website that has some advertising partnership with the brands mentionned.

In [30]:
min_count = 5
cooc[(cooc.a_count >= min_count) & (cooc.b_count >= min_count)].sort_values('pmi', ascending=False).head(20)

Unnamed: 0,a,b,a_count,b_count,cooc,pmi
116918,Gourmet Garden garlic paste,Pompeian Canola Oil and Extra Virgin Olive Oil,5,6,5,5.323438
326899,buttermilk cornbread,muffin mix,6,5,4,5.100295
69423,brats,knockwurst,8,5,4,4.812613
134513,sazon seasoning,sofrito,5,5,2,4.589469
415912,KRAFT Shredded Cheddar Cheese,TACO BELL® Thick & Chunky Mild Salsa,5,5,2,4.589469
116909,Gourmet Garden garlic paste,Johnsonville Andouille,5,10,4,4.589469
172104,Herdez Salsa Casera,Herdez Salsa Verde,5,8,3,4.524931
208025,black treacle,porridge oats,8,5,3,4.524931
100504,Chinese rose wine,maltose,5,9,3,4.407148
344381,cola soft drink,cooked bone in ham,6,5,2,4.407148


In [31]:
min_count = 30
cooc[(cooc.a_count >= min_count) & (cooc.b_count >= min_count)].sort_values('pmi', ascending=False).head(20)

# real correlations start to appear from the count 30

Unnamed: 0,a,b,a_count,b_count,cooc,pmi
35259,mexican chocolate,pasilla chiles,41,38,12,2.248946
171967,gari,wasabi,57,34,13,2.110735
69469,juniper berries,sauerkraut,30,40,7,1.971031
14628,brown cardamom,green cardamom,55,107,34,1.961396
217884,cilantro root,shrimp paste,30,67,11,1.907203
62582,bonito flakes,konbu,52,85,22,1.812345
84108,asafoetida powder,fresh curry leaves,45,91,20,1.793408
62629,dried bonito flakes,konbu,49,85,20,1.776459
109258,sushi rice,wasabi,97,34,15,1.722177
70840,Niçoise olives,tuna steaks,30,52,7,1.708667


- filter out pairs with only one co-oc 
- we can also see which recipes are most unlikely to occur together

In [32]:
min_count = 30
cooc[(cooc.a_count >= min_count) & (cooc.b_count >= min_count) & (cooc.cooc > 1)].sort_values('pmi', ascending=True).head(20)

Unnamed: 0,a,b,a_count,b_count,cooc,pmi
410770,green onions,vanilla extract,3817,1626,2,-7.832753
410784,pepper,vanilla extract,5508,1626,3,-7.794025
272217,dried oregano,ginger,2163,2190,2,-7.562563
271843,fresh ginger,grated parmesan cheese,1846,2367,2,-7.48181
311190,garlic,powdered sugar,9171,616,3,-7.333238
124872,chili powder,dry white wine,2519,1492,2,-7.331145
478207,cucumber,unsalted butter,970,3474,2,-7.222013
417842,minced garlic,vanilla extract,2001,1626,2,-7.186936
105289,grated parmesan cheese,soy sauce,2367,4120,6,-7.18603
417630,soy sauce,whipping cream,4120,779,2,-7.173274


## Matrix Factorization (SVD)



There's an issue with the approach we used previously: we are only leveraging direct correlations between ingredients (say, the fact that there are 15 recipes with both `sushi rice` and `wasabi`) and not using all the knowledge that can be extracted from more subtle correlations, which is particularly useful for less popular items.
Example: There might not be many recipes between some particular type of Mexican pepper and `corn tortillas`, but since that pepper appears with other ingredients similar to a `tortilla`, we would expect it to be similar to `corn tortillas`.

One solution to this *sparseness* problem (the fact that most pairs of ingredients have little to no co-occurrences) is to use [Matrix Factorization](https://en.wikipedia.org/wiki/Matrix_decomposition)).

- First, we create a matrix where rows and columns represent ingredients, and the values are the PMI of a pair of ingredients. (you might also use a binary co-occurrence signal, e.g 1 if there's any recipe with both ingredients, 0 otherwise; or use the raw number of co-occurrences, but PMI makes more sense in our case)
- We factorize this matrix: You can think of it as "compressing" our matrix from a large but sparse NxN matrix, where N is the number of ingredients, to a smaller but dense NxK matrix, where K is a number that we choose (hereset to 120 as it gave decent results).

Matrix factorization is helpful because it _generalizes_ the knowledge we have about ingredients, and removes noise and redundancies in the data. The output of this step is a vector representing each ingredient, vectors that we can compare to each other using various similarity metrics. Given that in this case, the most popular ingredients will have larger vectors, we prefer [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity) since it is not biased by a vector's norm.

In [40]:
data_df = cooc[cooc.pmi > 0].copy()
# Since the matrix is symetric, we add the same values for (b,a) as we have for (a,b)
data_df_t = data_df.copy()
data_df.a, data_df.b = data_df.b, data_df.a
data_df = pd.concat([data_df, data_df_t])

rows_idx, row_keys = pd.factorize(data_df.a)
cols_idx, col_keys = pd.factorize(data_df.b)
values = data_df.pmi

matrix = csr_matrix((values, (rows_idx, cols_idx)))
key_to_row = {key: idx for idx, key in enumerate(row_keys)}

In [41]:
svd = TruncatedSVD(200)
factors = svd.fit_transform(matrix)

In [43]:
def most_similar(ingredient, topn=10):
    if ingredient not in key_to_row:
        print("Unknown ingredient.")
    factor = factors[key_to_row[ingredient]]
    cosines = cosine_similarity([factor], factors)[0]
    indices = cosines.argsort()[::-1][:topn + 1]
    keys = [row_keys[idx] for idx in indices if idx != key_to_row[ingredient]]
    return keys, cosines[indices]

def display_most_similar(ingredient, topn=10):
    print("- Most similar to '{}'".format(ingredient))
    for similar_ing, score in zip(*most_similar(ingredient, topn)):
        print("  . {} : {:.2f}".format(similar_ing, score))    

In [44]:
display_most_similar('chile powder')

- Most similar to 'chile powder'
  . Kraft Sharp Cheddar Cheese : 1.00
  . full fat sour cream : 0.69
  . baking apples : 0.69
  . organic butter : 0.67
  . colby jack cheese : 0.62
  . cottage cheese : 0.58
  . Tapatio Hot Sauce : 0.55
  . Mexican oregano : 0.55
  . bitter melon : 0.53
  . barbecued pork : 0.51


In [45]:
display_most_similar('harissa')

- Most similar to 'harissa'
  . couscous : 1.00
  . dried mint flakes : 0.98
  . chapati : 0.98
  . pitas : 0.96
  . bulgur wheat : 0.96
  . quinoa : 0.95
  . chickpea flour : 0.95
  . roti : 0.95
  . gouda : 0.94
  . feta cheese crumbles : 0.92


In [262]:
display_most_similar('rice noodles')

- Most similar to 'rice noodles'
  . organic low sodium chicken broth : 1.00
  . laksa paste : 0.97
  . good seasons italian dressing mix : 0.70
  . fillet red snapper : 0.64
  . egg noodles : 0.64
  . garland chrysanthemum : 0.63
  . hoisin sauce : 0.62
  . cabbage head : 0.62
  . beansprouts : 0.59
  . low sodium parmesan cheese : 0.58


In [263]:
display_most_similar('pork')

- Most similar to 'pork'
  . cabbage head : 1.00
  . kettle chips : 0.93
  . napa cabbage leaves : 0.82
  . pancit : 0.76
  . egg noodles : 0.74
  . good seasons italian dressing mix : 0.73
  . fillet red snapper : 0.69
  . garland chrysanthemum : 0.69
  . low sodium parmesan cheese : 0.67
  . reduced sodium smoked ham : 0.66


In [267]:
display_most_similar('vanilla')

- Most similar to 'vanilla'
  . poppy seed filling : 1.00
  . pandan essence : 0.99
  . raw milk : 0.97
  . sugar pearls : 0.95
  . hot cross buns : 0.93
  . cassis : 0.92
  . instant oats : 0.87
  . oil of orange : 0.68
  . refined sugar : 0.64
  . mint extract : 0.58


In [269]:
display_most_similar('whipped cream')

- Most similar to 'whipped cream'
  . hot fudge topping : 1.00
  . amarena cherries : 1.00
  . fast rising yeast : 1.00
  . gingersnap cookies : 1.00
  . meyer lemon peel : 1.00
  . ibarra : 1.00
  . blackstrap molasses : 0.99
  . chestnut purée : 0.98
  . gingersnap cookie crumbs : 0.98
  . marshmallow vodka : 0.95


In [268]:
display_most_similar('buffalo mozarella')

- Most similar to 'buffalo mozarella'
  . hot pepperoni : 1.00
  . stonefire italian thin pizza crust : 1.00
  . smoked gouda : 0.95
  . new york style panetini® toasts : 0.91
  . jarlsberg : 0.90
  . sweet yellow corn : 0.89
  . crumbled cornbread : 0.85
  . cheese ravioli : 0.82
  . bbq seasoning : 0.78
  . smoked cheddar cheese : 0.75
