# Predicting potential ingredients given partial recipes

In [7]:
# Import libraries
from __future__ import print_function, division
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sklearn
import seaborn as sns
import json
from pandas.io.json import json_normalize
from sklearn.feature_extraction.text import CountVectorizer
import re
import string
%matplotlib inline

## Convert data to bag-of-words

**Load datasets to notebook from json files**

In [2]:
 def invalidChar(inputString):
        # Function to check if a character in a string is a number or a special character
        special_char = string.punctuation
        out = any(char.isdigit() or char in special_char for char in inputString)
        return out

**Prepare data to convert to bag of words**

In [3]:
# Load json files as lists of dictionaries for each recipe
with open('data/train.json') as json_file:  # Load train data
    dict_train = json.load(json_file)

In [202]:
# Initialise empty lists
train_ingr    = [] # Lists to save set of ingredients for each recipe...
train_id      = [] # Lists to store recipe id for recipes in both
train_cuisine = [] # Training data contains additional information with cuisine

list_of_lists = []

# Process the data to make it suitable for CountVectorizer    
for train_recipe in dict_train: # iterate over dictionaries of recipes
    
    # Clean ingredient names
    thisIngr = []
    for word in train_recipe['ingredients']:
        
        # Remove and upper case to lower case
        # Remove leading white space
        word = word.lower().lstrip()
        
        # Remove numbers and special characters from word
        for char in word:
            if not(char.islower() or char == ' '): word = word.replace(char,"")
        
        # Remove long spaces, oz, lb
        word     = word.replace("oz ","")
        word     = word.replace("lb ","")
        word     = word.replace("  ","")
        
        # Replace all spaces with "_" to keep separate words
        word     = word.replace(" ","_")
        
        # List of cleaned ingredients for this recipe
        thisIngr.append(word)
    
    # Create a list of lists of ingredients for each recipe (MAYBE NOT REQUIRED LATER)
    list_of_lists.append(thisIngr)
    
    # For each dictionary with a recipe, replace spaces between words from the same ingredient 
    # with an underscore '_', then join all the ingredients in each recipe in a 
    # long string where ingredients are separated by spaces ' '.
    concat_ingr_tr = ' '.join([word for word in thisIngr])
    
    # Append the result from previous step to the list of recipes with ingredients
    train_ingr.append(concat_ingr_tr)
    
    # Append the recipe id to list of recipe id's
    train_id.append(train_recipe['id'])
    
    # Append the cuisine to list of recipe id's
    train_cuisine.append(train_recipe['cuisine'])

# Save cuisine to dictionary of recipe ids 
cuisine_dict = dict(zip(train_id, train_cuisine))   

# Display first element of ingredient objects
print("\n*  Recipe dictionary: \n", dict_train[0])
print("\n*  Input for CountVectorizer: \n", train_ingr[0])


*  Recipe dictionary: 
 {'id': 10259, 'cuisine': 'greek', 'ingredients': ['romaine lettuce', 'black olives', 'grape tomatoes', 'garlic', 'pepper', 'purple onion', 'seasoning', 'garbanzo beans', 'feta cheese crumbles']}

*  Input for CountVectorizer: 
 romaine_lettuce black_olives grape_tomatoes garlic pepper purple_onion seasoning garbanzo_beans feta_cheese_crumbles


**Convert to 'bag-of-ingredients' format.**

In [130]:
# Initialise CountVectorizer object
vectorizer = CountVectorizer()

# Transform list of ingredients to bag-of-ingredients format, for train and 
# test sets, and store in a pandas data frame
X_train = pd.DataFrame(vectorizer.fit_transform(train_ingr).todense())

# Add column names, using keys from vectorizer dictionary
X_train.columns = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

# Add recipe id
X_train.index = train_id

## Clean up strange recipes & ingredients

In [132]:
# Remove underscore before ingredient
column_names = vectorizer.get_feature_names()

for name_idx, name in enumerate(column_names):
    # There could be more than one space!
    if name[0] == '_':
        X_train.drop(columns = name, axis = 1, inplace=True)

#### Explore the size of the datasets

In [133]:
print("Training data size:\n *  {} recipes\n *  {} ingredients".format(X_train.shape[0], X_train.shape[1]))

Training data size:
 *  39774 recipes
 *  6669 ingredients


#### Remove too rare/too common/nonsense ingredients

Perform a fist filter to reduce computational cost of a more sophisticated approach performed later

In [134]:
# Create a copy of the dataframe for cleaning
X_train_cl = X_train.copy(deep=True)

# Count occurrences of each ingredient
ingr_count = X_train_cl.sum(axis=0)

 # Ingredients that appear three times or less
not_many_ingr = list(ingr_count <= 3)                        

# Sum number of ingredients that appear 3 times or less.
print('* ingredients removed for being numeric or appearing <= 3 times',sum(not_many_ingr))

# Drop infrequent ingredients
X_train_cl.drop(X_train_cl.columns[not_many_ingr], axis = 1, inplace = True)

# Get index of ingredients of length less than 4 (with some exceptions)
accepted_short = [elem not in ['cod', 'pig','gin','ham','hen','ice','jam','oil','pig','rib','soy']
                  for elem in X_train_cl.columns]
short_ingr = np.array([len(elem) for elem in X_train_cl.columns]) <= 3
rm_short_name = [a and b for a, b in zip(accepted_short, short_ingr)]

# Drop ingredients of length less than 4 (with some exceptions)
X_train_cl.drop(X_train_cl.columns[rm_short_name], axis = 1, inplace = True)

# Sum number of ingredients of length less than 3.
print('* ingredients removed for being too short',sum(rm_short_name))

# New number of ingredients
print('* New number of ingredients',len(X_train_cl.columns))

# Recount occurrence of ingredients.
ingr_count = X_train_cl.sum(axis=0)

* ingredients removed for being numeric or appearing <= 3 times 3003
* ingredients removed for being too short 10
* New number of ingredients 3656


## Combine ingredients

In [135]:
def longest_common_substring(s1, s2, thresh=3):
    
    # Function to find the longest common substring between 
    # two strings and which is longer than a threshold
    m = [[0] * (1 + len(s2)) for i in range(1 + len(s1))]
    longest, x_longest = 0, 0
    for x in range(1, 1 + len(s1)):
        for y in range(1, 1 + len(s2)):
            
            # I don't want words starting with "_"
            if s1[x - 1] == s2[y - 1] and s1[x - 1] != "_": 
                m[x][y] = m[x - 1][y - 1] + 1
                
                # I don't want words with "_" in between;
                # single words or parts of a word only
                if m[x][y] > longest and "_" not in s1[x - m[x][y]: x]:
                    longest = m[x][y]
                    x_longest = x
            else:
                m[x][y] = 0
    out = s1[x_longest - longest: x_longest]
    
    # Clean underscore after the word
    if out != "" and out[-1] == "_": out = out[:-1]
        
    if len(out) > thresh:
        return out
    else:
        return ""

def uniqueString(string_list):
    # Function to clean a list of strings of strings that are contained in other strings
    string_list.sort(key=lambda s: len(s), reverse=True)
    out = []
    for s in string_list:
        if not any([s in o for o in out]):
            out.append(s)
    return out    
    
    
# Tetst the function    
longest_common_substring('egar_','cider_vinegar')

'egar'

**Generate substrings**

Find, for each ingredient a list of substrings that were found in common with other ingredients

In [136]:
# Intialise empty lists
vect_substr = [] # List for vectorizer
list_substr = [] # List for dictionary old vs new

# For each ingredient in our bag of ingredients
for ingr1 in X_train_cl.columns:
    
    # Initialise an empty list to store list of ingredients for one recipe (overwritten every time)
    common_subs = []
    
    # For each ingredient in our bag of ingredients (we now have a pair (ingr1, ingr2))
    for ingr2 in X_train_cl.columns:
        
        # Find the longest substring they have in common
        match = longest_common_substring(ingr1, ingr2)
        
        # If there is a substring longer than 4 char, single word and isn't already included, append
        if (match != "") and (match not in common_subs):
            common_subs.append(match)

    # Keep only unique substrings (not strings contained in others)
    common_subs = uniqueString(common_subs)
    
    # From list of strings to a single string with spaces
    vect_substr.append(' '.join([word for word in common_subs]))
    list_substr.append(common_subs) # List of lists
    
# Create a dictionary with the ingredient and the list of substring found for it
ingr_substr_dict = dict(zip(X_train_cl.columns, list_substr))   

**Create a new bag of words object with ingredients vs substrings**

In [137]:
# Sample input for vectorizer
vect_substr[0:5]

['abura', 'seasoning cent', 'achiote', 'achiote paste', 'achiote powder']

In [138]:
# Initialise CountVectorizer object
vect_substr_bog = CountVectorizer()

# Transform list of ingredients to bag-of-ingredients format, for train and 
# test sets, and store in a pandas data frame
substr_df = pd.DataFrame(vect_substr_bog.fit_transform(vect_substr).todense())

# Add column names, using keys from vectorizer dictionary
substr_df.columns = sorted(vect_substr_bog.vocabulary_, 
                           key=vect_substr_bog.vocabulary_.get)

# Add recipe id
substr_df.index = X_train_cl.columns

# Display first few rows of the generated dataframe
substr_df.head()

Unnamed: 0,abura,achiote,ackee,acle,acor,active,added,adobo,adzuki,agar,...,yoghurt,yogurt,yolk,yolks,yukon,yuzu,zest,zinfandel,ziti,zucchini
abura_age,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
accent_seasoning,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
achiote,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
achiote_paste,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
achiote_powder,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**Compute popularity of substrings**

To determine which part of the ingredient name to keep

In [139]:
# Compute popularity of the substring among both ingredients and recipes, store in a dataframe
substr_freq_names  = substr_df.sum(axis=0) # Count frequency of this substring in names
substr_freq_ingr   = []                    # Count frequency of this substring in ingredients
substr_freq_recipe = []                    # Count frequency of this substring in recipes

for substr in substr_df.columns:
    substr_freq_ingr.append(sum([substr in ingr for ingr in X_train_cl.columns]))
    
for substr in substr_df.columns:
    substr_freq_recipe.append(sum([substr in recipe for recipe in train_ingr]))

# Store in a dataframe
subs_freq_df = pd.DataFrame(np.column_stack([substr_freq_names, substr_freq_ingr, substr_freq_recipe]), 
                               columns=['freq_ingr', 'freq_names', 'freq_recipes'])
subs_freq_df.index = substr_freq_names.index

# Display first rows
subs_freq_df.head()

Unnamed: 0,freq_ingr,freq_names,freq_recipes
abura,1,1,4
achiote,3,3,23
ackee,1,1,9
acle,1,4,31
acor,1,2,28


In [178]:
# Create array on which to modify the ingredient names
new_columns = np.array(X_train_cl.columns)

for elem_idx, elem in enumerate(X_train_cl.columns):
    
    # Old list of substrings
    previous_list = np.array(ingr_substr_dict[elem])
    
    # Get 3 frequencies for all substrings in the ingredient
    counts = subs_freq_df.loc[ingr_substr_dict[elem]]
    
    # Store each frequency separately, but keeping substrings together
    ingr_count   = counts['freq_ingr'].values
    name_count   = counts['freq_names'].values
    recipe_count = counts['freq_recipes'].values
    
    # indeces of substrings that are popular enough
    many_substr = np.logical_or(ingr_count + name_count > 50, recipe_count > 1000)
    
    # Keep only these substrings
    new_list = list(previous_list[many_substr])
    
    # Join them in a single word to get a new ingredient name
    new_list.sort(key=len, reverse=True)
    if new_list != []: 
        new_columns[elem_idx] = '_'.join([word for word in new_list])

In [179]:
print("Ingredients combined:",X_train_cl.shape[1] - len(np.unique(new_columns)))
print("Unique ingredients now:", len(np.unique(new_columns)))

Ingredients combined: 1494
Unique ingredients now: 2162


**Combine columns in dataframe**

In [182]:
# Replace old columns
old_replaced = [] # List to store columns to be dropped

for old, new in zip(old_columns, new_columns):
    if new not in X_train_cl_copy.columns:
        # Create column new name
        X_train_cl_copy[new] = X_train_cl_copy[old] 
        old_replaced.append(old)
    elif new != old:
        # Combine old and new column in one
        X_train_cl_copy[new] = np.minimum(1, np.array(X_train_cl_copy[old] + X_train_cl_copy[new]))
        old_replaced.append(old)

# Drop old columns        
X_train_cl_copy.drop(columns = old_replaced, axis=1, inplace=True)

**Remove infrequent ingredients**

In [185]:
# Compute ingredient frequencies
ingr_count = X_train_cl_copy.sum(axis=0)

# Ingredients that appear three times or less
not_many_ingr = list(ingr_count <= 200)                        

# Sum number of ingredients that appear 3 times or less.
print('* ingredients removed for being numeric or appearing <= 50 times',sum(not_many_ingr))

# Drop infrequent ingredients
X_train_cl_copy.drop(X_train_cl_copy.columns[not_many_ingr], axis = 1, inplace = True)

* ingredients removed for being numeric or appearing <= 50 times 157


In [196]:
# Display remanining ingredients
remaining_ingr = list(X_train_cl_copy.columns)
remaining_ingr.sort()
remaining_ingr

# Manually remove irrelevant inredients
irrelevant = ['water', 'salt', 'white', 'chopped_fresh', 'chopped', 'dried', 'fresh','frozen',
             'green', 'ground','halfhalf','large','leaves','paste','powder','sauce','sliced','star',
             'yellow']

X_train_cl_copy.drop(columns = irrelevant, axis = 1, inplace = True)

#### Remove recipes with too many or too few ingredients

In [197]:
# Count ingredients per recipe.
recipe_count = X_train_cl_copy.sum(axis = 1)
print('Recipes with 2 ingredients or less:', sum(recipe_count <= 2))

not_too_few_idx = list(recipe_count > 2)
not_too_many_idx = list(recipe_count < 30)

# Remove recipes with <3 or >30 ingredients.
X_train_cl_copy = X_train_cl_copy.loc[[a and b for a, b in zip(not_too_few_idx, not_too_many_idx)]]

Recipes with 2 ingredients or less: 894


In [198]:
print("Training data size:\n *  {} recipes\n *  {} ingredients".format(X_train_cl_copy.shape[0], 
                                                                       X_train_cl_copy.shape[1]))

Training data size:
 *  37704 recipes
 *  311 ingredients


## Appendix
### Other methods tried to combine ingredients

Following [this](https://pdfs.semanticscholar.org/3f63/269aa7910774e9386b1ffb340a9e8638c02d.pdf) paper: <br>
"From the list of ingredients, a dictionary of the 355 most common ingredients occurring in at least 120 recipes was hand curated, then each recipe’s ingredient list was filtered with these words so that ingredients such as ‘1/2 teaspoon ground cardamom’ were reduced to simple ingredient features such as ‘cardamom’. In the end, the ingredients for each recipe were represented in an R355 binary vector, where the element in index i is 1 if ingredient i is present in the recipe, and 0 if absent. Quantities of the ingredients used were not taken into account."

In [None]:
n_common = 100 # Number of most common ingredients to keep.
ingr_count.sort_values(ascending = False, inplace = True) # Sort the list with ingredients by count.
common_ingredients = list(ingr_count[:n_common].index) # Save the most common ingredients.
noncommon_ingredients = [i for i in X_train.columns if not i in common_ingredients] # Save the noncommon ingredients.

In [None]:
# Loop over all common ingredients from longest string to shortest.
for common_ingr in sorted(common_ingredients, key = len, reverse = False):
    regex = re.compile('.*' + common_ingr + '.*')
    
    # Find the non-common ingredients matching this common ingredient.
    ingr_to_group = list(filter(regex.search, noncommon_ingredients))
    
    if ingr_to_group != None:
        
        # Combine equivalent ingredients in the common ingredient's column.
        X_train[common_ingr] = list(map(int, (X_train[ingr_to_group].sum(axis = 1) + X_train[common_ingr] >= 1)))
    
        # Drop the grouped non-common ingredients from the dataset.
#         X_train.drop(columns = ingr_to_group, inplace = True)
#         noncommon_ingredients = [i for i in X_train.columns if not i in common_ingredients]
        
X_train.drop(columns = noncommon_ingredients, inplace = True)

In [None]:
# Check new ingredient counts.
ingr_count2 = X_train.drop(columns='cuisine').sum(axis=0)
ingr_count2.sort_values(ascending = False, inplace = True)

## Add back cuisine information and save preprocessed data to CSV

In [211]:
X_train_cl_copy['cuisine'] = [cuisine_dict[key] for key in X_train_cl_copy.index]
X_train_cl_copy.head()

Unnamed: 0,active_dry_yeast,allpurpose_flour,asparagus,avocado,bacon,baguette,baking_soda,bananas,basil,beans,...,vegetable,potatoes_sweet,vegetable_cooking_spray,vinegar_white,onion_white,pepper_white,vinegar_white_wine,flour_wheat_whole,pepper_yellow_bell,cuisine
10259,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,greek
25693,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,southern_us
20130,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,filipino
13162,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,indian
6602,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,jamaican


In [212]:
X_train_cl_copy.to_csv('train_dataset.csv', sep = '\t')