# Predicting potential ingredients given partial recipes

In [1]:
# Import libraries
from __future__ import print_function, division
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sklearn
import seaborn as sns
import json
from pandas.io.json import json_normalize
from sklearn.feature_extraction.text import CountVectorizer
import re
%matplotlib inline

## Convert data to bag-of-words

Load the training dataset from its JSON file and prepare the ingredient lists for conversion to a bag-of-words representation.

In [2]:
# Read the training recipes: the JSON file holds a list of dictionaries, one
# per recipe, with the keys 'id', 'cuisine' and 'ingredients'.
with open('data/train.json') as json_file:
    dict_train = json.load(json_file)

# Build the CountVectorizer input, one string per recipe. The spaces inside
# each ingredient name are removed (e.g. 'romaine lettuce' -> 'romainelettuce')
# so that a multi-word ingredient becomes a single token; the ingredients of a
# recipe are then joined with single spaces.
train_ingr = [
    ' '.join(name.replace(" ", "") for name in recipe['ingredients'])
    for recipe in dict_train
]

# Recipe ids and cuisine labels, in the same order as train_ingr.
train_id = [recipe['id'] for recipe in dict_train]
train_cuisine = [recipe['cuisine'] for recipe in dict_train]

# Lookup table: recipe id -> cuisine label.
cuisine_dict = dict(zip(train_id, train_cuisine))

# Show the first raw recipe next to its vectorizer-ready string.
print("\n*  Recipe dictionary: \n", dict_train[0])
print("\n*  Input for CountVectorizer: \n", train_ingr[0])


*  Recipe dictionary: 
 {'id': 10259, 'cuisine': 'greek', 'ingredients': ['romaine lettuce', 'black olives', 'grape tomatoes', 'garlic', 'pepper', 'purple onion', 'seasoning', 'garbanzo beans', 'feta cheese crumbles']}

*  Input for CountVectorizer: 
 romainelettuce blackolives grapetomatoes garlic pepper purpleonion seasoning garbanzobeans fetacheesecrumbles


Convert to 'bag-of-ingredients' format.

In [3]:
# Fit a CountVectorizer (default settings) on the training recipe strings.
vectorizer = CountVectorizer()
bag_of_ingredients = vectorizer.fit_transform(train_ingr)

# vocabulary_ maps each token to its column position in the count matrix, so
# ordering the tokens by that position gives the matching column labels.
ordered_tokens = [
    token
    for token, position in sorted(vectorizer.vocabulary_.items(), key=lambda item: item[1])
]

# Dense bag-of-ingredients frame: one row per recipe (indexed by recipe id),
# one column per token.
X_train = pd.DataFrame(
    bag_of_ingredients.todense(),
    index=train_id,
    columns=ordered_tokens,
)

## Clean up strange recipes & ingredients

#### Explore the size of the datasets

In [4]:
# Report the dimensions of the bag-of-ingredients frame (rows = recipes, columns = ingredients).
n_recipes, n_ingredients = X_train.shape
print("Training data size:\n *  {} recipes\n *  {} ingredients".format(n_recipes, n_ingredients))

Training data size:
 *  39774 recipes
 *  6802 ingredients


#### Remove too rare/too common/nonsense ingredients

In [5]:
# Count occurrences of each ingredient (column sums over all recipes).
ingr_count = X_train.sum(axis=0)

def hasNumbers(inputString):
    """Return True if `inputString` contains at least one digit character."""
    return any(char.isdigit() for char in inputString)

# Per-column flags, in column order.
num_elem = [hasNumbers(ingr) for ingr in X_train.columns]  # Name contains a digit (e.g. '14oz').
leq_three = list(ingr_count <= 3)                          # Appears three times or less.

# An ingredient is removed when it meets EITHER criterion (logical OR).
removed_idx = [a or b for a, b in zip(num_elem, leq_three)]

# Number of ingredients flagged for removal.
print('* ingredients removed for being numeric or appearing <= 3 times',sum(removed_idx))

# Display the counts of the first 15 ingredient columns as examples.
print('\n* sample of infrequent ingredients\n',ingr_count.iloc[:15])

# Drop ingredients containing a digit or with fewer than 4 occurrences.
X_train = X_train.drop(columns=X_train.columns[removed_idx])

# Remove other tokens that are not ingredients.
# NOTE(review): presumably fragments left over from tokenising names such as
# 'extra-virgin olive oil' -- confirm.
# errors='ignore' keeps the cell re-runnable: without it, a second execution
# (or a dataset lacking one of these tokens) raises KeyError.
remove_vars = ['all', 'half', 'extra']
X_train = X_train.drop(columns=remove_vars, errors='ignore')

# Recount occurrence of the remaining ingredients.
ingr_count = X_train.sum(axis=0)

* ingredients removed for being numeric or appearing <= 3 times 3075

* sample of infrequent ingredients
 10oz               5
14                 3
14oz               2
15oz               3
1inchthick         2
21                 2
25                 3
2to3lb             2
33                 6
40                12
5oz                3
7up                4
8ounczitipasta     1
95                 4
abalone            2
dtype: int64


#### Remove recipes with too many or too few ingredients

In [6]:
# Total ingredient count of every recipe (row sums of the count matrix).
recipe_count = X_train.sum(axis = 1)
print('Recipes with 2 ingredients or less:', (recipe_count <= 2).sum())

# Row-wise flags: more than 2 ingredients, and fewer than 30 ingredients.
not_too_few_idx = (recipe_count > 2).tolist()
not_too_many_idx = (recipe_count < 30).tolist()

# Keep only recipes whose count lies between 3 and 29 inclusive.
# NOTE(review): the previous comment said ">30", but the condition also drops
# recipes with exactly 30 -- confirm which bound is intended.
keep_rows = [few_ok and many_ok for few_ok, many_ok in zip(not_too_few_idx, not_too_many_idx)]
X_train = X_train.loc[keep_rows]

Recipes with 2 ingredients or less: 268


In [7]:
# Report the dimensions of the frame after the clean-up steps (rows = recipes, columns = ingredients).
n_recipes, n_ingredients = X_train.shape
print("Training data size:\n *  {} recipes\n *  {} ingredients".format(n_recipes, n_ingredients))

Training data size:
 *  39449 recipes
 *  3724 ingredients


## Group equivalent ingredients in one variable
Following [this](https://pdfs.semanticscholar.org/3f63/269aa7910774e9386b1ffb340a9e8638c02d.pdf) paper: <br>
"From the list of ingredients, a dictionary of the 355 most common ingredients occurring in at least 120 recipes was hand curated, then each recipe’s ingredient list was filtered with these words so that ingredients such as ‘1/2 teaspoon ground cardamom’ were reduced to simple ingredient features such as ‘cardamom’. In the end, the ingredients for each recipe were represented in an $\mathbb{R}^{355}$ binary vector, where the element in index $i$ is 1 if ingredient $i$ is present in the recipe, and 0 if absent. Quantities of the ingredients used were not taken into account."

<font color='red'> 1. Should an ingredient be dropped as soon as it has matched a common ingredient? <br>
2. What should happen when, e.g., beef and beef broth are both common ingredients? Matching from the longest name to the shortest takes care of this case, whereas matching from the shortest to the longest takes care of cases such as chicken versus 'bonelessskinlesschickenbreasthalves'. <br>
3. Should salt and water be dropped?</font>

In [8]:
# Number of most common ingredients to keep.
n_common = 100

# Rank ingredients from most to least frequent.
# NOTE(review): ingr_count was last computed before the recipe filter above, so
# it still includes the recipes that were removed -- confirm this is intended.
ingr_count = ingr_count.sort_values(ascending=False)

# The n_common most frequent ingredients.
common_ingredients = ingr_count.index[:n_common].tolist()

# Every remaining column, in column order.
common_lookup = set(common_ingredients)
noncommon_ingredients = [name for name in X_train.columns if name not in common_lookup]

In [9]:
# Fold every non-common ingredient into each common ingredient whose name it
# contains, turning the common-ingredient columns into 0/1 presence flags.
#
# The loop visits common ingredients from the shortest name to the longest.
# The order currently has no effect on the result: non-common columns are left
# untouched until after the loop, and each common column is computed only from
# itself and the non-common columns.
for common_ingr in sorted(common_ingredients, key=len):

    # Non-common ingredients whose name contains this common ingredient.
    # A plain substring test replaces the former unescaped regex search.
    ingr_to_group = [name for name in noncommon_ingredients if common_ingr in name]

    # Flag the recipe with 1 when the common ingredient or any of its matches
    # occurs at least once. This runs even when nothing matched, so that every
    # common column is binarised.
    combined = X_train[ingr_to_group].sum(axis=1) + X_train[common_ingr]
    X_train[common_ingr] = (combined >= 1).astype('int64').values

# A non-common ingredient may have contributed to several common ingredients;
# all non-common columns are dropped together once the grouping is done.
X_train = X_train.drop(columns=noncommon_ingredients)

In [10]:
# Preview the first five recipes of the grouped, binarised frame.
X_train.head(n=5)

Unnamed: 0,avocado,bakingpowder,bakingsoda,bayleaves,blackbeans,blackpepper,bonelessskinlesschickenbreasts,brownsugar,butter,buttermilk,...,tomatopaste,tomatosauce,unsaltedbutter,vanillaextract,vegetableoil,virginoliveoil,water,whitesugar,yellowonion,zucchini
10259,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25693,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
20130,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
22213,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
13162,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0


In [30]:
# Check new ingredient counts, most frequent first.
# The 'cuisine' column is only added in a later cell, so on a top-to-bottom run
# it does not exist yet; errors='ignore' lets this cell work both before and
# after that column has been attached.
ingr_count2 = (
    X_train
    .drop(columns='cuisine', errors='ignore')
    .sum(axis=0)
    .sort_values(ascending=False)
)

## Add back cuisine information and save preprocessed data to CSV

In [11]:
# Attach the cuisine label of each recipe; the Series is keyed by recipe id and
# is aligned with the frame's index on assignment.
cuisine_by_id = pd.Series(cuisine_dict)
X_train = X_train.assign(cuisine=cuisine_by_id)

In [12]:
# Save the preprocessed frame, with the recipe-id index as the first column.
# The file is tab-delimited even though its name ends in .csv.
X_train.to_csv(path_or_buf='train_dataset.csv', sep='\t', index=True)