# Predicting potential ingredients given partial recipes

In [1]:
# Import libraries
from __future__ import print_function, division
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sklearn
import seaborn as sns
import json
from pandas.io.json import json_normalize
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
%matplotlib inline

### Data preparation


Load datasets to notebook from json files, prepare data to convert to bag of words.

In [9]:
# Initialise empty lists
train_ingr    = [] # Lists to save set of ingredients for each recipe...

train_id      = [] # Lists to store recipe id for recipes in both

train_cuisine = [] # Training data contains additional information with cuisine

# Load json files as lists of dictionaries for each recipe
with open('data/train.json') as json_file:  # Load train data
    dict_train = json.load(json_file)

    
# Process the data to make it suitable for CountVectorizer    
for train_recipe in dict_train: # iterate over dictionaries of recipes
    
    # For each dictionary with a recipe, replace spaces between words from the same ingredient 
    # with an underscore '_', then join all the ingredients in each recipe in a 
    # long string where ingredients are separated by spaces ' '.
    concat_ingr_tr = ' '.join([word.replace(" ", "") for word in train_recipe['ingredients']])
    
    # Append the result from previous step to the list of recipes with ingredients
    train_ingr.append(concat_ingr_tr)
    
    # Append the recipe id to list of recipe id's
    train_id.append(train_recipe['id'])
    
    # Append the cuisine to list of recipe id's
    train_cuisine.append(train_recipe['cuisine'])
        
    

print("\n*  Recipe dictionary: \n",dict_train[0])
    
# Display first three elements of test_ingr
print("\n*  Input for CountVectorizer: \n",train_ingr[0])


*  Recipe dictionary: 
 {'id': 10259, 'cuisine': 'greek', 'ingredients': ['romaine lettuce', 'black olives', 'grape tomatoes', 'garlic', 'pepper', 'purple onion', 'seasoning', 'garbanzo beans', 'feta cheese crumbles']}

*  Input for CountVectorizer: 
 romainelettuce blackolives grapetomatoes garlic pepper purpleonion seasoning garbanzobeans fetacheesecrumbles


Convert to 'bag-of-ingredients' format.

In [3]:
# Initialise CountVectorizer object
vectorizer = CountVectorizer()

# Transform list of ingredients to bag-of-ingredients format, for train and 
# test sets, and store in a pandas data frame
X_train = pd.DataFrame(vectorizer.fit_transform(train_ingr).todense())

# Add column names, using keys from vectorizer dictionary
X_train.columns = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

# Add recipe id
X_train.index = train_id

Explore the size of the datasets

In [4]:
print("Training data size:\n *  {} recipes\n *  {} ingredients".format(X_train.shape[0], X_train.shape[1]))
print("\nTest data size:\n *  {} recipes\n *  {} ingredients".format(X_test.shape[0], X_test.shape[1]))

Training data size:
 *  39774 recipes
 *  6782 ingredients

Test data size:
 *  9944 recipes
 *  6782 ingredients


In [5]:
print("Total data size:\n *  {} recipes".format(X_train.shape[0] + X_test.shape[0]))

Total data size:
 *  49718 recipes


In [11]:
# Count occurrences of each ingredient
ingr_count = X_train.sum(axis=0)

def hasNumbers(inputString):
     return any(char.isdigit() for char in inputString)
    
num_elem = [hasNumbers(ingr) for ingr in X_train.columns] # ingredients that contain a number
leq_three = list(ingr_count <= 3)                         # ingredients that appear three times or less

removed_idx = [a or b for a, b in zip(num_elem, leq_three)] # ingredients that meet both criteria


# Sum number of ingredients that appear 3 times or less
print('* ingredients removed for being numeric or appearing <= 3 times',sum(removed_idx))

# Display examples of infrequent ingredients
print('\n* sample of infrequent ingredients\n',ingr_count[0:15])

X_train.drop(X_train.columns[removed_idx], axis=1, inplace=True)

* ingredients removed for being numeric or appearing <= 3 times 0

* sample of infrequent ingredients
 aburaage                4
accentseasoning         6
achiote                 8
achiotepaste           11
achiotepowder           4
ackee                   9
acornsquash            18
actingbakingpowder     21
activedryyeast        377
addedblackbeans        18
addeddicedtomatoes     18
adobo                  15
adobosauce             94
adoboseasoning         10
adzukibeans             5
dtype: int64


In [12]:
def hasNumbers(inputString):
     return any(char.isdigit() for char in inputString)
    
indx = [hasNumbers(ingr) for ingr in X_train.columns]
X_train.columns[indx]

Index([], dtype='object')

In [13]:
# Count ingredients per recipe
recipe_count = X_train.sum(axis=1)
print('Recipes with 2 ingredients or less:',sum(recipe_count <= 2))

# Remove recipes with 2 ingredients or less
X_train = X_train.loc[recipe_count > 2]

Recipes with 2 ingredients or less: 255


In [14]:
print("Training data size:\n *  {} recipes\n *  {} ingredients".format(X_train.shape[0], X_train.shape[1]))
print("\nTest data size:\n *  {} recipes\n *  {} ingredients".format(X_test.shape[0], X_test.shape[1]))

Training data size:
 *  39519 recipes
 *  3717 ingredients

Test data size:
 *  9944 recipes
 *  6782 ingredients


In [15]:
X_train.to_csv('train_dataset.csv', sep='\t')