# Predicting potential ingredients given partial recipes

In [23]:
# Import libraries
from __future__ import print_function, division
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sklearn
import seaborn as sns
import json
from pandas.io.json import json_normalize
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
%matplotlib inline

### Data preparation


Load datasets to notebook from json files, prepare data to convert to bag of words.

In [76]:
# Initialise empty lists
train_ingr    = [] # Lists to save set of ingredients for each recipe...
test_ingr     = [] # ... for training and test sets

train_id      = [] # Lists to store recipe id for recipes in both
test_id       = [] # ... training and test sets

train_cuisine = [] # Training data contains additional information with cuisine


# Load json files as lists of dictionaries for each recipe
with open('data/train.json') as json_file:  # Load train data
    dict_train = json.load(json_file)
with open('data/test.json') as json_file:   # Load test data
    dict_test = json.load(json_file)

    
# Process the data to make it suitable for CountVectorizer    
for train_recipe in dict_train: # iterate over dictionaries of recipes
    
    # For each dictionary with a recipe, replace spaces between words from the same ingredient 
    # with an underscore '_', then join all the ingredients in each recipe in a 
    # long string where ingredients are separated by spaces ' '.
    concat_ingr_tr = ' '.join([word.replace(" ", "") for word in train_recipe['ingredients']])
    
    # Append the result from previous step to the list of recipes with ingredients
    train_ingr.append(concat_ingr_tr)
    
    # Append the recipe id to list of recipe id's
    train_id.append(train_recipe['id'])
    
    # Append the cuisine to list of recipe id's
    train_cuisine.append(train_recipe['cuisine'])
        
        
# Repeat above process for test data
for test_recipe in dict_test: # iterate over dictionaries of recipes
    concat_ingr_ts = ' '.join([word.replace(" ", "_") for word in test_recipe['ingredients']])
    test_ingr.append(concat_ingr_ts)  
    test_id.append(test_recipe['id'])

print("\n*  Recipe dictionary: \n",dict_train[0])
    
# Display first three elements of test_ingr
print("\n*  Input for CountVectorizer: \n",train_ingr[0])


*  Recipe dictionary: 
 {'id': 10259, 'cuisine': 'greek', 'ingredients': ['romaine lettuce', 'black olives', 'grape tomatoes', 'garlic', 'pepper', 'purple onion', 'seasoning', 'garbanzo beans', 'feta cheese crumbles']}

*  Input for CountVectorizer: 
 romainelettuce blackolives grapetomatoes garlic pepper purpleonion seasoning garbanzobeans fetacheesecrumbles


Convert to 'bag-of-ingredients' format.

In [198]:
# Initialise CountVectorizer object
vectorizer = CountVectorizer()

# Transform list of ingredients to bag-of-ingredients format, for train and 
# test sets, and store in a pandas data frame
X_train = pd.DataFrame(vectorizer.fit_transform(train_ingr).todense())
X_test  = pd.DataFrame(vectorizer.transform(test_ingr).todense())

# Add column names, using keys from vectorizer dictionary
X_train.columns = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
X_test.columns = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

# Add recipe id
X_train.index = train_id
X_test.index  = test_id

Explore the size of the datasets

In [199]:
print("Training data size:\n *  {} recipes\n *  {} ingredients".format(X_train.shape[0], X_train.shape[1]))
print("\nTest data size:\n *  {} recipes\n *  {} ingredients".format(X_test.shape[0], X_test.shape[1]))

Training data size:
 *  39774 recipes
 *  6782 ingredients

Test data size:
 *  9944 recipes
 *  6782 ingredients


In [200]:
print("Total data size:\n *  {} recipes".format(X_train.shape[0] + X_test.shape[0]))

Total data size:
 *  49718 recipes


In [201]:
# Count occurrences of each ingredient
ingr_count = X_train.sum(axis=0)

def hasNumbers(inputString):
     return any(char.isdigit() for char in inputString)
    
num_elem = [hasNumbers(ingr) for ingr in X_train.columns] # ingredients that contain a number
leq_three = list(ingr_count <= 3)                         # ingredients that appear three times or less

removed_idx = [a or b for a, b in zip(num_elem, leq_three)] # ingredients that meet both criteria


# Sum number of ingredients that appear 3 times or less
print('* ingredients removed for being numeric or appearing <= 3 times',sum(removed_idx))

# Display examples of infrequent ingredients
print('\n* sample of infrequent ingredients\n',ingr_count[0:15])

X_train.drop(X_train.columns[removed_idx], axis=1, inplace=True)

* ingredients removed for being numeric or appearing <= 3 times 3065

* sample of infrequent ingredients
 10oz               5
14                 3
14oz               2
15oz               3
1inchthick         2
21                 2
25                 3
2to3lb             2
33                 6
40                12
5oz                3
7up                4
8ounczitipasta     1
95                 4
abalone            2
dtype: int64


In [202]:
# Count ingredients per recipe
recipe_count = X_train.sum(axis=1)
print('Recipes with 2 ingredients or less:',sum(recipe_count <= 2))

# Remove recipes with 2 ingredients or less
X_train = X_train.loc[recipe_count > 2]

Recipes with 2 ingredients or less: 255


In [203]:
print("Training data size:\n *  {} recipes\n *  {} ingredients".format(X_train.shape[0], X_train.shape[1]))
print("\nTest data size:\n *  {} recipes\n *  {} ingredients".format(X_test.shape[0], X_test.shape[1]))

Training data size:
 *  39519 recipes
 *  3717 ingredients

Test data size:
 *  9944 recipes
 *  6782 ingredients
