In [1]:
import pandas as pd
import numpy as np

### Read the initial dataset with recipes

In [2]:
import json
from pprint import pprint

In [3]:
with open('epicurious/full_format_recipes.json') as f:
    recipes = json.load(f)

In [5]:
# remove empty recipes
length=len(recipes)
for i in range(length):    
    if recipes[i]=={}:
        del recipes[i]
        length+=-1

In [6]:
len(recipes)

20111

### Upload the ingredients from "What's cooking dataset" to extract ingredients from recipes


In [7]:
with open('cooking.json') as f:
    cooking = json.load(f)

In [8]:
#Stem ingredient names from "what's cooking dataset"

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize("apples"))

apple


In [9]:
#@ Obtain the clean list of ingredients
ing=[]

for x in cooking:
    ing.extend(x['ingredients'])
    
ing=set(ing)

ingredients=[]
for x in ing:
    ingredients.append(lemmatizer.lemmatize(x))

In [10]:
ingredients=list(set(ingredients))

In [16]:
len(ingredients)

6714

### Extract ingredients from recipes and record them as "ingredients_short" in json file

In [12]:
def extract(x):
    a=[]
    for ing in ingredients:
        if ing in x.lower():
            a.append(ing)
    if len(a)>1:
        a=max(a, key=len)    
    elif len(a)==1:
        a=a[0]
    elif len(a)==0:
        a='no ingredient'
    return a

In [13]:
for i in range(len(recipes)):
    try:
        b=recipes[i]['ingredients']
        n=[]
        for bb in b:
            n.append(extract(bb))
        recipes[i]['ingredients_short']=n
    except:
        continue

In [None]:
for i  in range(len(recipes)):
    recipes[i]['categories']=[x.lower() for x in recipes[i]['categories']]

### Extract categories to review what's there besides food items

In [None]:
categories=[]

In [None]:
for i  in range(len(recipes)):
    a=[x for x in recipes[i]['categories'] if x not in new_ingredients]
    categories.extend(a)

In [None]:
categories=list(set(categories))

In [None]:
categories

### Create the new dataset with all ingredients and nutrition information

In [14]:
# Extract ingredients present in the Epicurious dataset
new_ingredients=[]
for x in recipes:
    try:
        new_ingredients.extend(x['ingredients_short'])
    except:
        continue

In [63]:
new_ingredients=list(set(new_ingredients))

len(new_ingredients)

3594

In [64]:
data=pd.DataFrame([[1 if new_ingredients[j] in recipes[i]["ingredients_short"] else 0 for j in range(len(new_ingredients))] 
                   for i in range(len(recipes))])

In [67]:
data.columns=new_ingredients

In [77]:
data['title']=pd.Series([x['title'] for x in recipes])

In [82]:
t=['title']
t.extend(data.columns[:-1])
data=data[t]

In [85]:
data.head()

Unnamed: 0,title,mint,mango nectar,rose water,atta,apricot nectar,vegetable juice,crushed peppermint candy,fino sherry,garlic sauce,...,ajwain,squid tube,mirin,cider,low-fat cream cheese,asakusa nori,pita rounds,rapid rise yeast,turkey giblet stock,goji berries
0,"Lentil, Apple, and Turkey Wrap",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Boudin Blanc Terrine with Red Onion Confit,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Potato and Fennel Soup Hodge,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Mahi-Mahi in Tomato Olive Sauce,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Spinach Noodle Casserole,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [86]:
# nutritional value
data['calories']=pd.Series([x['calories'] for x in recipes])
data['protein']=pd.Series([x['protein'] for x in recipes])
data['fat']=pd.Series([x['fat'] for x in recipes])
data['carbs']=(data['calories']-data['fat']*9-data['protein']*4)/4
data['sodium']=pd.Series([x['sodium'] for x in recipes])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [87]:
# of steps to create the dish (for complexity measure)
data['steps']=pd.Series([len(x['directions']) for x in recipes])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [99]:
# meal type
data['breakfast']=pd.Series([int('breakfast' in x['categories']) for x in recipes])
data['lunch']=pd.Series([int('lunch' in x['categories']) for x in recipes])
data['dinner']=pd.Series([int('dinner' in x['categories']) for x in recipes])
data['snack']=pd.Series([int('snack' in x['categories']) for x in recipes])
data['drink']=pd.Series([int(('drink' in x['categories'])|('drinks' in x['categories'])) for x in recipes])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [100]:
data.head()

Unnamed: 0,title,mint,mango nectar,rose water,atta,apricot nectar,vegetable juice,crushed peppermint candy,fino sherry,garlic sauce,...,calories,protein,carbs,sodium,steps,breakfast,lunch,dinner,snack,drink
0,"Lentil, Apple, and Turkey Wrap",0,0,0,0,0,0,0,0,0,...,426.0,30.0,60.75,559.0,3,0,0,0,0,0
1,Boudin Blanc Terrine with Red Onion Confit,0,0,0,0,0,0,0,0,0,...,403.0,18.0,31.0,1439.0,5,0,0,0,0,0
2,Potato and Fennel Soup Hodge,0,0,0,0,0,0,0,0,0,...,165.0,6.0,19.5,165.0,2,0,0,0,0,0
3,Mahi-Mahi in Tomato Olive Sauce,0,0,0,0,0,0,0,0,0,...,,,,,2,0,0,1,0,0
4,Spinach Noodle Casserole,0,0,0,0,0,0,0,0,0,...,547.0,20.0,44.75,452.0,1,0,0,0,0,0


### Add cuisine location information (scarse in the file)

In [None]:
countries=pd.read_csv("countries.csv", encoding = "ISO-8859-1")

In [None]:
countries=[x.lower() for x in countries['Countries'] if len(x)>1]

In [None]:
countries.remove('turkey')

In [None]:
countries

In [None]:
states=pd.read_csv("states.csv")

In [None]:
states=list(states['State Name'])
states=[x.lower() for x in states]

In [None]:
states

In [None]:
def get_location(x):
    s=set(states).intersection(x)
    c=set(countries).intersection(x)
    if len(s)>0: return "usa"
    elif len(c)>0: return c
    else: return np.nan
    

In [None]:
#data['location']=pd.Series([get_location(x['categories']) for x in recipes])

In [None]:
data.head()

In [None]:
sum(data.loc[1][1:3389])

In [None]:
len(recipes[1]['ingredients_short'])

In [None]:
len(recipes[1]['ingredients_short'])

In [None]:
[recipes[1]['ingredients'][i] for i in range(len(recipes[1]['ingredients'])) if recipes[1]['ingredients_short'][i]=='no ingredient']

In [None]:
data.head()

In [None]:
data.loc[1]['lettuce leaves']

In [None]:
sum(data.loc[1][1:3590])

### Save the new dataset as csv and recipes json files

In [None]:
data.to_csv("epi_data.csv")

In [None]:
with open('recipes.json', 'w') as outfile:
    json.dump(recipes, outfile)