# Cook It Up with the Fantastic Four

In [45]:
# special IPython command to prepare the notebook for matplotlib
%matplotlib inline 

import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import sklearn
import statsmodels.api as sm

import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

# special matplotlib argument for improved plots
from matplotlib import rcParams

import plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
import plotly.plotly as py
from plotly.tools import FigureFactory as FF
import plotly.graph_objs as go
from IPython.display import IFrame
#print plotly.__version__ 
#plotly.offline.init_notebook_mode() # run at the start of every notebook


In [28]:
full_df = pd.read_json("train.json")
df_sample = full_df[5:10]
table = FF.create_table(df_sample)
py.iplot(table, filename='pandas_table')

# Data Collection

In [10]:
#from IPython.display import IFrame
#IFrame('http://stackoverflow.org', width=700, height=350)

from IPython.display import display, HTML, Javascript
HTML('<iframe src=http://en.mobile.wikipedia.org/?useformat=mobile width=700 height=350 seamless></iframe>')

### Train Data

In [37]:
from IPython.display import HTML, display

display(HTML('http://stackoverflow.org'), metadata=dict(isolated=True))

0,1,2,3
[here],[tarball],[@W3C],HTML v4.01
[here],[tarball],[@W3C],Cascading Stylesheets v2.1 (CSS2)

0,1,2,3
[here],[tarball],,"Old Netscape JS v1.3 Reference, including browser objects"
,,[@Mozilla],Core Javascript 1.5 Reference
,,[@Mozilla],Gecko DOM Reference
[here],[zip],[@W3C],W3C DOM Level 2 Core

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
FFF  FFF,CCC  CCC,999  999,666  666,333  333,000  000,FFC  C00,FF9  900,FF6  600,FF3  300,,,,,,
99C  C00,,,,,CC9  900,FFC  C33,FFC  C66,FF9  966,FF6  633,CC3  300,,,,,CC0  033
CCF  F00,CCF  F33,333  300,666  600,999  900,CCC  C00,FFF  F00,CC9  933,CC6  633,330  000,660  000,990  000,CC0  000,FF0  000,FF3  366,FF0  033
99F  F00,CCF  F66,99C  C33,666  633,999  933,CCC  C33,FFF  F33,996  600,993  300,663  333,993  333,CC3  333,FF3  333,CC3  366,FF6  699,FF0  066
66F  F00,99F  F66,66C  C33,669  900,999  966,CCC  C66,FFF  F66,996  633,663  300,996  666,CC6  666,FF6  666,990  033,CC3  399,FF6  6CC,FF0  099
33F  F00,66F  F33,339  900,66C  C00,99F  F33,CCC  C99,FFF  F99,CC9  966,CC6  600,CC9  999,FF9  999,FF3  399,CC0  066,990  066,FF3  3CC,FF0  0CC
00C  C00,33C  C00,336  600,669  933,99C  C66,CCF  F99,FFF  FCC,FFC  C99,FF9  933,FFC  CCC,FF9  9CC,CC6  699,993  366,660  033,CC0  099,330  033
33C  C33,66C  C66,00F  F00,33F  F33,66F  F66,99F  F99,CCF  FCC,,,,CC9  9CC,996  699,993  399,990  099,663  366,660  066
006  600,336  633,009  900,339  933,669  966,99C  C99,,,,FFC  CFF,FF9  9FF,FF6  6FF,FF3  3FF,FF0  0FF,CC6  6CC,CC3  3CC
003  300,00C  C33,006  633,339  966,66C  C99,99F  FCC,CCF  FFF,339  9FF,99C  CFF,CCC  CFF,CC9  9FF,996  6CC,663  399,330  066,990  0CC,CC0  0CC

0,1,2,3
[here],[tarball],[@PHP.net],PHP v5.2.8

0,1,2,3
[here],[tarball],[@python.org],Python v2.6.1

0,1,2,3
[here],[tarball],,Java 1.4.1 API
[here],[tarball],[@Sun],Java 6 API

0,1,2,3
[here],[tarball],[@SGI],STL API

0,1,2
[here],[@IETF],RFC-1945: Hypertext Transfer Protocol v1.0
[here],[@IETF],RFC-2616: Hypertext Transfer Protocol v1.1
[here],[@IETF],RFC-2617: HTTP Basic And Digest Authentication

0,1,2
[here],[@IETF],RFC-2045: MIME v1.0
[here],[@IETF],RFC-2046: MIME Media Types
[here],[@IETF],RFC-2387: MIME Multipart Related

0,1,2
[here],[@W3C],XSL Transforms (XSLT)
[here],[@W3C],XPath


### Test Data

In [29]:
test_df = pd.read_json("test.json")
df_sample = test_df[0:5]
table = FF.create_table(df_sample)
py.iplot(table, filename='pandas_table')

## Cleaning Data

In [32]:
import nltk
import re
from nltk.stem import WordNetLemmatizer
from collections import Counter
stemmer = WordNetLemmatizer()

In [33]:
def clean_recipe(recipe):
    # To lowercase
    recipe = [ i for i in recipe ]

    # Remove some special characters
    # Individuals replace have a very good performance

    def replacing(i):
        i = i.replace('&', '').replace('(', '').replace(')','')
        i = i.replace('\'', '').replace('\\', '').replace(',','')
        i = i.replace('.', '').replace('%', '').replace('/','')
        i = i.replace('"', '')
        
        return i
    
    # Replacing characters
    recipe = [ replacing(i) for i in recipe ]
    
    # Remove digits
    recipe = [ i for i in recipe if not i.isdigit() ]
    
    # Stem ingredients
    recipe = [ stemmer.lemmatize(i) for i in recipe ]
    
    return recipe

In [34]:
%%time
full_df['ingredients'] = full_df.ingredients.apply(lambda x: clean_recipe(x))

CPU times: user 9.21 s, sys: 186 ms, total: 9.39 s
Wall time: 9.63 s


##  Data Breakdown

### Cuisine Distribution in Train Set

In [44]:
from collections import Counter

# Find Cuisine Distribution
countdict = Counter(full_df.cuisine)
cuisines = countdict.keys()
cuisinescount = countdict.values()

# Plot Cuisine Distribution Bar Chart
data = [
    go.Bar(
        y=cuisinescount,
        x=cuisines,    
    )
]
layout = dict(
    title='Number of Recipes per Cuisine for Train Data',
)
fig = go.Figure(data=data , layout=layout)


# Creating Pie Chart
fig2 = {
    'data': [{'labels': cuisines,
              'values': cuisinescount,
              'type': 'pie'}],
    'layout': {'title': 'Number of Recipes per Cuisine for Train Data'}
}

# Plot Global Cuisine Distribution

myList=[]

for c in cuisines: 
    if c == 'brazilian':
        myList.append('BRA')
    elif c == 'british':
        myList.append('GBR')
    elif c == 'cajun_creole':
        myList.append('USA')
    elif c == 'chinese':
        myList.append('CHN')
    elif c == 'filipino':
        myList.append('PHL')
    elif c == 'french':
        myList.append('FRA')
    elif c == 'greek':
        myList.append('GRC')
    elif c == 'indian':
        myList.append('IND')
    elif c == 'irish':
        myList.append('IRL')
    elif c == 'italian':
        myList.append('ITA')
    elif c == 'jamaican':
        myList.append('JAM')
    elif c == 'japanese':
        myList.append('JPN')
    elif c == 'korean':
        myList.append('KOR')
    elif c == 'mexican':
        myList.append('MEX')
    elif c == 'moroccan':
        myList.append('MAR')
    elif c == 'russian':
        myList.append('RUS')
    elif c == 'southern_us':
        myList.append('USA')
    elif c == 'spanish':
        myList.append('ESP')
    elif c == 'thai':
        myList.append('THA')
    elif c == 'vietnamese':
        myList.append('VNM')

data3 = [ dict(
        type = 'choropleth',
        locations = myList,
        z = cuisinescount,
        text = cuisines,
        colorscale = [[0,"rgb(5, 10, 172)"],[0.35,"rgb(40, 60, 190)"],[0.5,"rgb(70, 100, 245)"],\
            [0.6,"rgb(90, 120, 245)"],[0.7,"rgb(106, 137, 247)"],[1,"rgb(220, 220, 220)"]],
        autocolorscale = True,
        reversescale = True,
        marker = dict(
            line = dict (
                color = 'rgb(180,180,180)',
                width = 0.5
            )
        ),
        colorbar = dict(
            autotick = False,
            tickprefix = '',
            title = 'Cuisine Count'
        ),
    ) ]

layout3 = dict(
    title = 'Global Cuisine Distribution',
    geo = dict(
        showframe = False,
        showcoastlines = True,
        projection = dict(
            type = 'Mercator'
        )
    )
)

fig = dict( data=data, layout=layout )
fig3 = dict( data=data3, layout=layout3 )

iplot(fig)
iplot(fig2)
iplot(fig3, validate=False)

### Top 10 Most Used Ingredients in Train Set

In [36]:
%%time

# Find Ingredient Distribution
recipe_ingredient = [Counter(clean_recipe(recipe)) for recipe in full_df.ingredients]
ingredient_distribution = sum(recipe_ingredient, Counter())

CPU times: user 4min 26s, sys: 4.11 s, total: 4min 30s
Wall time: 4min 29s


In [41]:
%%time

ingredient_fig = pd.DataFrame(ingredient_distribution, index=[0]).transpose()[0].sort(ascending=False, inplace=False)
table = FF.create_table(pd.DataFrame({ 'Ingredients' :ingredient_fig[:20].keys(), 'Count' : ingredient_fig[:20].values}))
# Find Cuisine Distribution
ingredients = ingredient_fig[:20].keys()
ingredientscount = ingredient_fig[:20].values
ingredients = ingredients[::-1]
ingredientscount = ingredientscount[::-1]

## Plot Cuisine Distribution
data = [
    go.Bar(
        x=ingredientscount,
        y=ingredients,
        orientation = 'h',
        xaxis='x2', yaxis='y2'
    )
]
table['data'].extend(data)

# Edit layout for subplots
table.layout.xaxis.update({'domain': [0, .3]})
table.layout.xaxis2.update({'domain': [0.45, 1.]})
# The graph's yaxis MUST BE anchored to the graph's xaxis
table.layout.yaxis2.update({'anchor': 'x2'})
# Update the margins to add a title and see graph x-labels. 
table.layout.margin.update({'t':50, 'b':100})
table.layout.update({'title': 'Top 10 Most Used Ingredients in Train Set'})

iplot(table)

CPU times: user 454 ms, sys: 35.2 ms, total: 490 ms
Wall time: 472 ms


### Top 10 Most Used Ingredients in Test Set

In [42]:
%%time
recipe_ingredienttest = [Counter(clean_recipe(recipe)) for recipe in test_df.ingredients]
ingredient_distributiontest = sum(recipe_ingredienttest, Counter())

CPU times: user 36.4 s, sys: 587 ms, total: 37 s
Wall time: 37 s


In [43]:
ingredient_figtest = pd.DataFrame(ingredient_distributiontest, index=[0]).transpose()[0].sort(ascending=False, inplace=False)
table = FF.create_table(pd.DataFrame({ 'Ingredients' :ingredient_figtest[:20].keys(), 'Count' : ingredient_figtest[:20].values}))

ingredientstest = ingredient_figtest[:20].keys()
ingredientscounttest = ingredient_figtest[:20].values

ingredientstest = ingredientstest[::-1]
ingredientscounttest = ingredientscounttest[::-1]

data = [
    go.Bar(
        x=ingredientscounttest,
        y=ingredientstest,
        orientation = 'h',
        xaxis='x2', yaxis='y2'
    )
]
table['data'].extend(data)

# Edit layout for subplots
table.layout.xaxis.update({'domain': [0, .3]})
table.layout.xaxis2.update({'domain': [0.45, 1.]})
# The graph's yaxis MUST BE anchored to the graph's xaxis
table.layout.yaxis2.update({'anchor': 'x2'})
# Update the margins to add a title and see graph x-labels. 
table.layout.margin.update({'t':50, 'b':100})
table.layout.update({'title': 'Top 10 Most Used Ingredients in Test Set'})

iplot(table)

### Ingredients Design Matrix

In [34]:
%%time

# The number of times each ingredient is used is stored in the 'sumbags' dictionary
bags_of_words = [ Counter(clean_recipe(recipe)) for recipe in full_df.ingredients ]
sumbags = sum(bags_of_words, Counter())

CPU times: user 4min 45s, sys: 4.96 s, total: 4min 50s
Wall time: 4min 49s


In [35]:
%%time

ingredientdf = full_df

for ingredient in list(sumbags.keys()):
    ingredientdf[ingredient] = ingredientdf.ingredients.apply(lambda x: ingredient in x)
#cuisine = ingredientdf.drop(['ingredients', 'id','index'], axis=1).groupby('cuisine')
cuisine = ingredientdf.drop(['ingredients', 'id'], axis=1).groupby('cuisine')
cuisine = cuisine.aggregate(np.sum)


CPU times: user 5min 3s, sys: 12.6 s, total: 5min 16s
Wall time: 5min 17s


In [36]:
cuisine.head()

Unnamed: 0_level_0,low-sodium fat-free chicken broth,sweetened coconut,baking chocolate,egg roll wrappers,bottled low sodium salsa,vegan parmesan cheese,clam sauce,sushi nori,broiler,jalapeno chilies,...,chioggia,red leaf lettuce,margarita salt,low sodium canned chicken broth,progresso reduced sodium chicken broth,seedless raspberry jam,steamed white rice,black radish,cinnamon ice cream,lower sodium beef broth
cuisine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
brazilian,0,1,0,0,0,0,0,0,0,19,...,0,0,0,0,0,0,0,0,0,0
british,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,1
cajun_creole,2,0,0,0,0,0,0,0,0,49,...,0,0,0,0,1,0,5,0,0,0
chinese,1,1,0,38,0,0,0,0,0,20,...,0,1,0,0,0,0,15,0,0,0
filipino,0,0,0,3,0,0,0,0,0,11,...,0,0,0,0,0,0,2,0,0,0


### Ingredients Distribution Per Cuisine

In [37]:
%%time
# Find Cuisine Distribution
ingredients = ingredient_fig[:10].keys()
ingredientscount = ingredient_fig[:10].values

CPU times: user 283 µs, sys: 45 µs, total: 328 µs
Wall time: 303 µs


In [38]:
%%time
from plotly import tools
import plotly.plotly as py
import plotly.graph_objs as go

nametuple = ();
for i, c in enumerate(list(cuisine.index)):
    nametuple = nametuple + (c,)

print nametuple

fig = tools.make_subplots(rows=10, cols=2, subplot_titles = nametuple)

for i, c in enumerate(list(cuisine.index)):    
    trace0 = go.Bar(
            x = cuisine.loc[c].sort(inplace=False, ascending=False)[:10].values,
            y = cuisine.loc[c].sort(inplace=False, ascending=False)[:10].keys(),
            orientation = 'h'
        )
    if i == 0:
        fig.append_trace(trace0, 1, 1)
    elif i == 1:
        fig.append_trace(trace0, 1, 2)        
    elif i == 2:
        fig.append_trace(trace0, 2, 1)        
    elif i == 3:
        fig.append_trace(trace0, 2, 2)        
    elif i == 4:
        fig.append_trace(trace0, 3, 1)       
    elif i == 5:
        fig.append_trace(trace0, 3, 2)       
    elif i == 6:
        fig.append_trace(trace0, 4, 1)       
    elif i == 7:
        fig.append_trace(trace0, 4, 2)
    elif i == 8:
        fig.append_trace(trace0, 5, 1)
    elif i == 9:
        fig.append_trace(trace0, 5, 2)        
    elif i == 10:
        fig.append_trace(trace0, 6, 1)        
    elif i == 11:
        fig.append_trace(trace0, 6, 2)        
    elif i == 12:
        fig.append_trace(trace0, 7, 1)       
    elif i == 13:
        fig.append_trace(trace0, 7, 2)       
    elif i == 14:
        fig.append_trace(trace0, 8, 1)       
    elif i == 15:
        fig.append_trace(trace0, 8, 2)
    elif i == 16:
        fig.append_trace(trace0, 9, 1)        
    elif i == 17:
        fig.append_trace(trace0, 9, 2)       
    elif i == 18:
        fig.append_trace(trace0, 10, 1)       
    elif i == 19:
        fig.append_trace(trace0, 10, 2)       

fig['layout'].update(height=2800,
                     title='Most Used Ingredients Per Cuisine')
iplot(fig)

(u'brazilian', u'british', u'cajun_creole', u'chinese', u'filipino', u'french', u'greek', u'indian', u'irish', u'italian', u'jamaican', u'japanese', u'korean', u'mexican', u'moroccan', u'russian', u'southern_us', u'spanish', u'thai', u'vietnamese')
This is the format of your plot grid:
[ (1,1) x1,y1 ]     [ (1,2) x2,y2 ]   
[ (2,1) x3,y3 ]     [ (2,2) x4,y4 ]   
[ (3,1) x5,y5 ]     [ (3,2) x6,y6 ]   
[ (4,1) x7,y7 ]     [ (4,2) x8,y8 ]   
[ (5,1) x9,y9 ]     [ (5,2) x10,y10 ] 
[ (6,1) x11,y11 ]   [ (6,2) x12,y12 ] 
[ (7,1) x13,y13 ]   [ (7,2) x14,y14 ] 
[ (8,1) x15,y15 ]   [ (8,2) x16,y16 ] 
[ (9,1) x17,y17 ]   [ (9,2) x18,y18 ] 
[ (10,1) x19,y19 ]  [ (10,2) x20,y20 ]



CPU times: user 427 ms, sys: 61.7 ms, total: 489 ms
Wall time: 451 ms


In [151]:
for i in ingredients:
    ingredients_all.append(i)    
cuisinefeatures=np.unique(ingredients_all) #create a feature list of unique ingrediants.

In [164]:
data = []

#for k, v in enumerate(cuisinefeatures):  
#        print v
#        trace0 = go.Bar(
#            x = list(cuisine.index),
#            y = cuisine.loc[c].sort(inplace=False, ascending=False)[:10].keys(),
#            name= cuisine.loc[c].sort(inplace=False, ascending=False)[:10].values
#        )
     
for col in cuisinefeatures:
    if col != 'cuisine':
        total = np.sum(cuisine[col].sort(inplace=False, ascending=False)[:20].values)
        data.append(
            go.Bar( 
                    x = list(cuisine.index),
                    y = cuisine[col].sort(inplace=False, ascending=False)[:20].values/total * 100,
                    name = col ) )

layout = go.Layout(
    title='Most Common Ingredients Distribution Per Cuisine',
    barmode='stack',
    height = 1000
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)


In [306]:
import pandas as pd
import colorlover as cl
import numpy as np
import plotly.plotly as py
import plotly.graph_objs as go

data = []
counter = 0

for col in cuisinefeatures[10:20]:
    if col != 'cuisine':
        data.append(
            go.Area(t=list(cuisine.index),
                    r=cuisine[col],
                    #marker=dict(color=cl.scales['9']['seq']['PuBu'][counter]),
                    name=col))
        counter+=1

layout = go.Layout(
    title='Most Common Ingredients Distribution Per Cuisine',
    barmode='stack',
    orientation=270,
    width=1000


)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [263]:
#This function returns a list of 100 id's for each cusine from input dataframe.
def build_df(indf):
    
    validdf_ids=[]
    random_ids=[]
    
    for k, v in indf.groupby('cuisine'):
        l=[]
        l.append([e for e in v.id])
        for i in xrange(len(l)):
            random_ids.append(np.random.choice(l[i], size=100, replace=False))
    
    validdf_ids = [item for sublist in random_ids for item in sublist]
    
    return validdf_ids

In [266]:
%%time
train_df=full_df[~full_df.id.isin(build_df(full_df))].reset_index()

CPU times: user 2.25 s, sys: 474 ms, total: 2.72 s
Wall time: 2.74 s


## Model Building

### Ingredients Map

In [268]:
%%time
#list of all possible ingredients

ingredients_all=[]

for i in full_df['ingredients']:
    for j in xrange(len(i)):
        ingredients_all.append(i[j])
        
for i in test_df['ingredients']:
    for j in xrange(len(i)):
        ingredients_all.append(i[j])

features=np.unique(ingredients_all) #create a feature list of unique ingrediants.

ingredients_map={v:k for k,v in enumerate(np.unique(ingredients_all).tolist())} #assign key to each ingredient

CPU times: user 1.59 s, sys: 264 ms, total: 1.85 s
Wall time: 1.84 s


### Term-Document Matrix

In [280]:
def build_designmatrix(indf):
    
    designmatrix=np.zeros((len(indf), len(features))) 
    all_ids = indf['id'] #list of all id's from input set.
    
    #build design matrix
    for i in xrange(len(indf)):
        for j in indf['ingredients'][indf['id']==all_ids[i]]:
                for k in xrange(len(j)):
                        designmatrix[i, ingredients_map[j[k]]] = 1
                        
    return designmatrix

In [281]:
def make_x(indf):
    #Your code here    
    X = build_designmatrix(indf)
    
    return X

In [282]:
import time
start_time = time.time()

X = make_x(train_df)
y = train_df['cuisine']

print("--- %s seconds ---" % (time.time() - start_time))



--- 33.074272871 seconds ---


In [283]:
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Generic classification and optimization functions from CS-109 labs
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# clf - original classifier
# parameters - grid to search over
# X - usually your training X matrix
# y - usually your training y 
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
def cv_optimize(clf, parameters, X, y, n_jobs=1, n_folds=5, score_func=None):
    if score_func:
        gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=n_jobs, scoring=score_func)
    else:
        gs = GridSearchCV(clf, param_grid=parameters, n_jobs=n_jobs, cv=n_folds)
    gs.fit(X, y)
    print "BEST", gs.best_params_, gs.best_score_, gs.grid_scores_
    best = gs.best_estimator_
    return best

In [284]:
# Create test/train mask
itrain, itest = train_test_split(xrange(train_df.shape[0]), train_size=0.6)
mask=np.ones(train_df.shape[0], dtype='int')
mask[itrain]=1
mask[itest]=0
mask = (mask==1)

Xtrain, Xtest, ytrain, ytest = X[mask], X[~mask], y[mask], y[~mask]

### SVC Model

In [290]:
import time
start_time = time.time()

from sklearn.svm import LinearSVC
clfsvm=LinearSVC(loss="hinge")
parameters = {"C": [0.0001,0.001, 0.01, 0.1, 1.0, 10.0, 100.0]}
clf = cv_optimize(clfsvm, parameters, Xtrain, ytrain, n_folds=5, score_func='f1_weighted')
clf=clf.fit(Xtrain, ytrain)
training_accuracy = clf.score(Xtrain, ytrain)
test_accuracy = clf.score(Xtest, ytest)


print "############# based on standard predict ################"
print "Accuracy on training data: %0.2f" % (training_accuracy)
print "Accuracy on test data:     %0.2f" % (test_accuracy)
print confusion_matrix(ytest, clf.predict(Xtest))
print "########################################################"

clfsvm = clf
svc_training_accuracy = training_accuracy
svc_test_accuracy = test_accuracy

svcTrainTime = round(time.time() - start_time)
print svcTrainTime

BEST {'C': 1.0} 0.754468659377 [mean: 0.38097, std: 0.00368, params: {'C': 0.0001}, mean: 0.56465, std: 0.00613, params: {'C': 0.001}, mean: 0.66381, std: 0.00545, params: {'C': 0.01}, mean: 0.75411, std: 0.00913, params: {'C': 0.1}, mean: 0.75447, std: 0.00603, params: {'C': 1.0}, mean: 0.70995, std: 0.00695, params: {'C': 10.0}, mean: 0.67086, std: 0.00578, params: {'C': 100.0}]
############# based on standard predict ################
Accuracy on training data: 0.92
Accuracy on test data:     0.76
[[  83    0    3    0    4    4    0    1    1    9    1    1    0   23
     0    2    7    4    4    1]
 [   1  118    5    4    1   34    4    6   19   24    3    1    2    6
     0   10   58    2    1    2]
 [   4    4  390    3    4   17    2    1    2   37    1    3    1   15
     1    3   81    6    0    1]
 [   2    2   10  840   21    6    1    7    2   19    3   33   25    8
     1    2   12    1   27   12]
 [   4    1    6   22  172    4    1    7    1    8    1    5    5    6
   


F-score is ill-defined and being set to 0.0 in labels with no predicted samples.



In [291]:
%%time
Xfinal = make_x(test_df)
ypred=clfsvm.predict(Xfinal)

CPU times: user 6.23 s, sys: 290 ms, total: 6.52 s
Wall time: 6.01 s


In [None]:
test_df['cuisine']=ypred
test_df.to_csv("SVC submission.csv")

### Decision Tree Model

A decision tree aims at minimizing entropy. Entropy is a measure of chaos; something that is very ordered has a very low entropy, something that is very messy has a very high entropy. A decision tree is structure that resembles a flow chart. Every node in the tree represents a decision that needs to be taken for determining the class. Based on the training data the most relevant features and their values are selected for lowering the entropy of the data set. In contrast to many other approaches to classification, decision trees are easy to interpret by humans. Though decision trees are mainly used for classification, they can also be used for regression.

In [295]:
start_time = time.time()
from sklearn import tree
clfTree1 = tree.DecisionTreeClassifier()
parameters = {"max_depth": [None], 'min_samples_leaf': [4, 5, 6]}

clf = cv_optimize(clfTree1, parameters, Xtrain, ytrain, n_jobs=1, n_folds=5, score_func='f1_weighted')
clf=clf.fit(Xtrain, ytrain)
    
training_accuracy = clf.score(Xtrain, ytrain)
test_accuracy = clf.score(Xtest, ytest)
print "############# based on standard predict ################"
print "Accuracy on training data: %0.2f" % (training_accuracy)
print "Accuracy on test data:     %0.2f" % (test_accuracy)
print confusion_matrix(ytest, clf.predict(Xtest))
print "########################################################"

Decision_Tree_training_accuracy = training_accuracy
Decision_Tree_test_accuracy = test_accuracy
clfDecisionTree = clf

dtTrainTime = round(time.time() - start_time)
print dtTrainTime

BEST {'max_depth': None, 'min_samples_leaf': 6} 0.57091459164 [mean: 0.56884, std: 0.00756, params: {'max_depth': None, 'min_samples_leaf': 4}, mean: 0.56946, std: 0.00453, params: {'max_depth': None, 'min_samples_leaf': 5}, mean: 0.57091, std: 0.00815, params: {'max_depth': None, 'min_samples_leaf': 6}]
############# based on standard predict ################
Accuracy on training data: 0.69
Accuracy on test data:     0.59
[[  40    5    7    1    8    3    1    7    0   21    4    0    0   34
     1    0   14    1    1    0]
 [   2   40   10    4    6   62    8    7    6   52    0    3    0   11
     2    9   77    2    0    0]
 [   0    7  277   19    4   36    1    6    2   63    2    3    6   42
     4    1   96    6    1    0]
 [   0    1    6  689   28   36    0    9    2   30   12   76   54   16
     1    0   25    2   40    7]
 [   4    1    6   54   91   12    0    9    6   16    6    6    3   11
     0    2   24    1   13    6]
 [   4   34   20   10    5  429   16   11   16  

### Random Forest Classification Model

A random forest is a model ensemble. An ensemble combines multiple models to achieve better results than a single model would. A random forest consists of multiple decision trees. Each tree in the forest has a different random subset of the features (subspace sampling) and the trees are fed with different subsets of the training data (bagging). Though a random forest gives more accurate results than a single decision tree, it is harder to read and takes more computational time to generate.



In [298]:
start_time = time.time()
from sklearn.ensemble import RandomForestClassifier
clfForest = RandomForestClassifier()
parameters = {"n_estimators": range(1, 20)}

clf = cv_optimize(clfForest, parameters, Xtrain, ytrain, n_jobs=1, n_folds=5, score_func='f1_weighted')
clf=clf.fit(Xtrain, ytrain)

training_accuracy = clf.score(Xtrain, ytrain)
test_accuracy = clf.score(Xtest, ytest)

print "############# based on standard predict ################"
print "Accuracy on training data: %0.2f" % (training_accuracy)
print "Accuracy on test data:     %0.2f" % (test_accuracy)
print confusion_matrix(ytest, clf.predict(Xtest))
print "########################################################"

Random_Forest_training_accuracy = training_accuracy
Random_Forest_test_accuracy = test_accuracy
clfRandomForest = clf

rfTrainTime = round(time.time() - start_time)
print rfTrainTime

BEST {'n_estimators': 19} 0.652827547131 [mean: 0.46255, std: 0.01415, params: {'n_estimators': 1}, mean: 0.46898, std: 0.00953, params: {'n_estimators': 2}, mean: 0.53395, std: 0.01127, params: {'n_estimators': 3}, mean: 0.56682, std: 0.01040, params: {'n_estimators': 4}, mean: 0.58451, std: 0.00709, params: {'n_estimators': 5}, mean: 0.60058, std: 0.00701, params: {'n_estimators': 6}, mean: 0.61270, std: 0.00736, params: {'n_estimators': 7}, mean: 0.61947, std: 0.00776, params: {'n_estimators': 8}, mean: 0.62396, std: 0.00762, params: {'n_estimators': 9}, mean: 0.63330, std: 0.00306, params: {'n_estimators': 10}, mean: 0.63477, std: 0.00421, params: {'n_estimators': 11}, mean: 0.64071, std: 0.00794, params: {'n_estimators': 12}, mean: 0.64599, std: 0.00411, params: {'n_estimators': 13}, mean: 0.64604, std: 0.00847, params: {'n_estimators': 14}, mean: 0.64330, std: 0.00500, params: {'n_estimators': 15}, mean: 0.65068, std: 0.00611, params: {'n_estimators': 16}, mean: 0.65236, std: 0.0

# Model Validation/Testing

### Horizontal Plots of Model Performances

In [311]:
import plotly.plotly as py
import plotly.graph_objs as go

timeList = [dtTrainTime, rfTrainTime, svcTrainTime]
longestTime = max(timeList)

dfTimePercentage = dtTrainTime/longestTime
rfTimePercentage = rfTrainTime/longestTime
nbTimePercentage = nbTrainTime/longestTime
svcTimePercentage = svcTrainTime/longestTime


trace0 = go.Bar(
    x=['Decision Tree', 'Random Forest', 'SVC'],
    y=[Decision_Tree_training_accuracy, Random_Forest_training_accuracy, svc_training_accuracy],
    name='Train Score',
    marker=dict(
        color='rgb(49,130,189)'
    )
)
trace1 = go.Bar(
    x=['Decision Tree', 'Random Forest', 'SVC'],
    y=[Decision_Tree_test_accuracy, Random_Forest_test_accuracy, svc_test_accuracy],
    name='Test Score',
    marker=dict(
        color='rgb(204,204,204)',
    )
)

trace2 = go.Bar(
    x=['Decision Tree', 'Random Forest', 'SVC'],
    y=[dfTimePercentage, rfTimePercentage, svcTimePercentage],
    name='Performance Time'
)


data = [trace0, trace1, trace2]
layout = go.Layout(
    xaxis=dict(
        # set x-axis' labels direction at 45 degree angle
        tickangle=0,
    ),
    barmode='group',
    title='Classifer Performance'
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

# Model Optimization: Feature Selection and Data Balancing

# Results

### Ingredients Distribution in Test Data

In [300]:
%%time
recipe_ingredienttest = [Counter(recipe) for recipe in test_df.ingredients]
ingredient_distributiontest = sum(recipe_ingredienttest, Counter())

CPU times: user 38.1 s, sys: 963 ms, total: 39 s
Wall time: 38.8 s


### Predicted Cuisine Distribution in Test Data

In [305]:
from collections import Counter

# Find Cuisine Distribution
countdict = Counter(test_df.cuisine)
cuisines = countdict.keys()
cuisinescount = countdict.values()

# Plot Cuisine Distribution Bar Chart
data = [
    go.Bar(
        y=cuisinescount,
        x=cuisines,    
    )
]
layout = dict(
    title='Number of Recipes per Cuisine for Test Data',
)
fig = go.Figure(data=data , layout=layout)


# Creating Pie Chart
fig2 = {
    'data': [{'labels': cuisines,
              'values': cuisinescount,
              'type': 'pie'}],
    'layout': {'title': 'Number of Recipes per Cuisine for Test Data'}
}

# Plot Global Cuisine Distribution

myList=[]

for c in cuisines: 
    if c == 'brazilian':
        myList.append('BRA')
    elif c == 'british':
        myList.append('GBR')
    elif c == 'cajun_creole':
        myList.append('USA')
    elif c == 'chinese':
        myList.append('CHN')
    elif c == 'filipino':
        myList.append('PHL')
    elif c == 'french':
        myList.append('FRA')
    elif c == 'greek':
        myList.append('GRC')
    elif c == 'indian':
        myList.append('IND')
    elif c == 'irish':
        myList.append('IRL')
    elif c == 'italian':
        myList.append('ITA')
    elif c == 'jamaican':
        myList.append('JAM')
    elif c == 'japanese':
        myList.append('JPN')
    elif c == 'korean':
        myList.append('KOR')
    elif c == 'mexican':
        myList.append('MEX')
    elif c == 'moroccan':
        myList.append('MAR')
    elif c == 'russian':
        myList.append('RUS')
    elif c == 'southern_us':
        myList.append('USA')
    elif c == 'spanish':
        myList.append('ESP')
    elif c == 'thai':
        myList.append('THA')
    elif c == 'vietnamese':
        myList.append('VNM')

data3 = [ dict(
        type = 'choropleth',
        locations = myList,
        z = cuisinescount,
        text = cuisines,
        colorscale = [[0,"rgb(5, 10, 172)"],[0.35,"rgb(40, 60, 190)"],[0.5,"rgb(70, 100, 245)"],\
            [0.6,"rgb(90, 120, 245)"],[0.7,"rgb(106, 137, 247)"],[1,"rgb(220, 220, 220)"]],
        autocolorscale = True,
        reversescale = True,
        marker = dict(
            line = dict (
                color = 'rgb(180,180,180)',
                width = 0.5
            )
        ),
        colorbar = dict(
            autotick = False,
            tickprefix = '',
            title = 'Cuisine Count'
        ),
    ) ]

layout3 = dict(
    title = 'Global Cuisine Distribution',
    geo = dict(
        showframe = False,
        showcoastlines = True,
        projection = dict(
            type = 'Mercator'
        )
    )
)

fig = dict( data=data, layout=layout )
fig3 = dict( data=data3, layout=layout3 )

iplot(fig)
iplot(fig2)
iplot(fig3, validate=False)

# Future Plans: Yummly Integration (Version 0.5.0)

In [None]:
from yummly import Client

# How to install Yummly
# pip install https://pypi.python.org/packages/py2.py3/y/yummly/yummly-0.5.0-py2.py3-none-any.whl
# pip install https://pypi.python.org/packages/py2.py3/y/yummly/ yummly-0.5.0.tar.gz

# About Yummly 
# https://pypi.python.org/pypi/yummly/0.5.0

# default option values
TIMEOUT = 5.0
RETRIES = 0
YOUR_API_ID = "c077aa3e"
YOUR_API_KEY = "0390dca06586895a80f9d7f549030bda"
client = Client(api_id=YOUR_API_ID, api_key=YOUR_API_KEY, timeout=TIMEOUT, retries=RETRIES)

search = client.search('green eggs and ham')
match = search.matches[0]

recipe = client.recipe(match.id)
print recipe

#### Search for recipes meeting certain criteria and limit your results to a maximum

In [None]:
from yummly import Client

results = client.search('bacon', maxResults=1)

print('Total Matches:', results.totalMatchCount)
for match in results.matches:
    print('Recipe ID:', match.id)
    print('Recipe:', match.recipeName)
    print('Rating:', match.rating)
    print('Ingredients:')
    for ingred in match.ingredientLines:
        print(ingred)
    print('----------------------------------------------------')

#### Provide search parameters:

In [None]:
params = {
    'q': 'pork chops',
    'start': 0,
    'maxResult': 40,
    'requirePicutres': False,
    'allowedIngredient[]': ['salt', 'pepper'],
    'excludedIngredient[]': ['cumin', 'paprika'],
    'maxTotalTimeInSeconds': 3600,
    'facetField[]': ['ingredient', 'diet'],
    'flavor.meaty.min': 0.5,
    'flavor.meaty.max': 1,
    'flavor.sweet.min': 0,
    'flavor.sweet.max': 0.5,
    'nutrition.FAT.min': 0,
    'nutrition.FAT.max': 15
}

results = client.search(**params, maxResults=1)