# Cook It Up with the Fantastic Four

In [None]:
# special IPython command to prepare the notebook for matplotlib
%matplotlib inline 

import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import sklearn
import statsmodels.api as sm

import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

# special matplotlib argument for improved plots
from matplotlib import rcParams

## Data Collection

In [2]:
full_df = pd.read_json("train.json")
full_df.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [3]:
test_df = pd.read_json("test.json")
test_df.head()

Unnamed: 0,id,ingredients
0,18009,"[baking powder, eggs, all-purpose flour, raisi..."
1,28583,"[sugar, egg yolks, corn starch, cream of tarta..."
2,41580,"[sausage links, fennel bulb, fronds, olive oil..."
3,29752,"[meat cuts, file powder, smoked sausage, okra,..."
4,35687,"[ground black pepper, salt, sausage casings, l..."


### Validation: Holdout Method

In a supervised learning problem one wants to determine how good the trained model is. Before the model is trained, the data is split into a training data set and a test data set. The training data set is used to train the model. The test data set is kept apart and used to determine how good the model is.

In [4]:
#This function returns a list of 100 id's for each cusine from input dataframe.
def build_df(indf):
    
    validdf_ids=[]
    random_ids=[]
    
    for k, v in indf.groupby('cuisine'):
        l=[]
        l.append([e for e in v.id])
        for i in xrange(len(l)):
            random_ids.append(np.random.choice(l[i], size=100, replace=False))
    
    validdf_ids = [item for sublist in random_ids for item in sublist]
    
    return validdf_ids

In [5]:
validation_df=full_df[full_df.id.isin(build_df(full_df))].reset_index()
train_df=full_df[~full_df.id.isin(build_df(full_df))].reset_index()

In [6]:
print "Full Dataframe"
print full_df.count()
print
print "Validation Dataframe"
print validation_df.count()
print
print "Training Dataframe"
print train_df.count()

Full Dataframe
cuisine        39774
id             39774
ingredients    39774
dtype: int64

Validation Dataframe
index          2000
cuisine        2000
id             2000
ingredients    2000
dtype: int64

Training Dataframe
index          37774
cuisine        37774
id             37774
ingredients    37774
dtype: int64


### Ingredients Map

In [7]:
#list of all possible ingredients

ingredients_all=[]

for i in full_df['ingredients']:
    for j in xrange(len(i)):
        ingredients_all.append(i[j])
        
for i in test_df['ingredients']:
    for j in xrange(len(i)):
        ingredients_all.append(i[j])

features=np.unique(ingredients_all) #create a feature list of unique ingrediants.

ingredients_map={v:k for k,v in enumerate(np.unique(ingredients_all).tolist())} #assign key to each ingredient

### Term-Document Matrix

In [8]:
def build_designmatrix(indf):
    
    designmatrix=np.zeros((len(indf), len(features))) 
    all_ids = indf['id'] #list of all id's from input set.
    
    #build design matrix
    for i in xrange(len(indf)):
        for j in indf['ingredients'][indf['id']==all_ids[i]]:
                for k in xrange(len(j)):
                        designmatrix[i, ingredients_map[j[k]]] = 1
                        
    return designmatrix

## Modeling & Optimization

### Decision Tree Classification Model

In [9]:
%%time
from sklearn import tree

clf = tree.DecisionTreeClassifier()
clf = clf.fit(build_designmatrix(train_df), train_df['cuisine'])
clf_predict=clf.predict(build_designmatrix(validation_df))

CPU times: user 5min 19s, sys: 3.53 s, total: 5min 23s
Wall time: 5min 24s


In [10]:
from sklearn.metrics import accuracy_score
print "Accuracy score for DecisionTreeClassifier" 
print accuracy_score(np.array(validation_df['cuisine']),clf_predict)

Accuracy score for DecisionTreeClassifier
0.931


In [11]:
predictions_as_dataframe = test_df.join(pd.DataFrame({"Prediction DT": clf_predict}))
print predictions_as_dataframe

         id                                        ingredients Prediction DT
0     18009  [baking powder, eggs, all-purpose flour, raisi...          thai
1     28583  [sugar, egg yolks, corn starch, cream of tarta...  cajun_creole
2     41580  [sausage links, fennel bulb, fronds, olive oil...  cajun_creole
3     29752  [meat cuts, file powder, smoked sausage, okra,...      filipino
4     35687  [ground black pepper, salt, sausage casings, l...        indian
5     38527  [baking powder, all-purpose flour, peach slice...        korean
6     19666             [grape juice, orange, white zinfandel]        korean
7     41217  [ground ginger, white pepper, green onions, or...        korean
8     28753  [diced onions, taco seasoning mix, all-purpose...        korean
9     22659  [eggs, cherries, dates, dark muscovado sugar, ...    vietnamese
10    21749  [pasta, olive oil, crushed red pepper, cherry ...        korean
11    44967  [water, butter, ground sumac, ground lamb, gro...       italian

In [12]:
#Predict on test set and write the result out.
final_result=clf.predict(build_designmatrix(test_df))
test_df['cuisine']=final_result
test_df.to_csv("Decision Tree Submission.csv")

### Random Forest Classification Model

A random forest is a model ensemble. An ensemble combines multiple models to achieve better results than a single model would. A random forest consists of multiple decision trees. Each tree in the forest has a different random subset of the features (subspace sampling) and the trees are fed with different subsets of the training data (bagging). Though a random forest gives more accurate results than a single decision tree, it is harder to read and takes more computational time to generate.



In [13]:
from time import time

# Machine Learning
from sklearn.ensemble import RandomForestClassifier

# Helper
from sklearn.cross_validation import cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
words = [' '.join(item) for item in full_df.ingredients]

In [16]:
vec = CountVectorizer(max_features=2000)  #Convert a collection of text documents to a matrix of token counts
bag_of_words = vec.fit(words).transform(words).toarray() #Transform documents to document-term matrix.
len(bag_of_words)

39774

In [17]:
random_forest = RandomForestClassifier(n_estimators=200) #creates a 200 tree forest

In [20]:
%%time
start = time()
random_forest.fit(bag_of_words, full_df.cuisine) #Learn a vocabulary dictionary of all tokens in the raw documents

CPU times: user 3min 12s, sys: 2.02 s, total: 3min 14s
Wall time: 3min 14s


In [22]:
%%time
start = time()
train_pred = cross_val_predict(random_forest, bag_of_words, full_df.cuisine, cv=2) #Generate cross-validated estimates for each input data point

CPU times: user 2min 38s, sys: 3.36 s, total: 2min 41s
Wall time: 2min 42s


In [23]:
len(train_pred)

39774

In [24]:
print("Accuracy: ", accuracy_score(train.cuisine, train_pred))

NameError: name 'train' is not defined

In [None]:
test = pd.read_json('test.json')

In [None]:
test_words = [' '.join(item) for item in test.ingredients]
test_bag = vec.transform(test_words).toarray()

In [None]:
len(test_bag)

In [None]:
result = random_forest.predict(test_bag)

output = pd.DataFrame(data={"id":test.id, "ingredients":test.ingredients, "cuisine":result})
output.head(20)
output.to_csv("Random Forest Submission.csv")

### Ridge Regression Model

In [35]:
%%time
from sklearn import preprocessing
from sklearn.linear_model import Ridge
est = Ridge(alpha=10)

labels = train_df['cuisine']
le = preprocessing.LabelEncoder()
labels_fea = le.fit_transform(labels)
est_fit=est.fit(build_designmatrix(train_df), labels_fea)
est_predict=est_fit.predict(build_designmatrix(test_df))
labels_converted = le.inverse_transform(est_predict.astype('I'))



IndexError: index 28 is out of bounds for axis 0 with size 20

In [32]:
predictions_as_dataframe = test_df.join(pd.DataFrame({"Prediction R": labels_converted}))
print predictions_as_dataframe

ValueError: LabelEncoder was not fitted yet.