In [1]:
#import the necessary library
%matplotlib inline
import numpy as npy
import pandas as panda
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from itertools import chain
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# read data into date frames 
train = panda.read_json("train.json")
shape = train.shape[0]
print ("The dishes number is %d." % shape)

The dishes number is 39774.


In [3]:
# handle the ingredients case
ingredients = set(chain.from_iterable(train.ingredients))
length = len(ingredients)
print ("There are in total %d ingredients." % length)

There are in total 6714 ingredients.


In [4]:
# calculate labels
labels = train.cuisine
label_enc = LabelEncoder()
y = label_enc.fit_transform(labels)

assert len(label_enc.classes_) == len(set(labels))
assert y.shape[0] == train.shape[0]
labels_length = len(label_enc.classes_)
print ("There are in total %d labels" % labels_length)

print (label_enc.classes_)

There are in total 20 labels
['brazilian' 'british' 'cajun_creole' 'chinese' 'filipino' 'french'
 'greek' 'indian' 'irish' 'italian' 'jamaican' 'japanese' 'korean'
 'mexican' 'moroccan' 'russian' 'southern_us' 'spanish' 'thai'
 'vietnamese']


In [7]:
# Here we use CountVectorizer to encode ingredients feature. 
# Here, since we have commas in string list, we can not split by commas
# The transformed feature matrix X is represented by sparse matrix (csr)

ingredients_list = list(map(lambda r: "sepearate".join(r), train.ingredients))
assert len(ingredients_list) == train.shape[0]

enc = CountVectorizer(vocabulary= ingredients, 
                      tokenizer=lambda x : x.split('sepearate'))
X = enc.fit_transform(ingredients_list)

assert X.shape == (train.shape[0], length)


In [8]:
# Naive Bayes under Gaussian and Bernoulli prior assumption
# Logistic Regression models to do classification.
r_accum = []
e_accum = []
kf = KFold(n_splits = 3)

for r_idx, e_idx in kf.split(X):
    r_accum.append(r_idx)
    e_accum.append(e_idx)
cls_gaussian_nb = GaussianNB()
cls_gaussian_nb.fit(X[r_accum[0]].toarray(), y[r_accum[0]])
gau_score = cls_gaussian_nb.score(X[e_accum[0]].toarray(), y[e_accum[0]])
print (gau_score)


0.3706441393875396


In [9]:
# Naiive Bayes Classifier - Bernouli prior assumption
cls_bernoulli_nb = BernoulliNB()
cls_bernoulli_nb.fit(X[:30000], y[:30000])
ber_score = cls_bernoulli_nb.score(X[30000:], y[30000:])
print (ber_score)

0.6903007980356046


In [11]:
# Logistic Regression Model to perform 3 fold cross-validation
cls_lr = LogisticRegression()
cls_lr.fit(X[r_accum[0]], y[r_accum[0]])
lr_score = cls_lr.score(X[e_accum[0]], y[e_accum[0]])
print (lr_score)



0.7710816111027304


In [None]:
# Test phase
test = panda.read_json('test.json')
print (test.columns)
X_te = enc.transform(map(lambda r: "sepearate".join(r), test.ingredients))

In [None]:
print (X_te.shape)
mdl = LogisticRegression()
mdl.fit(X, y)

In [79]:
y_te = mdl.predict(X_te)
labels_te = label_enc.inverse_transform(y_te)
ret = npy.column_stack((test.id, labels_te))
npy.savetxt('testResult_cooking.csv', ret, delimiter=',', fmt='%s', header='id,cuisine', comments='')