In [8]:
# 1-b Tell us about the data
import numpy as np
from matplotlib import pylab as plt
import json
from pprint import pprint

# Read the whole training set into memory; each record is indexed below by
# its "id", "cuisine" and "ingredients" keys.
with open("train.json", "r") as f:
    data = json.load(f)

# Parallel per-recipe lists of ids and cuisine labels, plus one flat list of
# every ingredient mention across all recipes (repeats are kept).
ids = [recipe["id"] for recipe in data]
cuisine = [recipe["cuisine"] for recipe in data]
ingredients = [name for recipe in data for name in recipe["ingredients"]]

# Dataset summary: number of recipes, distinct cuisines, ingredient mentions
# (with repeats) and distinct ingredients.
unique_cuisines = set(cuisine)
unique_ingredients = set(ingredients)
print("Total id: {}".format(len(ids)))
print("Total cuisine unique: {}".format(len(unique_cuisines)))
print("Total ingredients: {}".format(len(ingredients)))
print("Total ingredients unique: {}".format(len(unique_ingredients)))

Total id: 39774
Total cuisine unique: 20
Total ingredients: 428275
Total ingredients unique: 6714


In [13]:
# 1-c Feature Vector Extraction
# Map every distinct ingredient to a column index. The vocabulary is sorted so
# the column layout is identical on every run; iterating a set of strings
# directly yields an order that can change between interpreter sessions, which
# would make saved feature matrices and fitted models non-reproducible.
vocabulary = sorted(set(ingredients))
ing_dict = {name: column for column, name in enumerate(vocabulary)}
ing_count = len(ing_dict)

# Binary bag-of-ingredients matrix: one row per recipe, one column per
# ingredient, 1 where the recipe lists that ingredient. The array is allocated
# once up front instead of building a list of lists of Python ints and
# converting it afterwards.
train_data = np.zeros((len(data), ing_count), dtype=int)
for row, d in enumerate(data):
    for i in d["ingredients"]:
        train_data[row, ing_dict[i]] = 1

# Cuisine labels, in the same row order as train_data.
train_labels = np.array(cuisine, dtype=str)

(39774, 6714)


In [28]:
# 1-d Naïve Bayes
# 1-f Logistic Regression
# KFold is imported from sklearn.model_selection: the older
# sklearn.cross_validation module no longer exists in current scikit-learn.
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

# One instance of each model, all with library-default hyperparameters.
clf_G = GaussianNB()
clf_B = BernoulliNB()
clf_M = MultinomialNB()
clf_L = LogisticRegression()

# Number of cross-validation folds; also the divisor for the running means.
fold = 3
# Unshuffled K-fold: the splitter is configured with the number of splits and
# produces (train_index, test_index) pairs when .split() is given the data.
kf = KFold(n_splits=fold)

# Mean held-out accuracy per model, accumulated as score / fold per fold.
avg_G, avg_B, avg_M, avg_L = 0, 0, 0, 0
for train_index, test_index in kf.split(train_data):
    data_tr, data_t = train_data[train_index], train_data[test_index]
    labels_tr, labels_t = train_labels[train_index], train_labels[test_index]
    # Refit every model on this fold's training rows only.
    clf_G.fit(data_tr, labels_tr)
    clf_B.fit(data_tr, labels_tr)
    clf_M.fit(data_tr, labels_tr)
    clf_L.fit(data_tr, labels_tr)
    # Score on the held-out rows of the same fold.
    avg_G += clf_G.score(data_t, labels_t) / fold
    avg_B += clf_B.score(data_t, labels_t) / fold
    avg_M += clf_M.score(data_t, labels_t) / fold
    avg_L += clf_L.score(data_t, labels_t) / fold
print("Gaussian: {}".format(avg_G))
print("Bernoulli: {}".format(avg_B))
print("Multinomial: {}".format(avg_M))
print("Logistic Regression: {}".format(avg_L))

Gaussian: 0.3798461306381053
Bernoulli: 0.6835369839593705
Multinomial: 0.7220043244330467
Logistic Regression: 0.7755568964650275


In [39]:
# 1-g Kaggle
with open("test.json", "r") as f:
    test = json.load(f)

# Encode the test recipes with the training vocabulary (ing_dict / ing_count).
# An ingredient that never appeared in training has no column and is skipped.
# The matrix is allocated once rather than built as a list of lists.
test_ids = np.array([d["id"] for d in test])
test_data = np.zeros((len(test), ing_count), dtype=int)
for row, d in enumerate(test):
    for i in d["ingredients"]:
        if i in ing_dict:
            test_data[row, ing_dict[i]] = 1

# Refit the two models used for submission on the full training set, then
# predict a cuisine for every test recipe.
clf_M.fit(train_data, train_labels)
clf_L.fit(train_data, train_labels)
pred_M = clf_M.predict(test_data)
pred_L = clf_L.predict(test_data)

# Ids are written with an integer format; np.savetxt's default format would
# emit them as floats in scientific notation. Row order is the same in all
# three files, so line k of each prediction file belongs to line k of the ids.
np.savetxt("Test_Ids.csv", test_ids, delimiter=",", fmt="%d")
np.savetxt("Kaggle_Log.csv", pred_L, delimiter=",", fmt="%s")
np.savetxt("Kaggle_NB.csv", pred_M, delimiter=",", fmt="%s")

(9944,)
(9944, 6714)
(9944,)
(9944,)
