In [1]:
import numpy as np
from sklearn import linear_model
import json
import pickle
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import BernoulliNB

%matplotlib inline 


In [2]:
# Load the training recipes. The context manager closes the file handle as soon
# as the JSON has been parsed instead of leaving it open for the kernel's lifetime.
with open("train.json/train.json", "r") as f:
    data = json.load(f)
type(data[0])

dict

In [3]:
# Display the first record to see which fields a recipe carries.
data[0]

{'cuisine': 'greek',
 'id': 10259,
 'ingredients': ['romaine lettuce',
  'black olives',
  'grape tomatoes',
  'garlic',
  'pepper',
  'purple onion',
  'seasoning',
  'garbanzo beans',
  'feta cheese crumbles']}

In [4]:
# Display a single ingredient entry from the first recipe.
data[0]['ingredients'][0]

'romaine lettuce'

# b.) Tell us about the data...
How many samples (dishes) are there in the training set? How many
categories (types of cuisine)? Use a list to keep all the unique ingredients appearing in the
training set. How many unique ingredients are there?

In [5]:
# Collect the distinct cuisine labels and the distinct ingredient names that
# occur anywhere in the training recipes.
cuisineTypes = set()
uniqueIngredients = set()
for recipe in data:
    cuisineTypes.add(recipe['cuisine'])
    # update() grows the set in place; rebinding to union() built a fresh copy
    # of the whole accumulated set for every recipe.
    uniqueIngredients.update(recipe['ingredients'])

print("There are " + str(len(data)) + " dishes in the dataset.")
print("There are", len(uniqueIngredients), "unique types of ingredients in the dataset.")
print("There are", len(cuisineTypes), "types of cuisine in the dataset.")

There are 39774 dishes in the dataset.
There are 6714 unique types of ingredients in the dataset.
There are 20 types of cuisine in the dataset.


# d.)
Represent each dish by a binary ingredient feature vector. Suppose there are $d$ different ingredients
in total in the training set; represent each dish by a $1 \times d$ binary ingredient vector
$x$, where $x_i = 1$ if the dish contains ingredient $i$ and $x_i = 0$ otherwise. For example, suppose
all the ingredients we have in the training set are { beef, chicken, egg, lettuce, tomato, rice }
and the dish is made from the ingredients { chicken, lettuce, tomato, rice }; then the dish could be
represented by the $1 \times 6$ binary vector [0, 1, 0, 1, 1, 1] as its feature or attribute. Use an $n \times d$ feature
matrix to represent all the dishes in the training set and the test set, where $n$ is the number of dishes.

# Only run the following cells on your machine once to generate feature_mat.

In [6]:
# Freeze the ingredient vocabulary in sorted order so that each feature column
# maps to the same ingredient in every kernel session. The iteration order of a
# set of strings is not stable across interpreter runs (string hashing is
# randomized), so list(set) could misalign matrices built in different sessions.
uniqueIngredients = sorted(uniqueIngredients)

def vectorform(recipe, vocabulary=None):
    """Encode one recipe as a binary ingredient-indicator vector.

    Parameters
    ----------
    recipe : dict
        Recipe record; only its 'ingredients' entry (a list of ingredient
        names) is read.
    vocabulary : sequence of str, optional
        Ingredient names that define the columns of the vector, in order.
        Defaults to the notebook-level ``uniqueIngredients``.

    Returns
    -------
    numpy.ndarray
        1-D integer array with one entry per vocabulary item: 1 if that item
        appears in the recipe, 0 otherwise. Recipe ingredients that are not
        part of the vocabulary are ignored.
    """
    if vocabulary is None:
        vocabulary = uniqueIngredients
    # np.isin is the maintained replacement for the deprecated np.in1d.
    # assume_unique is left at its default because nothing guarantees that a
    # recipe lists each ingredient only once.
    return np.isin(vocabulary, recipe['ingredients']).astype(int)

In [7]:
# Encode every training recipe as one row and stack the rows into a 2-D matrix.
feature_mat_train = np.array([vectorform(recipe) for recipe in data])

print("Generated feature matrix of shape:", feature_mat_train.shape)

# Cache the matrix on disk so later sessions can load it instead of rebuilding it.
with open('feature_train.pickle', 'wb') as pickle_file:
    pickle.dump(
        feature_mat_train,
        pickle_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

Generated feature matrix of shape: (39774, 6714)


In [8]:
# Load the test recipes; the with-block closes the file once it has been parsed.
with open("test.json/test.json", "r") as f:
    data_test = json.load(f)

# Test recipes go through the same vectorform encoder as the training recipes,
# so both matrices share one column layout.
feature_mat_test = np.array(list(map(vectorform, data_test)))

print("Generated feature matrix of shape:", feature_mat_test.shape)

# Cache the matrix on disk so later sessions can load it instead of rebuilding it.
with open('feature_test.pickle', 'wb') as handle:
    pickle.dump(feature_mat_test, handle, protocol=pickle.HIGHEST_PROTOCOL)

Generated feature matrix of shape: (9944, 6714)


# Run the following cell to load feature mats without regenerating.

In [9]:
# Reload the cached feature matrices. Each file is opened in a with-block so its
# handle is closed right after loading.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open('feature_train.pickle', 'rb') as handle:
    feature_mat_train = pickle.load(handle)
with open('feature_test.pickle', 'rb') as handle:
    feature_mat_test = pickle.load(handle)
print("Loaded feature matrices of shapes:", feature_mat_train.shape, feature_mat_test.shape)
# Cuisine label of every training recipe, in the same order as `data`.
train_expected = np.array([recipe['cuisine'] for recipe in data])

Loaded feature matrices of shapes: (39774, 6714) (9944, 6714)


# e.)
Use a Naïve Bayes classifier to perform 3-fold cross-validation on the training set and report
your average classification accuracy. Try both the Gaussian distribution prior assumption and
the Bernoulli distribution prior assumption.

In [8]:
# 3-fold cross-validation of Gaussian Naive Bayes on the binary ingredient features.
# cv is passed explicitly because the default fold count depends on the installed
# scikit-learn version (it changed from 3 to 5), and the task asks for 3 folds.
gnb = GaussianNB()
scores = cross_val_score(gnb, feature_mat_train, train_expected, cv=3)
print(scores, np.mean(scores))

[ 0.37925055  0.38185384  0.377377  ] 0.379493793821


In [20]:
# NOTE(review): abandoned hold-out experiment, kept commented out. The second line
# refers to `iris`, which is not defined in the visible cells, so it would fail if
# uncommented as-is.
# X_train, X_test, y_train, y_test = train_test_split(feature_mat_train, train_expected, test_size=0.33, random_state=0)
# y_pred = gnb.fit(X_train, iris.target).predict(iris.data)

In [9]:
# 3-fold cross-validation of Bernoulli Naive Bayes on the binary ingredient features.
# cv is passed explicitly because the default fold count depends on the installed
# scikit-learn version (it changed from 3 to 5), and the task asks for 3 folds.
bnb = BernoulliNB()
scores = cross_val_score(bnb, feature_mat_train, train_expected, cv=3)
print(scores, np.mean(scores))

[ 0.68302797  0.68225356  0.68548144] 0.683587657646


# e.) 
For Gaussian prior and Bernoulli prior, which performs better in terms of cross-validation
accuracy? Why? Please give specific arguments.

# f.)
Use a Logistic Regression model to perform 3-fold cross-validation on the training set and
report your average classification accuracy.

In [10]:
# 3-fold cross-validation of one-vs-rest, L2-penalized logistic regression.
# cv is passed explicitly because the default fold count depends on the installed
# scikit-learn version (it changed from 3 to 5), and the task asks for 3 folds.
# NOTE(review): newer scikit-learn releases deprecate the multi_class argument --
# confirm against the installed version.
logreg = linear_model.LogisticRegression(penalty='l2', multi_class='ovr')
scores = cross_val_score(logreg, feature_mat_train, train_expected, cv=3)
print(scores, np.mean(scores))

[ 0.774787    0.77366317  0.77882584] 0.775758670409


# g.) 
Train your best-performing classifier on all of the training data, and generate test labels for
the test set. Submit your results to Kaggle and report the accuracy.

In [11]:
# Refit logistic regression (the highest cross-validation accuracy of the three
# models tried) on the full training set and label the test set.
logreg = linear_model.LogisticRegression(penalty='l2', multi_class='ovr')
logreg.fit(feature_mat_train, train_expected)
result = logreg.predict(feature_mat_test)

# Reload the test recipes to get their ids. Doing it here keeps the cell usable
# when the feature matrices were loaded from pickle and the generation cell was
# skipped. The with-block closes the file after parsing.
with open("test.json/test.json", "r") as f:
    data_test = json.load(f)

# zip() would silently truncate on a length mismatch, so check it up front.
assert len(result) == len(data_test), "prediction count does not match test set size"

# Write one "id,cuisine" row per test recipe, in test-set order.
with open('submission.csv', 'w') as f:
    f.write("id,cuisine\n")
    for recipe, cuisine in zip(data_test, result):
        f.write(str(recipe['id']) + "," + cuisine + "\n")

![Kaggle Submission](kaggle_score.png)


# The submitted file was renamed to submission.csv