In [1]:
# import external libraries
import numpy as np
import pandas as pd
import json
from collections import defaultdict
from scipy import sparse

In [2]:
from nltk.corpus import stopwords
english_stopwords = stopwords.words('english')

In [3]:
def convert_to_json(filename='./data/train.json'):
    """
    Reads in a file and returns json 
    """
    
    with open(filename) as infile:
        return json.load(infile)

In [4]:
def get_column_names(row):
    """
    Takes in a row of the data and returns column names
    """
    return row.keys()    

In [5]:
def get_content(row, col_name):
    """
    Takes in a row and a column name
    and returns a list of values
    """
    if col_name == 'ingredients':
        return ' '.join(row[col_name])
    else:
        return row[col_name]

In [6]:
whats_cooking_train = convert_to_json()
whats_cooking_test = convert_to_json('./data/test.json')

In [7]:
def prepare_dataset(json_repr):
    """
    Takes in a json representation of the data
    and returns a Pandas DataFrame.
    """
    
    column_names = sorted(get_column_names(json_repr[0]))
    cols = []
    
    for col_name in column_names:
        cols.append([get_content(row, col_name) for row in json_repr])
    
    data = dict(zip(column_names, cols))
    
    df = pd.DataFrame(data)
    df.set_index('id', inplace=True)
    
    return df


In [8]:
whats_cooking_train_df = prepare_dataset(whats_cooking_train)
whats_cooking_test_df = prepare_dataset(whats_cooking_test)

In [9]:
whats_cooking_train_df.head()

Unnamed: 0_level_0,cuisine,ingredients
id,Unnamed: 1_level_1,Unnamed: 2_level_1
10259,greek,romaine lettuce black olives grape tomatoes ga...
25693,southern_us,plain flour ground pepper salt tomatoes ground...
20130,filipino,eggs pepper salt mayonaise cooking oil green c...
22213,indian,water vegetable oil wheat salt
13162,indian,black pepper shallots cornflour cayenne pepper...


In [10]:
whats_cooking_test_df.head()

Unnamed: 0_level_0,ingredients
id,Unnamed: 1_level_1
18009,baking powder eggs all-purpose flour raisins m...
28583,sugar egg yolks corn starch cream of tartar ba...
41580,sausage links fennel bulb fronds olive oil cub...
29752,meat cuts file powder smoked sausage okra shri...
35687,ground black pepper salt sausage casings leeks...


## Questions

In [11]:
import re

In [12]:
## What are the different unique ingredients used across various cuisines ?

def get_ingredients(cuisines):
    all_ingredients = []

    for i in range(cuisines.shape[0]):
        ## get all the ingredients
        ingredients = cuisines.iloc[i, 1].split(' ') # 1 here marks first column for ingredients
    
        for ingredient in ingredients:
            ingredient = re.sub(r'[^A-Za-z]', '', ingredient)
            # omit empty space and stopwords as ingredient name
            if len(ingredient) > 2 and ingredient not in english_stopwords:
                all_ingredients.append(ingredient.lower())
    
    return all_ingredients

def get_unique_ingredients(cuisines):
    all_ingredients = get_ingredients(cuisines)
    
    return set(all_ingredients)
    

In [13]:
print len(get_unique_ingredients(whats_cooking_train_df))

3023


** There are 3023 different ingredients used across various cuisines, bearing in mind that we considered 
   e.g. black olives to be ['black', 'olive'] as two separate ingredients **

In [14]:
from collections import Counter

In [15]:
## What are the top most used ingredients ?

def get_top_most_used_ingredients(cuisines):
    all_ingredients = get_ingredients(cuisines)
    
    ## counts frequency of each ingredient
    top_most_used_ingredients = Counter(all_ingredients)
    
    return sorted(top_most_used_ingredients, key=lambda x: top_most_used_ingredients[x], reverse=True)

In [16]:
top_most_used_ingredients = get_top_most_used_ingredients(whats_cooking_train_df)

In [17]:
## 20 top most used ingredients
print top_most_used_ingredients[:20]

[u'pepper', u'salt', u'oil', u'garlic', u'ground', u'fresh', u'sauce', u'sugar', u'onions', u'cheese', u'chicken', u'olive', u'black', u'water', u'red', u'flour', u'butter', u'tomatoes', u'green', u'powder']


** This seems legit, indeed these are some of the top-most used ingredients in preparation of any cuisine **

In [18]:
## Ingredients per cuisine

def get_ingredients_per_cuisine(grouped_cuisines, names_of_cuisines):
    ingredients_per_cuisine = {}
    
    for name in names_of_cuisines:
        cuisine_group = grouped_cuisines.get_group(name)
        ingredients_per_cuisine[name] = list(get_unique_ingredients(cuisine_group))
    
    return ingredients_per_cuisine

In [19]:
grouped_cuisines = whats_cooking_train_df.groupby(['cuisine'])

In [20]:
names_of_cuisines = whats_cooking_train_df.cuisine.unique()

In [21]:
ingredients_per_cuisine = get_ingredients_per_cuisine(grouped_cuisines, names_of_cuisines)

In [22]:
print ingredients_per_cuisine.keys()

[u'irish', u'mexican', u'chinese', u'filipino', u'vietnamese', u'moroccan', u'brazilian', u'japanese', u'british', u'greek', u'indian', u'jamaican', u'french', u'spanish', u'russian', u'cajun_creole', u'thai', u'southern_us', u'korean', u'italian']


In [23]:
## Lets check out what ingredients define indian cuisines
print ingredients_per_cuisine['indian'][:50]

[u'freerange', u'monterey', u'portabello', u'chinese', u'mackerel', u'yellow', u'soften', u'olive', u'mild', u'fivespice', u'skim', u'shortgrain', u'gluten', u'skin', u'roots', u'mascarpone', u'milk', u'cummin', u'preserves', u'grape', u'sago', u'pattypan', u'assam', u'peanut', u'sparkling', u'granular', u'curds', u'dressing', u'couscous', u'tzatziki', u'brown', u'turnips', u'demerara', u'quorn', u'garden', u'yeast', u'citrus', u'kewra', u'vegan', u'baton', u'vadouvan', u'jalape', u'figs', u'softened', u'mooli', u'kappa', u'bhindi', u'minute', u'tortillas', u'baking']


## Preprocessing

In [24]:
cuisines_train = whats_cooking_train_df.copy()
cuisines_test = whats_cooking_test_df.copy()

In [25]:
def process_ingredient_name(ingredient_name):
    ingredient_name = re.sub(r'^A-Za-z', '', ingredient_name.lower())
    return ingredient_name

cuisines_train['ingredients'] = cuisines_train.ingredients.map(process_ingredient_name)
cuisines_test['ingredients'] = cuisines_test.ingredients.map(process_ingredient_name)

## Encoding labels

In [26]:
from sklearn.preprocessing import LabelEncoder

In [27]:
## training labels
train_labels = cuisines_train.cuisine

In [28]:
lbl_encoder = LabelEncoder()
lbl_encoder.fit(train_labels)

LabelEncoder()

In [29]:
target = lbl_encoder.transform(train_labels)

## Modelling

In [30]:
## online learning algorithm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier, LogisticRegression
from sklearn.cross_validation import StratifiedShuffleSplit

In [147]:
sss = StratifiedShuffleSplit(target, test_size=0.3)

train_index, test_index = next(iter(sss))

In [148]:
train_X = cuisines_train.iloc[train_index, 1]
train_target = target[train_index]

test_X = cuisines_train.iloc[test_index, 1]
test_target = target[test_index]

In [149]:
vec = TfidfVectorizer(ngram_range=(1, 2), stop_words=english_stopwords)
X_train = vec.fit_transform(train_X)
y_train = train_target

In [150]:
pac = PassiveAggressiveClassifier(C=0.1)
pac.fit(X_train, y_train)

PassiveAggressiveClassifier(C=0.1, fit_intercept=True, loss='hinge', n_iter=5,
              n_jobs=1, random_state=None, shuffle=True, verbose=0,
              warm_start=False)

In [151]:
print 'Training score %f ', pac.score(X_train, y_train)

Training score %f  0.899777282851


In [152]:
X_test = vec.transform(test_X)
y_test = test_target

In [153]:
pac_preds = pac.predict(X_test)

In [154]:
print 'Test score %f ', pac.score(X_test, y_test)

Test score %f  0.780747319035


In [155]:
cvec = CountVectorizer()
X_train = cvec.fit_transform(train_X)

In [156]:
log = LogisticRegression(C=1, penalty='l1')
log.fit(X_train.toarray(), y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

In [157]:
print 'Training score %f ', log.score(X_train.toarray(), y_train)

Training score %f  0.839966951649


In [158]:
X_test = cvec.transform(test_X)
y_test = test_target

In [159]:
log_preds = log.predict(X_test.toarray())

In [160]:
print 'Test score %f ', log.score(X_test.toarray(), y_test)

Test score %f  0.778401474531


In [145]:
from sklearn.metrics import confusion_matrix

In [162]:
# confusion matrix for pac learner
print confusion_matrix(y_test, pac_preds)

[[  61    1    2    0    3    4    0    5    0   14    1    3    0   22
     0    0   17    3    4    0]
 [   1   71    1    1    2   42    0   11   14   30    0    0    3    1
     1    4   59    0    0    0]
 [   0    2  346    1    0   16    0    0    2   27    1    0    1   12
     0    2   53    1    0    0]
 [   2    1    6  691    7    3    0    7    0    8    2   14   15    4
     3    1    7    0   21   10]
 [   3    1    3   24  135    4    0    6    2   12    0    2    4    5
     0    1    9    3    8    5]
 [   1    8   12    3    7  484    5    6    3  183    2    3    0    5
     3    9   48   10    1    1]
 [   0    0    0    2    1    5  241    5    0   75    1    2    0    3
     9    0    5    3    1    0]
 [   0    1    3    2    1    2    8  823    0   10    1    1    1   15
    16    1    5    1    9    1]
 [   0   10    0    4    0   31    4    2   89   12    1    0    1    1
     2    0   39    2    1    1]
 [   2    4    9    2    1   86   18   10    4 2118    

In [167]:
lbl_encoder.classes_

array([u'brazilian', u'british', u'cajun_creole', u'chinese', u'filipino',
       u'french', u'greek', u'indian', u'irish', u'italian', u'jamaican',
       u'japanese', u'korean', u'mexican', u'moroccan', u'russian',
       u'southern_us', u'spanish', u'thai', u'vietnamese'], dtype=object)

In [164]:
# confusion matrix for logistic regression
print confusion_matrix(y_test, log_preds)

[[  75    1    2    0    4    3    0    5    0   11    1    0    0   19
     0    0   14    4    1    0]
 [   0   95    1    2    0   33    0    6   19   27    0    0    0    1
     1    2   50    4    0    0]
 [   0    3  338    1    0   11    0    0    1   24    1    0    1   11
     0    1   64    7    0    1]
 [   5    1    2  670    8    4    0    6    0   17    2   22   18    7
     0    2   12    0   21    5]
 [   5    1    2   24  135    7    0    3    2   15    0    4    0    5
     0    1   10    3    7    3]
 [   3   12    5    2    1  500    8    1    6  160    0    4    0    6
     1    9   58   16    1    1]
 [   0    0    1    1    0   10  259    7    0   54    0    1    0    3
     3    0    6    8    0    0]
 [   3    1    1    6    2    3    8  781    2   17    5    5    2   24
    20    2   13    1    3    2]
 [   0   16    0    1    2   29    3    1   93   14    0    0    0    1
     2    1   34    2    0    1]
 [   0    4   11    1    0  117   21    7    5 2073    

## One-hot encoding

In [31]:
from collections import defaultdict

In [40]:
def prepare_bag_of_ingredients(cuisines):
    one_hot_encoded = defaultdict(list)

    for i in range(cuisines.shape[0]):
        ingredient = cuisines.iloc[i, 0]

        for u_ingr in top_most_used_ingredients[:750]:
            if u_ingr in ingredient:
                one_hot_encoded[u_ingr].append(1)
            else:
                one_hot_encoded[u_ingr].append(0)
    
    return one_hot_encoded

In [36]:
sss_2 = StratifiedShuffleSplit(target, train_size=35000)

train_index, test_index = next(iter(sss_2))

train_X = cuisines_train.iloc[train_index]
train_target = target[train_index]

test_X = cuisines_train.iloc[test_index]
test_target = target[test_index]

In [37]:
bag_of_ingredients_train = prepare_bag_of_ingredients(train_X)

In [38]:
bag_of_ingredients_df_train = pd.DataFrame(bag_of_ingredients_train)

In [39]:
bag_of_ingredients_df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35000 entries, 0 to 34999
Columns: 750 entries, active to zucchini
dtypes: int64(750)
memory usage: 200.5 MB


In [41]:
bag_of_ingredients_test = prepare_bag_of_ingredients(cuisines_test)

In [42]:
bag_of_ingredients_df_test = pd.DataFrame(bag_of_ingredients_test)

In [43]:
bag_of_ingredients_df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9944 entries, 0 to 9943
Columns: 750 entries, active to zucchini
dtypes: int64(750)
memory usage: 57.0 MB


In [44]:
from scipy import sparse

In [45]:
bag_of_ingredients_sparse= sparse.csr_matrix(bag_of_ingredients_df_train.values)

In [46]:
bag_of_ingredients_sparse_test = sparse.csr_matrix(bag_of_ingredients_df_test.values)

In [47]:
from sklearn.cross_validation import train_test_split

In [48]:
X_train, X_test, y_train, y_test = train_test_split(bag_of_ingredients_sparse, train_target, test_size=0.2)

In [49]:
import xgboost as xgb

In [50]:
xg_train = xgb.DMatrix( X_train, label=y_train )
xg_test = xgb.DMatrix( X_test, label=y_test )

In [54]:
# setup parameters for xgboost
param = {}
# use softmax multi-class classification
param['objective'] = 'multi:softmax'
# scale weight of positive examples
param['eta'] = 0.03
param['max_depth'] = 8
param['silent'] = 1
param['nthread'] = 8
param['num_class'] = 20
param['colsample_bytree'] = 0.7
param['subsample'] = 0.8

In [None]:
watchlist = [ (xg_train,'train'), (xg_test, 'test') ]
num_round = 1000
bst = xgb.train(param, xg_train, num_round, watchlist );

[0]	train-merror:0.356000	test-merror:0.391857
[1]	train-merror:0.305929	test-merror:0.335571
[2]	train-merror:0.291393	test-merror:0.326714
[3]	train-merror:0.278429	test-merror:0.316286
[4]	train-merror:0.276679	test-merror:0.314857
[5]	train-merror:0.274607	test-merror:0.310286
[6]	train-merror:0.272393	test-merror:0.309000
[7]	train-merror:0.269571	test-merror:0.307571
[8]	train-merror:0.267107	test-merror:0.307571
[9]	train-merror:0.263929	test-merror:0.304857
[10]	train-merror:0.260536	test-merror:0.303571
[11]	train-merror:0.258643	test-merror:0.302571
[12]	train-merror:0.257929	test-merror:0.303000
[13]	train-merror:0.256536	test-merror:0.301714
[14]	train-merror:0.255321	test-merror:0.300429
[15]	train-merror:0.254786	test-merror:0.299000
[16]	train-merror:0.253643	test-merror:0.299000
[17]	train-merror:0.253500	test-merror:0.297714
[18]	train-merror:0.252250	test-merror:0.298000
[19]	train-merror:0.251607	test-merror:0.297571
[20]	train-merror:0.250857	test-merror:0.295429
[2

In [253]:
xg_full = xgb.DMatrix( bag_of_ingredients_sparse, label=train_target )

In [254]:
xgb_model = xgb.train(param, xg_full, num_round)

In [262]:
xg_test = xgb.DMatrix( bag_of_ingredients_sparse_test )

In [263]:
xgb_preds = xgb_model.predict(xg_test)

In [266]:
preds = [int(pred) for pred in xgb_preds]

## Predictions

In [168]:
train_X_feat = cvec.fit_transform(train_X)

In [169]:
log.fit(train_X_feat, train_target)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

In [170]:
X_test = cvec.transform(cuisines_test.ingredients)

In [171]:
preds = log.predict(X_test)

## Submission

In [268]:
preds_labels = lbl_encoder.inverse_transform(preds)

In [269]:
test_ids = cuisines_test.index.values
submission_df = pd.DataFrame({'id': test_ids, 'cuisine': preds_labels})
submission_df.to_csv('./submissions/xgb_preds.csv', index=False)