In [2]:
import pandas as pd
import numpy as np
import json
import re

from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC

from sklearn.metrics import accuracy_score

### Data

In [3]:
train = pd.read_json('train.json').set_index('id')

In [4]:
train

Unnamed: 0_level_0,cuisine,ingredients
id,Unnamed: 1_level_1,Unnamed: 2_level_1
10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
22213,indian,"[water, vegetable oil, wheat, salt]"
13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."
...,...,...
29109,irish,"[light brown sugar, granulated sugar, butter, ..."
11462,italian,"[KRAFT Zesty Italian Dressing, purple onion, b..."
2238,irish,"[eggs, citrus fruit, raisins, sourdough starte..."
41882,chinese,"[boneless chicken skinless thigh, minced garli..."


In [5]:
test = pd.read_json('test.json').set_index('id')

In [6]:
test

Unnamed: 0_level_0,ingredients
id,Unnamed: 1_level_1
18009,"[baking powder, eggs, all-purpose flour, raisi..."
28583,"[sugar, egg yolks, corn starch, cream of tarta..."
41580,"[sausage links, fennel bulb, fronds, olive oil..."
29752,"[meat cuts, file powder, smoked sausage, okra,..."
35687,"[ground black pepper, salt, sausage casings, l..."
...,...
30246,"[large egg yolks, fresh lemon juice, sugar, bo..."
36028,"[hot sauce, butter, sweet potatoes, adobo sauc..."
22339,"[black pepper, salt, parmigiano reggiano chees..."
42525,"[cheddar cheese, cayenne, paprika, plum tomato..."


In [7]:
with open('sample_submission.csv') as f:
    sample_submission = f.readlines()

In [8]:
len(sample_submission)

9945

In [9]:
sample_submission[:5]

['id,cuisine\n',
 '35203,italian\n',
 '17600,italian\n',
 '35200,italian\n',
 '17602,italian\n']

### Feature engineering

In [10]:
def clean_ingredients(old_ing):
    ing = old_ing.lower()
    ing = re.sub(r'\(.*oz\.\)', ' ', ing)
    ing = re.sub(r'[^a-zâçèéíîú]', ' ', ing)
    ing = re.sub(r' +', ' ', ing)
    ing = ing.strip()
    return ing

In [11]:
train['ingredients'] = train['ingredients'].apply(lambda x: [clean_ingredients(ing) for ing in x])
test['ingredients'] = test['ingredients'].apply(lambda x: [clean_ingredients(ing) for ing in x])

In [12]:
ing_dict_train = train[['ingredients']].to_dict(orient='records')
ing_dict_test = test[['ingredients']].to_dict(orient='records')

In [13]:
vectorizer = DictVectorizer()

In [14]:
X = vectorizer.fit_transform(ing_dict_train)
X_kaggle = vectorizer.transform(ing_dict_test)

In [15]:
print(X.shape)
print(X_kaggle.shape)

(39774, 6679)
(9944, 6679)


In [16]:
y = train['cuisine']
y

id
10259          greek
25693    southern_us
20130       filipino
22213         indian
13162         indian
            ...     
29109          irish
11462        italian
2238           irish
41882        chinese
2362         mexican
Name: cuisine, Length: 39774, dtype: object

### LinearSVC

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=8000, random_state=10)

In [None]:
model = LinearSVC(random_state=10)

In [18]:
%%time
model.fit(X_train, y_train)

Wall time: 4.6 s


LinearSVC()

In [19]:
pred_train = model.predict(X_train)
pred_test = model.predict(X_test)

In [20]:
print(accuracy_score(y_train, pred_train))
print(accuracy_score(y_test, pred_test))

0.9320513627494178
0.771125


In [21]:
cross_val_score(model, X, y, cv=5, scoring='accuracy').mean()

0.7674109852582582

Cross-validation score: 0.767

### SVC

Cross-val score kernel='poly': 0.604<br>
Cross-val score kernel='rbf': 0.757<br>
Cross-val score kernel='linear': 0.756<br>
Cross-val score kernel='sigmoid': 0.744

In [44]:
model = SVC(random_state=10)

In [45]:
%%time
model.fit(X_train, y_train)

Wall time: 1min 30s


SVC(random_state=10)

In [46]:
pred_train = model.predict(X_train)
pred_test = model.predict(X_test)

In [47]:
print(accuracy_score(y_train, pred_train))
print(accuracy_score(y_test, pred_test))

0.9012400075533455
0.76175


In [48]:
cross_val_score(model, X, y, cv=5, scoring='accuracy').mean()

0.7572284444480969

### Hyperparam tuning for LinearSVC

In [50]:
def get_best_model(model, X_tr, y_tr, param_grid):
    
    grid = GridSearchCV(model, param_grid, cv=3, scoring='accuracy')
    grid.fit(X_tr, y_tr)
    
    return grid.best_estimator_, grid.best_score_, grid.best_params_, pd.DataFrame(grid.cv_results_)

In [61]:
model = LinearSVC(random_state=10, max_iter=10000)

In [62]:
params = {'penalty': ['l1', 'l2'],
          'class_weight': [None, 'balanced'],
          'C': [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e+1, 1e+2, 1e+3]}

In [63]:
model_data = get_best_model(model, X, y, params)

66 fits failed out of a total of 132.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
66 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\_classes.py", line 257, in fit
    self.coef_, self.intercept_, self.n_iter_ = _fit_liblinear(
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\_base.py", line 1185, in _fit_liblinear
    solver_type = _get_liblinear_solver_type(multi_class, penalty, loss, dual)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\_base.py", l

In [64]:
for item in model_data[:3]:
    print(item)

LinearSVC(C=0.1, max_iter=10000, random_state=10)
0.7802584603006989
{'C': 0.1, 'class_weight': None, 'penalty': 'l2'}


Best LinearSVC model:<br>
LinearSVC(C=0.1, max_iter=10000, random_state=10)<br>
0.7802584603006989<br>
{'C': 0.1, 'class_weight': None, 'penalty': 'l2'}

### LinearSVC with best params

In [68]:
model = LinearSVC(C=0.1, max_iter=10000, random_state=10)

In [69]:
model.fit(X_train, y_train)

LinearSVC(C=0.1, max_iter=10000, random_state=10)

In [70]:
pred_train = model.predict(X_train)
pred_test = model.predict(X_test)

In [71]:
print(accuracy_score(y_train, pred_train))
print(accuracy_score(y_test, pred_test))

0.8805312519670171
0.786375


In [72]:
cross_val_score(model, X, y, cv=5, scoring='accuracy').mean()

0.7839041901366548

Cross-validation score: 0.784

### Kaggle submission

In [73]:
model = LinearSVC(C=0.1, max_iter=10000, random_state=10)

In [74]:
model.fit(X, y)

LinearSVC(C=0.1, max_iter=10000, random_state=10)

In [76]:
pred_kaggle = model.predict(X_kaggle)
pred_kaggle

array(['british', 'southern_us', 'italian', ..., 'italian', 'southern_us',
       'mexican'], dtype=object)

In [77]:
submission = pd.DataFrame(pred_kaggle, index=test.index, columns=['cuisine'])
submission

Unnamed: 0_level_0,cuisine
id,Unnamed: 1_level_1
18009,british
28583,southern_us
41580,italian
29752,cajun_creole
35687,italian
...,...
30246,french
36028,southern_us
22339,italian
42525,southern_us


In [78]:
submission.to_csv('submission4.csv')

Kaggle score: 0.78841