In [11]:
import pandas as pd
import numpy as np
import json
import re
import spacy
from nltk import ngrams, everygrams, bigrams

# Load spaCy's small English pipeline with the parser, NER and tagger components
# disabled; lemmatize() further down reads only token.lemma_ from the result.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner', 'tagger'])

from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC

from sklearn.metrics import accuracy_score

### Data

In [12]:
# Read the labelled recipes from train.json and index the frame by the 'id' column.
train = pd.read_json('train.json').set_index('id')

In [13]:
train

Unnamed: 0_level_0,cuisine,ingredients
id,Unnamed: 1_level_1,Unnamed: 2_level_1
10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
22213,indian,"[water, vegetable oil, wheat, salt]"
13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."
...,...,...
29109,irish,"[light brown sugar, granulated sugar, butter, ..."
11462,italian,"[KRAFT Zesty Italian Dressing, purple onion, b..."
2238,irish,"[eggs, citrus fruit, raisins, sourdough starte..."
41882,chinese,"[boneless chicken skinless thigh, minced garli..."


In [14]:
# Read the recipes to predict from test.json, indexed by 'id' like the training frame.
test = pd.read_json('test.json').set_index('id')

In [15]:
test

Unnamed: 0_level_0,ingredients
id,Unnamed: 1_level_1
18009,"[baking powder, eggs, all-purpose flour, raisi..."
28583,"[sugar, egg yolks, corn starch, cream of tarta..."
41580,"[sausage links, fennel bulb, fronds, olive oil..."
29752,"[meat cuts, file powder, smoked sausage, okra,..."
35687,"[ground black pepper, salt, sausage casings, l..."
...,...
30246,"[large egg yolks, fresh lemon juice, sugar, bo..."
36028,"[hot sauce, butter, sweet potatoes, adobo sauc..."
22339,"[black pepper, salt, parmigiano reggiano chees..."
42525,"[cheddar cheese, cayenne, paprika, plum tomato..."


In [16]:
# Keep the sample submission as a list of raw text lines (header line included)
# so its length and layout can be inspected in the next cells.
with open('sample_submission.csv') as f:
    sample_submission = list(f)

In [17]:
len(sample_submission)

9945

In [18]:
sample_submission[:5]

['id,cuisine\n',
 '35203,italian\n',
 '17600,italian\n',
 '35200,italian\n',
 '17602,italian\n']

### Feature engineering

In [19]:
def clean_ingredients(old_ing):
    """Normalize one raw ingredient string.

    Steps, in order: lowercase the text; drop parenthesised size notes that
    end in "oz." such as "(14.5 oz.)"; replace every character that is not
    a-z or one of â ç è é í î ú with a space; collapse runs of spaces; strip
    leading and trailing spaces.

    Args:
        old_ing: Raw ingredient text.

    Returns:
        The cleaned ingredient: lowercase letters separated by single spaces.
        May be an empty string if nothing survives the cleaning.
    """
    ing = old_ing.lower()
    # [^)]* keeps the match inside a single pair of parentheses. A greedy .*
    # would run from the first "(" to the last "oz.)" and could swallow real
    # ingredient words sitting between two parenthesised groups.
    ing = re.sub(r'\([^)]*oz\.\)', ' ', ing)
    ing = re.sub(r'[^a-zâçèéíîú]', ' ', ing)
    ing = re.sub(r' +', ' ', ing)
    return ing.strip()

In [20]:
# Clean every ingredient string of every recipe; the 'ingredients' column is
# overwritten with the cleaned lists in both frames.
train['ingredients'] = train['ingredients'].apply(lambda recipe: list(map(clean_ingredients, recipe)))
test['ingredients'] = test['ingredients'].apply(lambda recipe: list(map(clean_ingredients, recipe)))

In [21]:
def lemmatize(old_ing):
    """Replace every token of an ingredient string with its lemma.

    The text is run through the module-level spaCy pipeline ``nlp`` and the
    ``lemma_`` of each resulting token is collected.

    Args:
        old_ing: Ingredient text (already cleaned by the caller in this notebook).

    Returns:
        The token lemmas joined by single spaces.
    """
    lemmas = (token.lemma_ for token in nlp(old_ing))
    return ' '.join(lemmas)

In [22]:
# Lemmatize each cleaned ingredient; 'norm' holds one list of lemmatized
# ingredient strings per recipe.
train['norm'] = train['ingredients'].apply(lambda recipe: list(map(lemmatize, recipe)))
test['norm'] = test['ingredients'].apply(lambda recipe: list(map(lemmatize, recipe)))

In [23]:
# Flatten each recipe's lemmatized ingredients into one space-separated string
# (the unigram text fed to the vectorizers).
train['words'] = train['norm'].apply(' '.join)
test['words'] = test['norm'].apply(' '.join)

In [24]:
def get_bigrams(l: list):
    """Turn a recipe's ingredient strings into one space-separated token string.

    An ingredient that contains no space is kept as a single token. Any other
    ingredient is split on whitespace and replaced by its consecutive word
    pairs, each pair joined with an underscore
    (e.g. "light brown sugar" -> "light_brown brown_sugar").

    Args:
        l: Ingredient strings for one recipe.

    Returns:
        All tokens joined by single spaces, in input order.
    """
    tokens = []
    for ingredient in l:
        if ' ' in ingredient:
            # Multi-word ingredient: emit one token per adjacent word pair.
            tokens.extend('_'.join(pair) for pair in bigrams(ingredient.split()))
        else:
            tokens.append(ingredient)
    return ' '.join(tokens)

In [25]:
# Build the bigram token string for each recipe from its lemmatized ingredients.
train['bigrams'] = train['norm'].apply(get_bigrams)
test['bigrams'] = test['norm'].apply(get_bigrams)

In [26]:
train

Unnamed: 0_level_0,cuisine,ingredients,norm,words,bigrams
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10259,greek,"[romaine lettuce, black olives, grape tomatoes...","[romaine lettuce, black olive, grape tomato, g...",romaine lettuce black olive grape tomato garli...,romaine_lettuce black_olive grape_tomato garli...
25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g...","[plain flour, grind pepper, salt, tomato, grin...",plain flour grind pepper salt tomato grind bla...,plain_flour grind_pepper salt tomato grind_bla...
20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g...","[egg, pepper, salt, mayonaise, cook oil, green...",egg pepper salt mayonaise cook oil green chili...,egg pepper salt mayonaise cook_oil green_chili...
22213,indian,"[water, vegetable oil, wheat, salt]","[water, vegetable oil, wheat, salt]",water vegetable oil wheat salt,water vegetable_oil wheat salt
13162,indian,"[black pepper, shallots, cornflour, cayenne pe...","[black pepper, shallot, cornflour, cayenne pep...",black pepper shallot cornflour cayenne pepper ...,black_pepper shallot cornflour cayenne_pepper ...
...,...,...,...,...,...
29109,irish,"[light brown sugar, granulated sugar, butter, ...","[light brown sugar, granulate sugar, butter, w...",light brown sugar granulate sugar butter warm ...,light_brown brown_sugar granulate_sugar butter...
11462,italian,"[kraft zesty italian dressing, purple onion, b...","[kraft zesty italian dress, purple onion, broc...",kraft zesty italian dress purple onion broccol...,kraft_zesty zesty_italian italian_dress purple...
2238,irish,"[eggs, citrus fruit, raisins, sourdough starte...","[egg, citrus fruit, raisin, sourdough starter,...",egg citrus fruit raisin sourdough starter flou...,egg citrus_fruit raisin sourdough_starter flou...
41882,chinese,"[boneless chicken skinless thigh, minced garli...","[boneless chicken skinless thigh, mince garlic...",boneless chicken skinless thigh mince garlic s...,boneless_chicken chicken_skinless skinless_thi...


In [27]:
# Candidate text vectorizers:
#   *_counts  -> raw term counts
#   *_bin     -> 0/1 term presence (binary=True)
#   *_bigram  -> intended for the underscore-joined tokens in the 'bigrams' column
#   tfidf     -> TF-IDF where binary=True makes only the tf part 0/1; idf weighting still applies
# Only `tfidf` is used by the cells that follow in this notebook.
# NOTE(review): \w already matches "_", so this custom token_pattern appears to accept exactly
# the same tokens as scikit-learn's default r"(?u)\b\w\w+\b" (runs of 2+ word characters);
# confirm whether the override is needed at all.
c_vectorizer_counts = CountVectorizer()
c_vectorizer_bin = CountVectorizer(binary=True)
c_vectorizer_counts_bigram = CountVectorizer(token_pattern=r'(?u)\b\w+_?\w+\b')
c_vectorizer_bin_bigram = CountVectorizer(token_pattern=r'(?u)\b\w+_?\w+\b', binary=True)
tfidf = TfidfVectorizer(binary=True)

In [28]:
# Learn the vocabulary and IDF weights from the training text, then reuse the fitted
# vectorizer on the Kaggle test text so both matrices share the same columns.
# NOTE(review): the vectorizer is fitted on every training row before the hold-out split and
# cross-validation further down, so IDF weights also reflect the validation rows. Wrapping
# vectorizer + classifier in a Pipeline would avoid that — confirm whether it matters here.
X = tfidf.fit_transform(train['words'])
X_kaggle = tfidf.transform(test['words'])

In [29]:
print(X.shape)
print(X_kaggle.shape)

(39774, 2692)
(9944, 2692)


In [30]:
# Target labels: the 'cuisine' column of the training frame, in the same row order as X.
y = train['cuisine']
y

id
10259          greek
25693    southern_us
20130       filipino
22213         indian
13162         indian
            ...     
29109          irish
11462        italian
2238           irish
41882        chinese
2362         mexican
Name: cuisine, Length: 39774, dtype: object

### SVC

In [31]:
# Hold out 8000 rows for validation (an integer test_size is an absolute row count);
# random_state makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=8000, random_state=10)

In [33]:
# SVC with library-default kernel and C; only the solver iteration cap is set explicitly.
# NOTE(review): per the scikit-learn docs SVC uses random_state only for probability
# estimates, which are not enabled here — confirm whether it has any effect.
model = SVC(random_state=10, max_iter=5000)

In [34]:
%%time
# Fit on the training split only; the %%time cell magic reports how long the cell took.
model.fit(X_train, y_train)

Wall time: 1min 54s




SVC(max_iter=5000, random_state=10)

In [35]:
# Predict labels for both splits so training and hold-out accuracy can be compared.
pred_train = model.predict(X_train)
pred_test = model.predict(X_test)

In [36]:
# First line: accuracy on the training split; second line: accuracy on the hold-out split.
print(accuracy_score(y_train, pred_train))
print(accuracy_score(y_test, pred_test))

0.9283691068168943
0.803875


In [37]:
# Mean accuracy over 5 cross-validation folds of the full training matrix.
cross_val_score(model, X, y, cv=5, scoring='accuracy').mean()



0.8006487365203473

#### Cross-validation score

SVC:
0.8

LinearSVC:

Counts for words: 0.771<br>
0/1 for words: 0.786<br>
Counts for bigrams: 0.786<br>
0/1 for bigrams: 0.786<br>

### Hyperparameter tuning for LinearSVC

In [None]:
def get_best_model(model, X_tr, y_tr, param_grid, cv=3, scoring='accuracy', n_jobs=None):
    """Run an exhaustive cross-validated grid search and report the winner.

    Args:
        model: Unfitted scikit-learn estimator to tune.
        X_tr: Training feature matrix.
        y_tr: Training labels.
        param_grid: Parameter names mapped to candidate values, passed
            straight to ``GridSearchCV``.
        cv: Cross-validation setting forwarded to ``GridSearchCV``
            (default 3, the value that used to be hard-coded).
        scoring: Scoring setting forwarded to ``GridSearchCV``
            (default ``'accuracy'``).
        n_jobs: Parallelism setting forwarded to ``GridSearchCV``
            (default ``None``, the library default).

    Returns:
        A 4-tuple ``(best_estimator, best_score, best_params, results)``
        where ``results`` is ``cv_results_`` wrapped in a DataFrame.
    """
    grid = GridSearchCV(model, param_grid, cv=cv, scoring=scoring, n_jobs=n_jobs)
    grid.fit(X_tr, y_tr)

    return grid.best_estimator_, grid.best_score_, grid.best_params_, pd.DataFrame(grid.cv_results_)

In [None]:
# Base estimator for the grid search. dual=False is required because the grid
# tries penalty='l1': with the default squared-hinge loss LinearSVC rejects an
# l1 penalty in the dual formulation, so those candidates would fail to fit.
# The primal formulation supports both the l1 and l2 penalties.
model = LinearSVC(random_state=10, max_iter=10000, dual=False)

In [None]:
# Search grid: 2 penalties x 2 class weightings x 11 values of C = 44 candidates.
# C spans 1e-7 .. 1e+3 in powers of ten.
# NOTE(review): with LinearSVC's default squared-hinge loss, penalty='l1' is only
# supported when the estimator is built with dual=False — confirm the base estimator allows it.
params = {'penalty': ['l1', 'l2'],
          'class_weight': [None, 'balanced'],
          'C': [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e+1, 1e+2, 1e+3]}

In [None]:
# Tune on the full training matrix; model_data is the 4-tuple returned by get_best_model:
# (best estimator, best CV score, best params, CV results frame).
model_data = get_best_model(model, X, y, params)

In [None]:
# Print the best estimator, its CV score and its parameters; the slice skips the
# fourth element (the full CV results frame).
for item in model_data[:3]:
    print(item)

### SVC with best params

In [None]:
# NOTE(review): no tuned hyperparameters are passed here; apart from max_iter this is a
# default SVC, and the grid search above is over LinearSVC parameters. Confirm which
# estimator and settings this section is meant to evaluate.
model = SVC(max_iter=10000, random_state=10)

In [None]:
# Fit on the training split produced by train_test_split.
model.fit(X_train, y_train)

In [None]:
# Predict labels for the training and hold-out splits with the refitted model.
pred_train = model.predict(X_train)
pred_test = model.predict(X_test)

In [None]:
# First line: accuracy on the training split; second line: accuracy on the hold-out split.
print(accuracy_score(y_train, pred_train))
print(accuracy_score(y_test, pred_test))

In [None]:
# Mean accuracy over 5 cross-validation folds of the full training matrix.
cross_val_score(model, X, y, cv=5, scoring='accuracy').mean()

Cross-validation score: ...

### Kaggle submission

In [38]:
# Fresh SVC for the final submission, configured with max_iter=5000 and random_state=10.
model = SVC(max_iter=5000, random_state=10)

In [39]:
# Train on the full training matrix (no rows held out) before predicting the Kaggle test set.
model.fit(X, y)



SVC(max_iter=5000, random_state=10)

In [40]:
# Predict one cuisine label per row of the vectorized Kaggle test set.
pred_kaggle = model.predict(X_kaggle)
pred_kaggle

array(['british', 'southern_us', 'italian', ..., 'italian', 'southern_us',
       'mexican'], dtype=object)

In [41]:
# Single 'cuisine' column indexed by the test frame's ids, so predictions stay
# aligned with the rows they were made for.
submission = pd.DataFrame(pred_kaggle, index=test.index, columns=['cuisine'])
submission

Unnamed: 0_level_0,cuisine
id,Unnamed: 1_level_1
18009,british
28583,southern_us
41580,italian
29752,cajun_creole
35687,italian
...,...
30246,french
36028,southern_us
22339,italian
42525,southern_us


In [42]:
# The index is written as the first CSV column; it carries the name 'id' from the test
# frame, so the file header is "id,cuisine" like the sample submission.
submission.to_csv('submission7.csv')

Kaggle score: 0.80229