In [2]:
import pandas as pd
import numpy as np
import re
import spacy
import optuna

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner', 'tagger'])

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score

from catboost import CatBoostClassifier

### Data

In [3]:
# Load the labelled training recipes from train.json and index the frame by the `id` column.
train = pd.read_json('train.json').set_index('id')

In [4]:
train

Unnamed: 0_level_0,cuisine,ingredients
id,Unnamed: 1_level_1,Unnamed: 2_level_1
10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
22213,indian,"[water, vegetable oil, wheat, salt]"
13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."
...,...,...
29109,irish,"[light brown sugar, granulated sugar, butter, ..."
11462,italian,"[KRAFT Zesty Italian Dressing, purple onion, b..."
2238,irish,"[eggs, citrus fruit, raisins, sourdough starte..."
41882,chinese,"[boneless chicken skinless thigh, minced garli..."


In [5]:
# Load the unlabelled Kaggle test recipes from test.json and index the frame by the `id` column.
test = pd.read_json('test.json').set_index('id')

In [6]:
test

Unnamed: 0_level_0,ingredients
id,Unnamed: 1_level_1
18009,"[baking powder, eggs, all-purpose flour, raisi..."
28583,"[sugar, egg yolks, corn starch, cream of tarta..."
41580,"[sausage links, fennel bulb, fronds, olive oil..."
29752,"[meat cuts, file powder, smoked sausage, okra,..."
35687,"[ground black pepper, salt, sausage casings, l..."
...,...
30246,"[large egg yolks, fresh lemon juice, sugar, bo..."
36028,"[hot sauce, butter, sweet potatoes, adobo sauc..."
22339,"[black pepper, salt, parmigiano reggiano chees..."
42525,"[cheddar cheese, cayenne, paprika, plum tomato..."


In [7]:
# Keep the sample submission as raw text lines (header line included)
# so the expected file layout can be inspected directly.
with open('sample_submission.csv') as fh:
    sample_submission = list(fh)

In [8]:
len(sample_submission)

9945

In [9]:
sample_submission[:5]

['id,cuisine\n',
 '35203,italian\n',
 '17600,italian\n',
 '35200,italian\n',
 '17602,italian\n']

### Feature engineering

In [10]:
def clean_ingredients(old_ing):
    """Normalise one raw ingredient string for bag-of-words vectorisation.

    Steps, in order:
      1. lowercase the text;
      2. drop a parenthesised package size such as ``(10 oz.)``;
      3. replace every character outside ``a-z`` and the accented letters
         ``âçèéíîú`` with a space (digits, punctuation, hyphens, ...);
      4. collapse runs of spaces and strip both ends.

    Parameters
    ----------
    old_ing : str
        Raw ingredient name.

    Returns
    -------
    str
        The cleaned ingredient; empty if no allowed letter remains.
    """
    ing = old_ing.lower()
    # `[^()]*` keeps the match inside a single pair of parentheses. A greedy
    # `.*` would run from the first "(" to the last "oz.)" and swallow real
    # words sitting between two parenthesised groups.
    ing = re.sub(r'\([^()]*oz\.\)', ' ', ing)
    ing = re.sub(r'[^a-zâçèéíîú]', ' ', ing)
    ing = re.sub(r' +', ' ', ing)
    return ing.strip()

In [11]:
# Clean every ingredient string of every recipe, in both frames.
train['ingredients'] = train['ingredients'].apply(
    lambda recipe: list(map(clean_ingredients, recipe))
)
test['ingredients'] = test['ingredients'].apply(
    lambda recipe: list(map(clean_ingredients, recipe))
)

In [12]:
def lemmatize(old_ing):
    """Return `old_ing` with each token replaced by its spaCy lemma.

    The string is run through the notebook-level `nlp` pipeline and the
    resulting lemmas are joined back together with single spaces.
    """
    lemmas = (tok.lemma_ for tok in nlp(old_ing))
    return ' '.join(lemmas)

In [13]:
# Lemmatize every cleaned ingredient; `norm` holds one list of lemmatized ingredients per recipe.
train['norm'] = train['ingredients'].apply(
    lambda recipe: list(map(lemmatize, recipe))
)
test['norm'] = test['ingredients'].apply(
    lambda recipe: list(map(lemmatize, recipe))
)

In [14]:
# Flatten each recipe's list of lemmatized ingredients into one space-separated string.
train['words'] = train['norm'].apply(' '.join)
test['words'] = test['norm'].apply(' '.join)

In [15]:
train

Unnamed: 0_level_0,cuisine,ingredients,norm,words
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10259,greek,"[romaine lettuce, black olives, grape tomatoes...","[romaine lettuce, black olive, grape tomato, g...",romaine lettuce black olive grape tomato garli...
25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g...","[plain flour, grind pepper, salt, tomato, grin...",plain flour grind pepper salt tomato grind bla...
20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g...","[egg, pepper, salt, mayonaise, cook oil, green...",egg pepper salt mayonaise cook oil green chili...
22213,indian,"[water, vegetable oil, wheat, salt]","[water, vegetable oil, wheat, salt]",water vegetable oil wheat salt
13162,indian,"[black pepper, shallots, cornflour, cayenne pe...","[black pepper, shallot, cornflour, cayenne pep...",black pepper shallot cornflour cayenne pepper ...
...,...,...,...,...
29109,irish,"[light brown sugar, granulated sugar, butter, ...","[light brown sugar, granulate sugar, butter, w...",light brown sugar granulate sugar butter warm ...
11462,italian,"[kraft zesty italian dressing, purple onion, b...","[kraft zesty italian dress, purple onion, broc...",kraft zesty italian dress purple onion broccol...
2238,irish,"[eggs, citrus fruit, raisins, sourdough starte...","[egg, citrus fruit, raisin, sourdough starter,...",egg citrus fruit raisin sourdough starter flou...
41882,chinese,"[boneless chicken skinless thigh, minced garli...","[boneless chicken skinless thigh, mince garlic...",boneless chicken skinless thigh mince garlic s...


In [16]:
test

Unnamed: 0_level_0,ingredients,norm,words
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
18009,"[baking powder, eggs, all purpose flour, raisi...","[bake powder, egg, all purpose flour, raisin, ...",bake powder egg all purpose flour raisin milk ...
28583,"[sugar, egg yolks, corn starch, cream of tarta...","[sugar, egg yolk, corn starch, cream of tartar...",sugar egg yolk corn starch cream of tartar ban...
41580,"[sausage links, fennel bulb, fronds, olive oil...","[sausage link, fennel bulb, frond, olive oil, ...",sausage link fennel bulb frond olive oil cuban...
29752,"[meat cuts, file powder, smoked sausage, okra,...","[meat cut, file powder, smoke sausage, okra, s...",meat cut file powder smoke sausage okra shrimp...
35687,"[ground black pepper, salt, sausage casings, l...","[grind black pepper, salt, sausage casing, lee...",grind black pepper salt sausage casing leek pa...
...,...,...,...
30246,"[large egg yolks, fresh lemon juice, sugar, bo...","[large egg yolk, fresh lemon juice, sugar, bou...",large egg yolk fresh lemon juice sugar bourbon...
36028,"[hot sauce, butter, sweet potatoes, adobo sauc...","[hot sauce, butter, sweet potato, adobo sauce,...",hot sauce butter sweet potato adobo sauce salt
22339,"[black pepper, salt, parmigiano reggiano chees...","[black pepper, salt, parmigiano reggiano chees...",black pepper salt parmigiano reggiano cheese r...
42525,"[cheddar cheese, cayenne, paprika, plum tomato...","[cheddar cheese, cayenne, paprika, plum tomato...",cheddar cheese cayenne paprika plum tomato gre...


In [17]:
# Model inputs: one space-joined string of lemmatized ingredient words per recipe.
# X comes from the labelled training frame, X_kaggle from the Kaggle test frame.
X = train['words']
X_kaggle = test['words']

In [18]:
# Target labels: the cuisine of each training recipe (shares the index of X).
y = train['cuisine']
y

id
10259          greek
25693    southern_us
20130       filipino
22213         indian
13162         indian
            ...     
29109          irish
11462        italian
2238           irish
41882        chinese
2362         mexican
Name: cuisine, Length: 39774, dtype: object

We tried different combinations of text units (single words vs. bigrams) and vectorizers, each followed by an SVC with default parameters:

|Type of data|Accuracy Train|Accuracy Test|Cross-val score|
|:-|:-:|:-:|:-:|
|CountVectorizer for words|0.87565|0.78413|
|Binary CountVectorizer for words|0.88557|0.78638|
|CountVectorizer for bigrams|0.89668|0.766|
|Binary CountVectorizer for bigrams|0.89765|0.7675|
|TfidfVectorizer for words|0.92610|0.8035|0.80027|
|<span style="color:blue">Binary TfidfVectorizer for words</span>|<span style="color:blue">0.92837</span>|<span style="color:blue">0.804</span>|<span style="color:blue">0.80067</span>|
|Binary TfidfVectorizer for words, l1 norm|0.92588|0.79625|
|TfidfVectorizer for bigrams|0.94341|0.786|
|Binary TfidfVectorizer for bigrams|0.94392|0.78813|

### Hyperparam tuning for SVC (optuna)

We experimented a lot with hyperparameter tuning; here are some examples.

In [30]:
def objective(trial):
    """Optuna objective: mean 3-fold CV accuracy of a binary TF-IDF + SVC pipeline.

    Samples ``C`` (log scale, 1 to 30), ``gamma`` (log scale, 0.5 to 3) and
    ``class_weight`` (``None`` or ``'balanced'``) from ``trial``, evaluates
    the resulting pipeline on the notebook-level ``X`` / ``y`` and returns
    the mean accuracy, which is also printed.
    """
    # Suggestions are requested in a fixed order: C, gamma, class_weight.
    svc_kwargs = {}
    svc_kwargs['C'] = trial.suggest_float('C', 1, 30, log=True)
    svc_kwargs['gamma'] = trial.suggest_float('gamma', 0.5, 3, log=True)
    svc_kwargs['class_weight'] = trial.suggest_categorical('class_weight', [None, 'balanced'])

    vectorizer = TfidfVectorizer(binary=True)
    classifier = SVC(**svc_kwargs, max_iter=20000, random_state=10)
    candidate = Pipeline([('tfidf', vectorizer), ('svc', classifier)])

    fold_scores = cross_val_score(candidate, X, y, cv=3, scoring='accuracy')
    accuracy = fold_scores.mean()
    print(f'Accuracy: {accuracy:.5f}')

    return accuracy

In [31]:
%%time
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)

[32m[I 2021-12-20 08:37:47,773][0m A new study created in memory with name: no-name-3c2ab954-9f79-4c94-9dfe-8416be5fa786[0m
[32m[I 2021-12-20 08:44:26,149][0m Trial 0 finished with value: 0.8041434102680144 and parameters: {'C': 4.236494584770727, 'gamma': 1.0104185713109552, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.8041434102680144.[0m


Accuracy: 0.80414


[32m[I 2021-12-20 08:52:28,133][0m Trial 1 finished with value: 0.8019309096394629 and parameters: {'C': 15.459540206779764, 'gamma': 1.3916735469338912, 'class_weight': None}. Best is trial 0 with value: 0.8041434102680144.[0m


Accuracy: 0.80193


[32m[I 2021-12-20 09:03:30,211][0m Trial 2 finished with value: 0.7916980942324132 and parameters: {'C': 25.276294651125138, 'gamma': 2.048095207825232, 'class_weight': None}. Best is trial 0 with value: 0.8041434102680144.[0m


Accuracy: 0.79170


[32m[I 2021-12-20 09:09:28,212][0m Trial 3 finished with value: 0.800346960325841 and parameters: {'C': 14.517181778613203, 'gamma': 0.7542954145700342, 'class_weight': None}. Best is trial 0 with value: 0.8041434102680144.[0m


Accuracy: 0.80035


[32m[I 2021-12-20 09:16:22,435][0m Trial 4 finished with value: 0.8029868758485442 and parameters: {'C': 9.880343630072009, 'gamma': 1.057696487285074, 'class_weight': None}. Best is trial 0 with value: 0.8041434102680144.[0m


Accuracy: 0.80299


[32m[I 2021-12-20 09:31:32,393][0m Trial 5 finished with value: 0.7686176899482073 and parameters: {'C': 20.571098723929076, 'gamma': 2.641094747718375, 'class_weight': None}. Best is trial 0 with value: 0.8041434102680144.[0m


Accuracy: 0.76862


[32m[I 2021-12-20 09:37:25,214][0m Trial 6 finished with value: 0.7968522150148338 and parameters: {'C': 3.306433195254277, 'gamma': 0.7013399596995858, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.8041434102680144.[0m


Accuracy: 0.79685


[32m[I 2021-12-20 09:42:59,238][0m Trial 7 finished with value: 0.7988635792226079 and parameters: {'C': 8.969877113273366, 'gamma': 0.6166977429419787, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.8041434102680144.[0m


Accuracy: 0.79886


[32m[I 2021-12-20 09:59:12,185][0m Trial 8 finished with value: 0.7574797606476592 and parameters: {'C': 4.736594276217824, 'gamma': 2.835117880299622, 'class_weight': None}. Best is trial 0 with value: 0.8041434102680144.[0m


Accuracy: 0.75748


[32m[I 2021-12-20 10:06:33,945][0m Trial 9 finished with value: 0.8028863076381555 and parameters: {'C': 3.9221083752629107, 'gamma': 1.1996703375674334, 'class_weight': None}. Best is trial 0 with value: 0.8041434102680144.[0m


Accuracy: 0.80289
Wall time: 1h 28min 46s


In [32]:
# Summarise the study: number of trials run, then the best trial's value and parameters.
print(f'Number of finished trials: {len(study.trials)}')

print('Best trial:')
trial = study.best_trial

print(f'  Value: {trial.value}')

print('   Params:')
# One "name: value" line per tuned hyperparameter of the best trial.
for key, value in trial.params.items():
    print(f'    {key}: {value}')

Number of finished trials: 10
Best trial:
  Value: 0.8041434102680144
   Params:
    C: 4.236494584770727
    gamma: 1.0104185713109552
    class_weight: balanced


### Hyperparam tuning for SVC (GridSearchCV)

Here are some further examples of hyperparameter tuning, this time with an exhaustive grid search.

In [19]:
def get_best_model(model, X_tr, y_tr, param_grid):
    """Run a 3-fold, accuracy-scored grid search and report its outcome.

    Parameters
    ----------
    model : estimator or pipeline handed to ``GridSearchCV``.
    X_tr, y_tr : features and labels the search is fitted on.
    param_grid : parameter grid forwarded to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(best_estimator_, best_score_, best_params_, cv_results)`` where
        ``cv_results`` is the search's ``cv_results_`` wrapped in a DataFrame.
    """
    search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy')
    search.fit(X_tr, y_tr)

    return (
        search.best_estimator_,
        search.best_score_,
        search.best_params_,
        pd.DataFrame(search.cv_results_),
    )

In [20]:
# Base pipeline for the grid search: binary TF-IDF features feeding an SVC.
pipe = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(binary=True)),
    ('svc', SVC(max_iter=20000, random_state=10)),
])

In [21]:
# Grid over the SVC step of the pipeline (keys use the `svc__` step prefix).
# 4.236 is the best C reported by the Optuna study, rounded.
params = {
    'svc__C': [3, 4.236],
    'svc__gamma': ['auto', 'scale'],
}

In [22]:
# Grid-search the pipeline on the full training data.
# model_data is (best estimator, best CV score, best params, cv_results DataFrame).
model_data = get_best_model(pipe, X, y, params)

In [23]:
# Show the best estimator, its CV score and its parameters; the cv_results table (4th item) is skipped.
for item in model_data[:3]:
    print(item)

Pipeline(steps=[('tfidf', TfidfVectorizer(binary=True)),
                ('svc', SVC(C=3, max_iter=20000, random_state=10))])
0.8053250867400813
{'svc__C': 3, 'svc__gamma': 'scale'}


### SVC with best params

In [35]:
# Hold out 8000 recipes as a local test set; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=8000, random_state=10)

In [36]:
# Pipeline with the tuned setting: C=3, as selected by the grid search above.
pipe = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(binary=True)),
    ('svc', SVC(C=3, max_iter=20000, random_state=10)),
])

In [37]:
# Fit on the training split only; the held-out split stays unseen.
pipe.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer(binary=True)),
                ('svc', SVC(C=3, max_iter=20000, random_state=10))])

In [38]:
# Predict on both splits so training and held-out accuracy can be compared.
pred_train = pipe.predict(X_train)
pred_test = pipe.predict(X_test)

In [39]:
# Accuracy on the training split first, then on the held-out split.
print(accuracy_score(y_train, pred_train))
print(accuracy_score(y_test, pred_test))

0.9946497136023164
0.81125


In [40]:
# Mean 5-fold cross-validated accuracy of the same pipeline configuration on all labelled data.
cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()

0.8103535713760787

Cross-validation score: 0.81035

### Kaggle submission

In [25]:
# Fresh pipeline with the tuned setting (C=3) for the final Kaggle model.
pipe = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(binary=True)),
    ('svc', SVC(C=3, max_iter=20000, random_state=10)),
])

In [26]:
# Fit on all labelled recipes (no hold-out) before predicting the Kaggle test set.
pipe.fit(X, y)

Pipeline(steps=[('tfidf', TfidfVectorizer(binary=True)),
                ('svc', SVC(C=3, max_iter=20000, random_state=10))])

In [27]:
# Predict a cuisine label for every recipe in the Kaggle test set.
pred_kaggle = pipe.predict(X_kaggle)
pred_kaggle

array(['british', 'southern_us', 'italian', ..., 'italian', 'southern_us',
       'mexican'], dtype=object)

In [28]:
# Pair each prediction with its test recipe id (the index of `test`) in a single `cuisine` column.
submission = pd.DataFrame(pred_kaggle, index=test.index, columns=['cuisine'])
submission

Unnamed: 0_level_0,cuisine
id,Unnamed: 1_level_1
18009,british
28583,southern_us
41580,italian
29752,cajun_creole
35687,italian
...,...
30246,french
36028,southern_us
22339,italian
42525,southern_us


In [29]:
# Write the submission file; the `id` index is written as the first column,
# giving the same `id,cuisine` header as the sample submission.
submission.to_csv('submission16.csv')

Kaggle score: 0.81114