In [23]:
import pandas as pd
import numpy as np
import re
import spacy
import optuna

# Load the small English spaCy pipeline once, with the parser, NER and tagger
# components disabled; `nlp` is used by lemmatize() further down.
# NOTE(review): lemmatize() reads token.lemma_ — depending on the installed spaCy
# version the English lemmatizer may need tagger output; confirm lemmas are still
# produced with 'tagger' disabled.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner', 'tagger'])

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score

import lightgbm as lgb

### Data

In [24]:
# Training recipes, keyed by recipe id.
train = (
    pd.read_json('train.json')
    .set_index('id')
)

In [25]:
# Display the raw training frame (columns: cuisine, ingredients).
train

Unnamed: 0_level_0,cuisine,ingredients
id,Unnamed: 1_level_1,Unnamed: 2_level_1
10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
22213,indian,"[water, vegetable oil, wheat, salt]"
13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."
...,...,...
29109,irish,"[light brown sugar, granulated sugar, butter, ..."
11462,italian,"[KRAFT Zesty Italian Dressing, purple onion, b..."
2238,irish,"[eggs, citrus fruit, raisins, sourdough starte..."
41882,chinese,"[boneless chicken skinless thigh, minced garli..."


In [26]:
# Unlabelled recipes to predict for the submission, keyed by recipe id.
test = (
    pd.read_json('test.json')
    .set_index('id')
)

In [27]:
# Display the raw test frame (ingredients only, no cuisine label).
test

Unnamed: 0_level_0,ingredients
id,Unnamed: 1_level_1
18009,"[baking powder, eggs, all-purpose flour, raisi..."
28583,"[sugar, egg yolks, corn starch, cream of tarta..."
41580,"[sausage links, fennel bulb, fronds, olive oil..."
29752,"[meat cuts, file powder, smoked sausage, okra,..."
35687,"[ground black pepper, salt, sausage casings, l..."
...,...
30246,"[large egg yolks, fresh lemon juice, sugar, bo..."
36028,"[hot sauce, butter, sweet potatoes, adobo sauc..."
22339,"[black pepper, salt, parmigiano reggiano chees..."
42525,"[cheddar cheese, cayenne, paprika, plum tomato..."


In [28]:
# Read the sample submission as raw text lines (newline characters are kept).
with open('sample_submission.csv') as f:
    sample_submission = list(f)

In [29]:
# Number of lines in the sample submission file; the first line is the header.
len(sample_submission)

9945

In [30]:
# Peek at the expected submission format: a header line followed by "id,cuisine" rows.
sample_submission[:5]

['id,cuisine\n',
 '35203,italian\n',
 '17600,italian\n',
 '35200,italian\n',
 '17602,italian\n']

### Feature engineering

In [31]:
def clean_ingredients(old_ing):
    """Normalise a single ingredient string.

    Steps, in order:
      1. lower-case the text;
      2. drop parenthesised weight annotations that end in "oz." such as
         "(10 oz.)";
      3. replace every character that is not a-z or one of the accented
         letters â ç è é í î ú with a space;
      4. collapse runs of spaces and trim both ends.

    Parameters
    ----------
    old_ing : str
        Raw ingredient name.

    Returns
    -------
    str
        The cleaned ingredient; may be empty if nothing but removed
        characters was present.
    """
    ing = old_ing.lower()
    # Match one parenthesised group only. The previous greedy `\(.*oz\.\)`
    # ran from the first "(" to the last "oz.)" and could delete real
    # ingredient words sitting between two parenthesised groups.
    ing = re.sub(r'\([^()]*oz\.\)', ' ', ing)
    ing = re.sub(r'[^a-zâçèéíîú]', ' ', ing)
    ing = re.sub(r' +', ' ', ing)
    return ing.strip()

In [32]:
# Clean every ingredient of every recipe in both data sets.
# The 'ingredients' column is overwritten with the cleaned lists.
train['ingredients'] = train['ingredients'].apply(lambda recipe: list(map(clean_ingredients, recipe)))
test['ingredients'] = test['ingredients'].apply(lambda recipe: list(map(clean_ingredients, recipe)))

In [33]:
def lemmatize(old_ing):
    """Return `old_ing` with each token replaced by its spaCy lemma.

    The text is run through the module-level `nlp` pipeline and the
    lemmas of the resulting tokens are joined with single spaces.
    """
    lemmas = []
    for token in nlp(old_ing):
        lemmas.append(token.lemma_)
    return ' '.join(lemmas)

In [34]:
# Lemmatize each cleaned ingredient; results go into a new 'norm' column
# (a list of lemmatized ingredient strings per recipe).
train['norm'] = train['ingredients'].apply(lambda recipe: list(map(lemmatize, recipe)))
test['norm'] = test['ingredients'].apply(lambda recipe: list(map(lemmatize, recipe)))

In [35]:
# Flatten each recipe's ingredient list into one space-separated string,
# which is the document fed to the text vectorizer.
train['words'] = train['norm'].apply(' '.join)
test['words'] = test['norm'].apply(' '.join)

In [36]:
# Display the training frame with the derived 'norm' and 'words' columns.
train

Unnamed: 0_level_0,cuisine,ingredients,norm,words
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10259,greek,"[romaine lettuce, black olives, grape tomatoes...","[romaine lettuce, black olive, grape tomato, g...",romaine lettuce black olive grape tomato garli...
25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g...","[plain flour, grind pepper, salt, tomato, grin...",plain flour grind pepper salt tomato grind bla...
20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g...","[egg, pepper, salt, mayonaise, cook oil, green...",egg pepper salt mayonaise cook oil green chili...
22213,indian,"[water, vegetable oil, wheat, salt]","[water, vegetable oil, wheat, salt]",water vegetable oil wheat salt
13162,indian,"[black pepper, shallots, cornflour, cayenne pe...","[black pepper, shallot, cornflour, cayenne pep...",black pepper shallot cornflour cayenne pepper ...
...,...,...,...,...
29109,irish,"[light brown sugar, granulated sugar, butter, ...","[light brown sugar, granulate sugar, butter, w...",light brown sugar granulate sugar butter warm ...
11462,italian,"[kraft zesty italian dressing, purple onion, b...","[kraft zesty italian dress, purple onion, broc...",kraft zesty italian dress purple onion broccol...
2238,irish,"[eggs, citrus fruit, raisins, sourdough starte...","[egg, citrus fruit, raisin, sourdough starter,...",egg citrus fruit raisin sourdough starter flou...
41882,chinese,"[boneless chicken skinless thigh, minced garli...","[boneless chicken skinless thigh, mince garlic...",boneless chicken skinless thigh mince garlic s...


In [37]:
# Display the test frame with the derived 'norm' and 'words' columns.
test

Unnamed: 0_level_0,ingredients,norm,words
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
18009,"[baking powder, eggs, all purpose flour, raisi...","[bake powder, egg, all purpose flour, raisin, ...",bake powder egg all purpose flour raisin milk ...
28583,"[sugar, egg yolks, corn starch, cream of tarta...","[sugar, egg yolk, corn starch, cream of tartar...",sugar egg yolk corn starch cream of tartar ban...
41580,"[sausage links, fennel bulb, fronds, olive oil...","[sausage link, fennel bulb, frond, olive oil, ...",sausage link fennel bulb frond olive oil cuban...
29752,"[meat cuts, file powder, smoked sausage, okra,...","[meat cut, file powder, smoke sausage, okra, s...",meat cut file powder smoke sausage okra shrimp...
35687,"[ground black pepper, salt, sausage casings, l...","[grind black pepper, salt, sausage casing, lee...",grind black pepper salt sausage casing leek pa...
...,...,...,...
30246,"[large egg yolks, fresh lemon juice, sugar, bo...","[large egg yolk, fresh lemon juice, sugar, bou...",large egg yolk fresh lemon juice sugar bourbon...
36028,"[hot sauce, butter, sweet potatoes, adobo sauc...","[hot sauce, butter, sweet potato, adobo sauce,...",hot sauce butter sweet potato adobo sauce salt
22339,"[black pepper, salt, parmigiano reggiano chees...","[black pepper, salt, parmigiano reggiano chees...",black pepper salt parmigiano reggiano cheese r...
42525,"[cheddar cheese, cayenne, paprika, plum tomato...","[cheddar cheese, cayenne, paprika, plum tomato...",cheddar cheese cayenne paprika plum tomato gre...


In [38]:
# Raw text documents for train and test.
# Both names are rebound to TF-IDF matrices in the vectorizer cell below.
X, X_kaggle = train['words'], test['words']

In [39]:
# Target labels: the cuisine of each training recipe, indexed by recipe id.
y = train['cuisine']
y

id
10259          greek
25693    southern_us
20130       filipino
22213         indian
13162         indian
            ...     
29109          irish
11462        italian
2238           irish
41882        chinese
2362         mexican
Name: cuisine, Length: 39774, dtype: object

In [57]:
# TF-IDF features over the space-joined, lemmatized ingredient words.
tfidf = TfidfVectorizer(binary=True)

# Fit the vocabulary/idf weights on the training documents only, then apply the
# same fitted vectorizer to the test documents so both share one feature space.
# NOTE(review): the vectorizer is fitted on the full training set before any
# cross-validation split below, so idf weights used inside CV folds are computed
# with the held-out rows included.
X = tfidf.fit_transform(train['words'])
X_kaggle = tfidf.transform(test['words'])

### Hyperparameter tuning for LightGBM

In [25]:
def get_best_model(model, X_tr, y_tr, param_grid, cv=3, scoring='accuracy'):
    """Grid-search `param_grid` for `model` and return the winner with its diagnostics.

    Parameters
    ----------
    model : estimator
        Estimator (or Pipeline) handed to GridSearchCV.
    X_tr, y_tr
        Training features and labels passed to ``GridSearchCV.fit``.
    param_grid : dict or list of dict
        Parameter grid handed to GridSearchCV.
    cv : optional
        Cross-validation setting forwarded to GridSearchCV. Defaults to 3,
        the value that was previously hard-coded.
    scoring : optional
        Scoring setting forwarded to GridSearchCV. Defaults to 'accuracy',
        the value that was previously hard-coded.

    Returns
    -------
    tuple
        ``(best_estimator_, best_score_, best_params_, results)`` where
        ``results`` is ``cv_results_`` wrapped in a DataFrame.
    """
    grid = GridSearchCV(model, param_grid, cv=cv, scoring=scoring)
    grid.fit(X_tr, y_tr)

    return (
        grid.best_estimator_,
        grid.best_score_,
        grid.best_params_,
        pd.DataFrame(grid.cv_results_),
    )

In [45]:
# Single-step pipeline; the step name 'lgb' is the prefix used by the grid keys.
pipe = Pipeline(steps=[('lgb', lgb.LGBMClassifier(random_state=10))])

In [63]:
# Search grid for the 'lgb' pipeline step: vary num_leaves, keep
# min_child_samples fixed at 10.
params = dict(
    lgb__num_leaves=[25, 30, 31, 35],
    lgb__min_child_samples=[10],
)

In [64]:
%%time
model_data = get_best_model(pipe, X, y, params)

Wall time: 6min 38s


In [65]:
# Show the best estimator, its mean CV score and the winning parameters
# (the full results frame, the 4th element, is skipped), one per line.
print(*model_data[:3], sep='\n')

Pipeline(steps=[('lgb', LGBMClassifier(min_child_samples=10, random_state=10))])
0.7799316136169357
{'lgb__min_child_samples': 10, 'lgb__num_leaves': 31}


### LightGBM with the best parameters

In [42]:
# Hold out 8000 recipes for a quick check; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=8000,
    random_state=10,
)

In [61]:
# Keyword arguments for the final LGBMClassifier.
# Note: this rebinds `params`, which earlier held the grid-search grid.
params = dict(
    min_child_samples=10,
    random_state=10,
    objective='multiclass',
    num_class=20,
    n_estimators=200,
)

In [62]:
# Build the classifier from the `params` dict defined in the previous cell.
gb = lgb.LGBMClassifier(**params)

In [63]:
# Train on the training part of the hold-out split only.
gb.fit(X_train, y_train)

LGBMClassifier(min_child_samples=10, n_estimators=200, num_class=20,
               objective='multiclass', random_state=10)

In [64]:
# Predict on both parts of the split so train and hold-out accuracy can be compared.
pred_train, pred_test = gb.predict(X_train), gb.predict(X_test)

In [65]:
# Training accuracy on the first line, hold-out accuracy on the second.
print(
    accuracy_score(y_train, pred_train),
    accuracy_score(y_test, pred_test),
    sep='\n',
)

0.999087304085101
0.796875


In [66]:
# Mean 5-fold cross-validated accuracy on the full training matrix.
cross_val_score(
    estimator=gb,
    X=X,
    y=y,
    cv=5,
    scoring='accuracy',
).mean()

0.791999831842649

Cross-validation score: 0.792

### Kaggle submission

In [67]:
# Fresh, unfitted classifier with the same `params` for the final fit on all training data.
gb = lgb.LGBMClassifier(**params)

In [68]:
# Fit on the complete training set (no hold-out) before predicting the test recipes.
gb.fit(X, y)

LGBMClassifier(min_child_samples=10, n_estimators=200, num_class=20,
               objective='multiclass', random_state=10)

In [69]:
# Predicted cuisine label for every row of the vectorized test set.
pred_kaggle = gb.predict(X_kaggle)
pred_kaggle

array(['southern_us', 'southern_us', 'italian', ..., 'italian',
       'southern_us', 'mexican'], dtype=object)

In [70]:
# One 'cuisine' column of predictions, indexed by the test recipe ids.
submission = pd.DataFrame({'cuisine': pred_kaggle}, index=test.index)
submission

Unnamed: 0_level_0,cuisine
id,Unnamed: 1_level_1
18009,southern_us
28583,southern_us
41580,italian
29752,cajun_creole
35687,italian
...,...
30246,french
36028,southern_us
22339,italian
42525,southern_us


In [71]:
# Write the submission file; the 'id' index is written as the first column.
submission.to_csv('submission18.csv')

Kaggle score: 0.78801