In [40]:
import pandas as pd
import numpy as np
import json
import re

from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from nltk import ngrams, everygrams

from catboost import CatBoostClassifier

### Data

In [2]:
# Load the labelled training recipes and index them by the `id` column.
train = pd.read_json('train.json').set_index('id')

In [3]:
train

Unnamed: 0_level_0,cuisine,ingredients
id,Unnamed: 1_level_1,Unnamed: 2_level_1
10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
22213,indian,"[water, vegetable oil, wheat, salt]"
13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."
...,...,...
29109,irish,"[light brown sugar, granulated sugar, butter, ..."
11462,italian,"[KRAFT Zesty Italian Dressing, purple onion, b..."
2238,irish,"[eggs, citrus fruit, raisins, sourdough starte..."
41882,chinese,"[boneless chicken skinless thigh, minced garli..."


In [4]:
# Load the Kaggle test recipes (ingredients only, no `cuisine`), indexed by `id`.
test = pd.read_json('test.json').set_index('id')

In [5]:
test

Unnamed: 0_level_0,ingredients
id,Unnamed: 1_level_1
18009,"[baking powder, eggs, all-purpose flour, raisi..."
28583,"[sugar, egg yolks, corn starch, cream of tarta..."
41580,"[sausage links, fennel bulb, fronds, olive oil..."
29752,"[meat cuts, file powder, smoked sausage, okra,..."
35687,"[ground black pepper, salt, sausage casings, l..."
...,...
30246,"[large egg yolks, fresh lemon juice, sugar, bo..."
36028,"[hot sauce, butter, sweet potatoes, adobo sauc..."
22339,"[black pepper, salt, parmigiano reggiano chees..."
42525,"[cheddar cheese, cayenne, paprika, plum tomato..."


In [6]:
# Read the sample submission as raw text lines. The first line is the CSV
# header, so the list is one element longer than the number of test rows.
with open('sample_submission.csv') as f:
    sample_submission = f.readlines()

In [7]:
len(sample_submission)

9945

In [8]:
sample_submission[:5]

['id,cuisine\n',
 '35203,italian\n',
 '17600,italian\n',
 '35200,italian\n',
 '17602,italian\n']

### Target: cuisine

In [9]:
len(train.cuisine.unique())

20

The sample is imbalanced:

In [10]:
train.cuisine.value_counts()

italian         7838
mexican         6438
southern_us     4320
indian          3003
chinese         2673
french          2646
cajun_creole    1546
thai            1539
japanese        1423
greek           1175
spanish          989
korean           830
vietnamese       825
moroccan         821
british          804
filipino         755
irish            667
jamaican         526
russian          489
brazilian        467
Name: cuisine, dtype: int64

In [11]:
train.isna().sum()

cuisine        0
ingredients    0
dtype: int64

In [12]:
# Column-wise truthiness check: a column is True only if none of its values is
# falsy (e.g. an empty string or an empty ingredient list).
train.all(axis=0)

cuisine        True
ingredients    True
dtype: bool

### Feature engineering

In [14]:
def get_ngrams(l: list, n: int):
    """Return the distinct character n-grams found in a list of strings.

    Every item of ``l`` is passed to ``everygrams`` with ``max_len=n`` and each
    resulting tuple of characters is joined back into a string. Items are
    processed one at a time, so an n-gram never spans two items.

    Args:
        l: Items to extract n-grams from (used here with ingredient names).
        n: Maximum n-gram length.

    Returns:
        Set of n-gram strings gathered over all items of ``l``.
    """
    return {
        ''.join(char_tuple)
        for text in l
        for char_tuple in everygrams(text, max_len=n)
    }

In [15]:
# For every recipe, collect the set of character n-grams (up to length 3)
# taken from each of its ingredient names.
train['ngrams'] = train['ingredients'].apply(lambda x: get_ngrams(x, 3))
test['ngrams'] = test['ingredients'].apply(lambda x: get_ngrams(x, 3))

In [16]:
# One {'ngrams': <set of strings>} record per recipe; these record lists are
# what gets passed to the DictVectorizer below.
ngrams_dict_train = train[['ngrams']].to_dict(orient='records')
ngrams_dict_test = test[['ngrams']].to_dict(orient='records')

In [17]:
vectorizer = DictVectorizer()

In [18]:
# Learn the feature vocabulary from the training records only, then encode the
# Kaggle test records with that same vocabulary so both matrices share columns.
X = vectorizer.fit_transform(ngrams_dict_train)
X_kaggle = vectorizer.transform(ngrams_dict_test)

In [19]:
print(X.shape)
print(X_kaggle.shape)

(39774, 6319)
(9944, 6319)


In [20]:
# Preview the feature matrix as a labelled frame. The frame is built straight
# from the sparse matrix: X.toarray() would materialise a dense 39774 x 6319
# float array (about 2 GB) only to display five rows.
X_df = pd.DataFrame.sparse.from_spmatrix(
    X,
    index=train.index,
    columns=vectorizer.get_feature_names_out(),
)
X_df.head()

Unnamed: 0_level_0,ngrams=,ngrams=,ngrams=,ngrams= b,ngrams= c,ngrams= o,ngrams= &,ngrams= &,ngrams= ',ngrams= 'N,...,ngrams=™ -,ngrams=™ C,ngrams=™ R,ngrams=™ T,ngrams=™ c,ngrams=™ m,ngrams=™ o,ngrams=™ r,ngrams=™ s,ngrams=™ t
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10259,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25693,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20130,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22213,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13162,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
y = train['cuisine']
y

id
10259          greek
25693    southern_us
20130       filipino
22213         indian
13162         indian
            ...     
29109          irish
11462        italian
2238           irish
41882        chinese
2362         mexican
Name: cuisine, Length: 39774, dtype: object

### Logistic Regression

In [31]:
# One-vs-rest logistic regression with a fixed random_state and max_iter=1000.
# NOTE(review): the `multi_class` argument is deprecated in recent scikit-learn
# releases -- confirm against the installed version before upgrading.
model = LogisticRegression(multi_class='ovr', random_state=10, max_iter=1000)

In [32]:
# Hold out 6000 recipes for evaluation (an integer test_size is an absolute count).
# NOTE(review): the split is not stratified although the classes are imbalanced;
# consider stratify=y -- the recorded scores below would change.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=6000, random_state=10)

In [33]:
model.fit(X_train, y_train)

LogisticRegression(max_iter=1000, multi_class='ovr', random_state=10)

In [35]:
pred_train = model.predict(X_train)
pred_test = model.predict(X_test)

In [36]:
# Accuracy on the training split first, then on the held-out split.
print(accuracy_score(y_train, pred_train))
print(accuracy_score(y_test, pred_test))

0.9001302777284301
0.7806666666666666


The model overfits: training accuracy (0.900) is well above hold-out accuracy (0.781).

In [37]:
# Mean accuracy over 5 folds of the full training matrix. cross_val_score fits
# clones of `model`, so the estimator fitted on X_train is left unchanged.
cross_val_score(model, X, y, cv=5, scoring='accuracy').mean()

0.7819933031018869

Mean 5-fold cross-validation accuracy: 0.782

### Kaggle submission

In [None]:
# Predict a cuisine for every row of the Kaggle test matrix.
# NOTE(review): in notebook order `model` was last fitted on X_train only, so
# the 6000 held-out rows never contribute to this model -- consider refitting
# on all of X, y before predicting.
pred_kaggle = model.predict(X_kaggle)
pred_kaggle

In [None]:
# One predicted cuisine per test recipe, keyed by the test ids.
submission = pd.DataFrame(pred_kaggle, index=test.index, columns=['cuisine'])
submission

In [None]:
# The index (named `id`) is written as the first column, which yields the
# `id,cuisine` header used by sample_submission.csv.
submission.to_csv('submission3.csv')

Kaggle score: ...