In [1]:
import pandas as pd
import numpy as np
import json
import re

from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

### Data

In [2]:
train = pd.read_json('train.json').set_index('id')

In [3]:
train

Unnamed: 0_level_0,cuisine,ingredients
id,Unnamed: 1_level_1,Unnamed: 2_level_1
10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
22213,indian,"[water, vegetable oil, wheat, salt]"
13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."
...,...,...
29109,irish,"[light brown sugar, granulated sugar, butter, ..."
11462,italian,"[KRAFT Zesty Italian Dressing, purple onion, b..."
2238,irish,"[eggs, citrus fruit, raisins, sourdough starte..."
41882,chinese,"[boneless chicken skinless thigh, minced garli..."


In [4]:
test = pd.read_json('test.json').set_index('id')

In [5]:
test

Unnamed: 0_level_0,ingredients
id,Unnamed: 1_level_1
18009,"[baking powder, eggs, all-purpose flour, raisi..."
28583,"[sugar, egg yolks, corn starch, cream of tarta..."
41580,"[sausage links, fennel bulb, fronds, olive oil..."
29752,"[meat cuts, file powder, smoked sausage, okra,..."
35687,"[ground black pepper, salt, sausage casings, l..."
...,...
30246,"[large egg yolks, fresh lemon juice, sugar, bo..."
36028,"[hot sauce, butter, sweet potatoes, adobo sauc..."
22339,"[black pepper, salt, parmigiano reggiano chees..."
42525,"[cheddar cheese, cayenne, paprika, plum tomato..."


In [10]:
# Keep the sample submission as raw text lines (header row included, trailing newlines preserved).
with open('sample_submission.csv') as sample_file:
    sample_submission = list(sample_file)

In [11]:
len(sample_submission)

9945

In [12]:
sample_submission[:5]

['id,cuisine\n',
 '35203,italian\n',
 '17600,italian\n',
 '35200,italian\n',
 '17602,italian\n']

### EDA: Train set

#### Target: cuisine

In [10]:
len(train.cuisine.unique())

20

The classes are imbalanced:

In [11]:
train.cuisine.value_counts()

italian         7838
mexican         6438
southern_us     4320
indian          3003
chinese         2673
french          2646
cajun_creole    1546
thai            1539
japanese        1423
greek           1175
spanish          989
korean           830
vietnamese       825
moroccan         821
british          804
filipino         755
irish            667
jamaican         526
russian          489
brazilian        467
Name: cuisine, dtype: int64

In [12]:
train.isna().sum()

cuisine        0
ingredients    0
dtype: int64

In [13]:
train.all(axis=0)

cuisine        True
ingredients    True
dtype: bool

#### Features: ingredients

In [14]:
ing_full = train.ingredients.tolist()
ing_counts = dict()

In [15]:
# Tally how many times each ingredient name occurs across all train recipes.
for recipe in ing_full:
    for name in recipe:
        if name in ing_counts:
            ing_counts[name] += 1
        else:
            ing_counts[name] = 1

In [16]:
print(f'Number of ingredients mentioned: {sum(ing_counts.values())}')

Number of ingredients mentioned: 428275


#### Most popular ingredients

In [17]:
# Ingredient names ordered from the most to the least frequent in the train set.
sorted_ing = sorted(ing_counts, key=lambda name: ing_counts[name], reverse=True)

print(f'Number of unique ingredients: {len(sorted_ing)}\n')

# Show the 15 most frequent names with their counts.
for name in sorted_ing[:15]:
    print(name, ing_counts[name])

Number of unique ingredients: 6714

salt 18049
onions 7972
olive oil 7972
water 7457
garlic 7380
sugar 6434
garlic cloves 6237
butter 4848
ground black pepper 4785
all-purpose flour 4632
pepper 4438
vegetable oil 4385
eggs 3388
soy sauce 3296
kosher salt 3113


#### Rarest ingredients

In [17]:
for k in sorted_ing[-15:]:
    print(k, ing_counts[k])

game 1
tongue 1
Daiya 1
curry mix 1
Kraft Slim Cut Mozzarella Cheese Slices 1
Oscar Mayer Cotto Salami 1
Challenge Butter 1
orange glaze 1
cholesterol free egg substitute 1
ciabatta loaf 1
Lipton® Iced Tea Brew Family Size Tea Bags 1
Hidden Valley® Greek Yogurt Original Ranch® Dip Mix 1
lop chong 1
tomato garlic pasta sauce 1
crushed cheese crackers 1


#### Similar ingredient names

In [18]:
[(i, ing_counts[i]) for i in sorted_ing if re.search(r'tomato$', i)]

[('organic tomato', 7),
 ('yellow tomato', 7),
 ('large tomato', 5),
 ('sauce tomato', 2),
 ('low sodium tomato', 1),
 ('paste tomato', 1)]

In [19]:
[(i, ing_counts[i]) for i in sorted_ing if re.search(r'tomatoes$', i)][:6]

[('tomatoes', 3058),
 ('diced tomatoes', 1624),
 ('plum tomatoes', 858),
 ('crushed tomatoes', 453),
 ('cherry tomatoes', 436),
 ('chopped tomatoes', 298)]

In [20]:
[(i, ing_counts[i]) for i in sorted_ing if re.search(r'milk$', i)][:6]

[('milk', 2263),
 ('buttermilk', 863),
 ('coconut milk', 854),
 ('whole milk', 764),
 ('sweetened condensed milk', 271),
 ('evaporated milk', 208)]

#### Symbols in ingredients

In [21]:
# Every distinct character used in any train-set ingredient name, as a sorted list.
symbols = sorted(set(''.join(sorted_ing)))
' '.join(symbols)

"  ! % & ' ( ) , - . / 0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P Q R S T U V W Y Z a b c d e f g h i j k l m n o p q r s t u v w x y z ® â ç è é í î ú ’ € ™"

In [22]:
train['c diacritics'] = train['ingredients'].apply(lambda x: 'ç' in str(x))
train[train['c diacritics']]['cuisine'].unique()

array(['french', 'brazilian', 'spanish', 'italian', 'cajun_creole',
       'irish'], dtype=object)

In [23]:
train.drop(['c diacritics'], axis=1, inplace=True)

### EDA: Test set

In [24]:
ing_full_test = test.ingredients.tolist()
ing_counts_test = dict()

In [25]:
# Tally how many times each ingredient name occurs across all test recipes.
for recipe in ing_full_test:
    for name in recipe:
        if name in ing_counts_test:
            ing_counts_test[name] += 1
        else:
            ing_counts_test[name] = 1

#### Most popular ingredients in the test set are the same as in the train set

In [26]:
sorted_ing_test = sorted(ing_counts_test, key=ing_counts_test.get, reverse=True)

for k in sorted_ing_test[:15]:
    print(k, ing_counts_test[k])

salt 4485
onions 2036
olive oil 1917
water 1836
garlic 1791
sugar 1630
garlic cloves 1535
butter 1230
ground black pepper 1205
all-purpose flour 1184
vegetable oil 1131
pepper 1070
eggs 874
soy sauce 824
kosher salt 817


In [27]:
print('{:<20} {:<20}'.format('Train set', 'Test set'))
for i in range(15):
    print(f'{sorted_ing[i]:<20} {sorted_ing_test[i]:<20}')

Train set            Test set            
salt                 salt                
onions               onions              
olive oil            olive oil           
water                water               
garlic               garlic              
sugar                sugar               
garlic cloves        garlic cloves       
butter               butter              
ground black pepper  ground black pepper 
all-purpose flour    all-purpose flour   
pepper               vegetable oil       
vegetable oil        pepper              
eggs                 eggs                
soy sauce            soy sauce           
kosher salt          kosher salt         


#### Rarest ingredients in the test set

In [28]:
for k in sorted_ing_test[-15:]:
    print(k, ing_counts_test[k])

raita 1
leftover meat 1
vanilla flavoring 1
collard green leaves 1
black gram 1
fraise 1
beef heart 1
lambs liver 1
soft cheese 1
sliced mango 1
pork strips 1
shark fillets 1
hash brown 1
porter 1
butter crackers 1


#### Ingredients from the test set that are not in the train set

Number of unique test-set ingredients that never appear in the train set ("unknown" ingredients):

In [29]:
# Unique test-set ingredients that never occur in the train set, in test-frequency order.
# Membership is checked against a set: scanning the `sorted_ing` list once per
# test ingredient was needlessly quadratic.
train_vocab = set(sorted_ing)
diff_ing = [i for i in sorted_ing_test if i not in train_vocab]
len(diff_ing)

423

Share of unknown unique ingredients among all unique ingredients in the test set:

In [30]:
len(diff_ing) / len(sorted_ing_test)

0.09433541480820695

Number of recipes with unknown ingredients

In [31]:
# Flag every test recipe that contains at least one ingredient unseen in the train set.
# A set lookup replaces the per-ingredient scan of the `diff_ing` list, and
# `isdisjoint` stops at the first hit instead of building a throwaway list.
unseen_ingredients = set(diff_ing)
test['unknown'] = test['ingredients'].apply(lambda x: not unseen_ingredients.isdisjoint(x))

In [32]:
test['unknown'].value_counts()

False    9509
True      435
Name: unknown, dtype: int64

Relative frequency of recipes with unknown ingredients:

In [33]:
test['unknown'].value_counts(normalize=True)

False    0.956255
True     0.043745
Name: unknown, dtype: float64

In [34]:
test.drop('unknown', axis=1, inplace=True)

### Feature engineering

In [6]:
# One record per recipe, each of the form {'ingredients': [name, name, ...]}.
# These record lists are what the DictVectorizer below is fitted on / applied to.
ing_dict_train = train[['ingredients']].to_dict(orient='records')
ing_dict_test = test[['ingredients']].to_dict(orient='records')

In [7]:
vectorizer = DictVectorizer()

In [8]:
X = vectorizer.fit_transform(ing_dict_train)
X_kaggle = vectorizer.transform(ing_dict_test)

In [38]:
print(X.shape)
print(X_kaggle.shape)

(39774, 6714)
(9944, 6714)


In [39]:
X_df = pd.DataFrame(X.toarray(), index=train.index, columns=vectorizer.get_feature_names_out())
X_df.head()

Unnamed: 0_level_0,ingredients=( oz.) tomato sauce,ingredients=( oz.) tomato paste,ingredients=(10 oz.) frozen chopped spinach,"ingredients=(10 oz.) frozen chopped spinach, thawed and squeezed dry",ingredients=(14 oz.) sweetened condensed milk,ingredients=(14.5 oz.) diced tomatoes,ingredients=(15 oz.) refried beans,ingredients=1% low-fat buttermilk,ingredients=1% low-fat chocolate milk,ingredients=1% low-fat cottage cheese,...,ingredients=yukon gold potatoes,ingredients=yuzu,ingredients=yuzu juice,ingredients=za'atar,ingredients=zest,ingredients=zesty italian dressing,ingredients=zinfandel,ingredients=ziti,ingredients=zucchini,ingredients=zucchini blossoms
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10259,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20130,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22213,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13162,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
y = train['cuisine']
y

id
10259          greek
25693    southern_us
20130       filipino
22213         indian
13162         indian
            ...     
29109          irish
11462        italian
2238           irish
41882        chinese
2362         mexican
Name: cuisine, Length: 39774, dtype: object

### Logistic Regression

In [41]:
model = LogisticRegression(multi_class='multinomial', random_state=10, max_iter=1000)

In [80]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=6000, random_state=10)

In [43]:
model.fit(X_train, y_train)

LogisticRegression(max_iter=1000, multi_class='multinomial', random_state=10)

In [44]:
pred_train = model.predict(X_train)
pred_test = model.predict(X_test)

In [45]:
print(accuracy_score(y_train, pred_train))
print(accuracy_score(y_test, pred_test))

0.8904482738200983
0.7796666666666666


In [46]:
cross_val_score(model, X, y, cv=5, scoring='accuracy').mean()

0.77789521995345

Cross-validation score: 0.778

### Hyperparameter tuning for Logistic Regression

In [52]:
def get_best_model(model, X_tr, y_tr, param_grid, cv=3, scoring='accuracy'):
    """Run an exhaustive grid search and return the winning configuration.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator to tune.
    X_tr, y_tr
        Features and target the search is cross-validated on.
    param_grid : dict or list of dict
        Hyperparameter grid passed straight to ``GridSearchCV``.
    cv : int or CV splitter, default 3
        Cross-validation strategy (previously hard-coded to 3 folds).
    scoring : str or callable, default 'accuracy'
        Metric used to rank candidates (previously hard-coded).

    Returns
    -------
    tuple
        ``(best_estimator, best_score, best_params, cv_results)`` where
        ``cv_results`` is ``grid.cv_results_`` wrapped in a DataFrame.
    """
    grid = GridSearchCV(model, param_grid, cv=cv, scoring=scoring)
    grid.fit(X_tr, y_tr)

    cv_results = pd.DataFrame(grid.cv_results_)
    return grid.best_estimator_, grid.best_score_, grid.best_params_, cv_results

After a bit of experimenting, the following best model was found:

In [81]:
model = LogisticRegression(max_iter=2000, random_state=10)

In [82]:
params = {'class_weight': [None, 'balanced'],
          'C': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e+1, 1e+2],
          'multi_class': ['ovr', 'multinomial']}

In [78]:
model_data = get_best_model(model, X, y, params)

In [79]:
for item in model_data[:3]:
    print(item)

LogisticRegression(C=1, max_iter=2000, multi_class='ovr', random_state=10)
0.7757831749384021
{'C': 1, 'class_weight': None, 'multi_class': 'ovr'}


The `saga` solver combined with `ovr` takes too long to fit (~1 min), so the default solver is kept.

In [87]:
best_model = LogisticRegression(multi_class='ovr', max_iter=2000, random_state=10)

In [89]:
best_model.fit(X_train, y_train)

LogisticRegression(max_iter=2000, multi_class='ovr', random_state=10)

In [90]:
pred_train = best_model.predict(X_train)
pred_test = best_model.predict(X_test)

In [91]:
print(accuracy_score(y_train, pred_train))
print(accuracy_score(y_test, pred_test))

0.8713211346005804
0.7815


In [92]:
cross_val_score(best_model, X, y, cv=5, scoring='accuracy').mean()

0.7800071561699762

Cross-validation score: 0.78

### Kaggle submission

In [20]:
model = LogisticRegression(multi_class='ovr', random_state=10, max_iter=1000)

In [21]:
model.fit(X, y)

LogisticRegression(max_iter=1000, multi_class='ovr', random_state=10)

In [23]:
pred_kaggle = model.predict(X_kaggle)
pred_kaggle

array(['british', 'southern_us', 'italian', ..., 'italian', 'southern_us',
       'mexican'], dtype=object)

In [24]:
submission = pd.DataFrame(pred_kaggle, index=test.index, columns=['cuisine'])
submission

Unnamed: 0_level_0,cuisine
id,Unnamed: 1_level_1
18009,british
28583,southern_us
41580,italian
29752,cajun_creole
35687,italian
...,...
30246,french
36028,southern_us
22339,italian
42525,southern_us


In [26]:
submission.to_csv('submission6.csv')

Kaggle score: 0.78338

### Ingredients normalization

In [9]:
def clean_ingredients(old_ing):
    """Normalize a single ingredient name.

    Steps, in order:

    1. lower-case the text;
    2. drop a parenthesized package size ending in ``oz.)``, e.g. ``(14.5 oz.)``;
    3. replace every character other than ``a-z`` and the accented letters
       ``â ç è é í î ú`` with a space (digits, punctuation, trademark signs, ...);
    4. collapse runs of spaces and trim both ends.

    Parameters
    ----------
    old_ing : str
        Raw ingredient name.

    Returns
    -------
    str
        The normalized name; empty if the input held no letters.
    """
    ing = old_ing.lower()
    # `[^)]*` keeps the match inside ONE pair of parentheses. The former greedy
    # `.*` ran from the first "(" to the last "oz.)" and could swallow real
    # words sitting between two parenthesized groups.
    ing = re.sub(r'\([^)]*oz\.\)', ' ', ing)
    ing = re.sub(r'[^a-zâçèéíîú]', ' ', ing)
    ing = re.sub(r' +', ' ', ing)
    return ing.strip()

In [10]:
# Clean every ingredient name of every recipe and store the cleaned lists in a new 'norm' column.
train['norm'] = train['ingredients'].apply(lambda names: list(map(clean_ingredients, names)))
test['norm'] = test['ingredients'].apply(lambda names: list(map(clean_ingredients, names)))

In [11]:
ing_dict_train = train[['norm']].to_dict(orient='records')
ing_dict_test = test[['norm']].to_dict(orient='records')

In [12]:
vectorizer = DictVectorizer()

In [13]:
X = vectorizer.fit_transform(ing_dict_train)
X_kaggle = vectorizer.transform(ing_dict_test)

In [14]:
print(X.shape)
print(X_kaggle.shape)

(39774, 6679)
(9944, 6679)


In [15]:
X_df = pd.DataFrame(X.toarray(), index=train.index, columns=vectorizer.get_feature_names_out())
X_df.head()

Unnamed: 0_level_0,norm=a taste of thai rice noodles,norm=abalone,norm=abbamele,norm=absinthe,norm=abura age,norm=acai juice,norm=accent,norm=accent seasoning,norm=accompaniment,norm=achiote,...,norm=yuzukosho,norm=za atar,norm=zatarain s jambalaya mix,norm=zatarains creole seasoning,norm=zest,norm=zesty italian dressing,norm=zinfandel,norm=ziti,norm=zucchini,norm=zucchini blossoms
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10259,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20130,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22213,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13162,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
y = train['cuisine']
y

id
10259          greek
25693    southern_us
20130       filipino
22213         indian
13162         indian
            ...     
29109          irish
11462        italian
2238           irish
41882        chinese
2362         mexican
Name: cuisine, Length: 39774, dtype: object

### Logistic Regression

In [17]:
model = LogisticRegression(multi_class='ovr', random_state=10, max_iter=1000)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=6000, random_state=10)

In [19]:
model.fit(X_train, y_train)

LogisticRegression(max_iter=1000, multi_class='ovr', random_state=10)

In [35]:
pred_train = model.predict(X_train)
pred_test = model.predict(X_test)

In [36]:
print(accuracy_score(y_train, pred_train))
print(accuracy_score(y_test, pred_test))

0.8714395688991532
0.7823333333333333


In [37]:
cross_val_score(model, X, y, cv=5, scoring='accuracy').mean()

0.7798311630656919

Cross-validation score: 0.78