In [27]:
import json
from collections import Counter

import numpy as np
import pandas as pd

### Загружаем данные

In [28]:
# Read the training recipes from train.json and use the 'id' column as the row index.
train = pd.read_json('train.json').set_index('id')

In [29]:
# Target labels: the cuisine of each training recipe.
y = train['cuisine']

In [30]:
# Preview the training frame (cuisine label and ingredient list per recipe id).
train

Unnamed: 0_level_0,cuisine,ingredients
id,Unnamed: 1_level_1,Unnamed: 2_level_1
10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
22213,indian,"[water, vegetable oil, wheat, salt]"
13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."
...,...,...
29109,irish,"[light brown sugar, granulated sugar, butter, ..."
11462,italian,"[KRAFT Zesty Italian Dressing, purple onion, b..."
2238,irish,"[eggs, citrus fruit, raisins, sourdough starte..."
41882,chinese,"[boneless chicken skinless thigh, minced garli..."


In [31]:
# Read the unlabeled test recipes from test.json and use the 'id' column as the row index.
test = pd.read_json('test.json').set_index('id')

In [32]:
# Preview the test frame (ingredient list per recipe id, no cuisine column).
test

Unnamed: 0_level_0,ingredients
id,Unnamed: 1_level_1
18009,"[baking powder, eggs, all-purpose flour, raisi..."
28583,"[sugar, egg yolks, corn starch, cream of tarta..."
41580,"[sausage links, fennel bulb, fronds, olive oil..."
29752,"[meat cuts, file powder, smoked sausage, okra,..."
35687,"[ground black pepper, salt, sausage casings, l..."
...,...
30246,"[large egg yolks, fresh lemon juice, sugar, bo..."
36028,"[hot sauce, butter, sweet potatoes, adobo sauc..."
22339,"[black pepper, salt, parmigiano reggiano chees..."
42525,"[cheddar cheese, cayenne, paprika, plum tomato..."


### Разведочный анализ трейна

In [33]:
# Number of distinct cuisine labels (NaN, if any, counted as its own value).
train['cuisine'].nunique(dropna=False)

20

In [34]:
# Recipes per cuisine, most frequent first.
train['cuisine'].value_counts()

italian         7838
mexican         6438
southern_us     4320
indian          3003
chinese         2673
french          2646
cajun_creole    1546
thai            1539
japanese        1423
greek           1175
spanish          989
korean           830
vietnamese       825
moroccan         821
british          804
filipino         755
irish            667
jamaican         526
russian          489
brazilian        467
Name: cuisine, dtype: int64

У нас несбалансированный датасет: классы представлены очень неравномерно (от 7838 рецептов у italian до 467 у brazilian).

In [35]:
# Count missing values in each column of the training frame.
train.isna().sum()

cuisine        0
ingredients    0
dtype: int64

In [36]:
# One list of ingredient names per training recipe.
ing_full = train.ingredients.tolist()

# Occurrences of every ingredient name across all recipes.
# Counter is a dict subclass, so `.get`, indexing and `.values()` keep working as before.
ing_counts = Counter(ingredient for ing_list in ing_full for ingredient in ing_list)

# The printed number is the total count of ingredient mentions (sum over all recipes),
# not the number of distinct ingredients.
print(f'количество ингредиентов: {sum(ing_counts.values())}')

количество ингредиентов: 428275


In [37]:
# Ingredient names ordered from the most to the least frequent.
sorted_ing = sorted(ing_counts.keys(), key=lambda name: ing_counts[name], reverse=True)

print(f'количество уникальных ингредиентов: {len(sorted_ing)}\n')

# The 15 most popular ingredients together with their counts.
for name in sorted_ing[:15]:
    print(name, ing_counts[name])

количество уникальных ингредиентов: 6714

salt 18049
onions 7972
olive oil 7972
water 7457
garlic 7380
sugar 6434
garlic cloves 6237
butter 4848
ground black pepper 4785
all-purpose flour 4632
pepper 4438
vegetable oil 4385
eggs 3388
soy sauce 3296
kosher salt 3113


### Работа с признаками

Дальше работа разделилась на 4 направления:

1) Работа с ингредиентами как с бинарными признаками

2) Работа с n-граммами

3) Работа со структурой tfidf

4) Работа с вектором в pyspark (отдельный ноутбук будет сдан Александром Зубковым)

Очистка

In [38]:
import re
def clean_ingredients(old_ing):
    """Normalize a single raw ingredient string.

    The string is lowercased, parenthesised notes ending in ``oz.)``
    (for example ``(10 oz.)``) are dropped, every character outside ``a-z``
    and the listed accented letters is replaced by a space, runs of spaces
    are collapsed and the result is stripped.

    Args:
        old_ing: raw ingredient name.

    Returns:
        The cleaned ingredient name; may be an empty string.
    """
    ing = old_ing.lower()
    # `[^()]*` keeps the match inside one pair of parentheses. A greedy `.*`
    # would run from the first "(" to the last "oz.)" and delete any text
    # that sits between two parenthesised groups.
    ing = re.sub(r'\([^()]*oz\.\)', ' ', ing)
    ing = re.sub(r'[^a-zâçèéíîú]', ' ', ing)
    ing = re.sub(r' +', ' ', ing)
    return ing.strip()

# Clean every ingredient name of every recipe, in both the train and the test frame.
train['ingredients'] = train['ingredients'].map(lambda names: [clean_ingredients(name) for name in names])
test['ingredients'] = test['ingredients'].map(lambda names: [clean_ingredients(name) for name in names])

Работа с бинарными признаками

In [39]:
# Stack train rows followed by test rows so both share one feature vocabulary.
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same row order,
# keeps the original index and fills the missing 'cuisine' of test rows with NaN.
df_full = pd.concat([train, test])

In [40]:
from sklearn.feature_extraction import DictVectorizer
# One {'ingredients': [...]} record per recipe, in the row order of df_full.
ing_dict_full = [{'ingredients': ingredient_list} for ingredient_list in df_full['ingredients']]
vectorizer = DictVectorizer()

In [41]:
import pattern
from pattern.en import lemma

# Lemmatization: replace every word of every ingredient name by its lemma,
# updating each record of ing_dict_full in place.
for record in ing_dict_full:
    for field in record:
        record[field] = [
            ' '.join([lemma(word) for word in name.split(' ')])
            for name in record[field]
        ]

In [42]:
# Fit the vectorizer on the lemmatized records (train + test) and encode them in one step.
full_vec = vectorizer.fit_transform(ing_dict_full)

In [43]:
# Rebuild df_full as a feature frame: one row per recipe id, one column per vectorizer feature.
# NOTE(review): toarray() materializes the whole matrix densely, which is memory-heavy for
# a wide indicator matrix; df_full is also rebound here, so the stacked raw frame is no longer available.
df_full = pd.DataFrame(full_vec.toarray(), index=df_full.index, columns=vectorizer.feature_names_)
df_full

Unnamed: 0_level_0,ingredients=a taste of thai rice noodle,ingredients=abalone,ingredients=abbamele,ingredients=absinthe,ingredients=abura age,ingredients=acai juice,ingredients=accent,ingredients=accent season,ingredients=accompaniment,ingredients=achiote,...,ingredients=za atar,ingredients=zabaglione,ingredients=zatarain jambalaya mix,ingredients=zatarain creole season,ingredients=zest,ingredients=zesty italian dress,ingredients=zinfandel,ingredients=ziti,ingredients=zucchini,ingredients=zucchini blossom
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10259,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20130,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22213,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13162,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30246,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36028,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22339,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
# df_full holds the train rows first and the test rows after them, so split it back
# by position. The boundary is taken from len(train) instead of a hard-coded row count,
# which keeps the split correct if the input data changes.
df_full_train = df_full.iloc[:len(train)]
df_full_test = df_full.iloc[len(train):]

Наиболее удачной моделью при работе с бинарными признаками по итогам ряда экспериментов стала LogisticRegression
с указанными ниже параметрами. Кроме неё пробовали SVC (best Kaggle score: 0.78710, best cross-val score: 0.78) и
RandomForest (best cross-val score: 0.612; на Kaggle даже не загружали).

In [45]:
from sklearn.linear_model import LogisticRegression
# Logistic regression configured for one-vs-rest multiclass, up to 2000 iterations, fixed seed.
# NOTE(review): newer scikit-learn releases may deprecate the `multi_class` argument — confirm against the installed version.
model = LogisticRegression(multi_class='ovr', max_iter=2000, random_state=10)

In [46]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

# Hold out 6000 recipes for validation and fit the model on the remainder.
X_train, X_test, y_train, y_test = train_test_split(
    df_full_train, y, test_size=6000, random_state=10
)
model.fit(X_train, y_train)

# Accuracy on the fitting split first, then on the hold-out split.
pred_train, pred_test = model.predict(X_train), model.predict(X_test)
print(
    accuracy_score(y_train, pred_train),
    accuracy_score(y_test, pred_test),
    sep='\n',
)

0.8711730917273642
0.7816666666666666


In [47]:

# Mean accuracy over 5-fold cross-validation on the full training feature frame.
cross_val_score(model, df_full_train, y, cv=5, scoring='accuracy').mean()

0.7799568606855857

 сильного переобучения нет

In [48]:
# Predict cuisines for the test rows and write them to a CSV file indexed by recipe id.
# NOTE(review): `model` was last fitted on the X_train split above, not on all of
# df_full_train — confirm that this is intended for the submission.
pred_kaggle = model.predict(df_full_test) # check the score on Kaggle
submission = pd.DataFrame(pred_kaggle, index=df_full_test.index, columns=['cuisine'])
submission.to_csv('submission2.csv') # Kaggle score: 0.78368

Работа с n-граммами

In [59]:
from nltk import ngrams, everygrams
def get_ngrams(l: list, n: int):
    """Collect the distinct padded n-grams of every item in ``l``.

    Each item is passed to ``everygrams`` with ``max_len=n`` and left/right
    padding (``'<s>'`` / ``'<e>'``); every resulting gram is concatenated
    into a single string. For word strings this gives character-level grams.

    Args:
        l: iterable of items (word strings in this notebook).
        n: maximum gram length.

    Returns:
        A set with the joined grams of all items.
    """
    joined_grams = set()
    for word in l:
        padded_grams = everygrams(
            word,
            max_len=n,
            pad_left=True,
            pad_right=True,
            left_pad_symbol='<s>',
            right_pad_symbol='<e>',
        )
        for gram in padded_grams:
            joined_grams.add(''.join(gram))
    return joined_grams

In [60]:
def clean_some_symbols(ing_list: list):
    """Split ingredient names into unique word tokens, keeping a few symbols.

    Every character that is not a lowercase letter ``a-z``, one of the listed
    accented letters, or one of ``! & ' . ® ’ ™`` is replaced by a space, and
    each name is then split on whitespace.

    Args:
        ing_list: list of ingredient name strings.

    Returns:
        Sorted list of the distinct tokens found across all names. Sorting
        makes the result deterministic (plain set order is not).
    """
    # A flat set comprehension avoids the quadratic `sum(list_of_lists, [])` concatenation.
    tokens = {
        token
        for ing in ing_list
        for token in re.sub(r'[^a-zâçèéíîú!&\'\.\®’™]', ' ', ing).split()
    }
    return sorted(tokens)

In [61]:
def clean_letters_only(ing_list: list):
    """Split ingredient names into unique word tokens made of letters only.

    Every character that is not a lowercase letter ``a-z`` or one of the
    listed accented letters is replaced by a space, and each name is then
    split on whitespace.

    Args:
        ing_list: list of ingredient name strings.

    Returns:
        Sorted list of the distinct tokens found across all names. Sorting
        makes the result deterministic (plain set order is not).
    """
    # A flat set comprehension avoids the quadratic `sum(list_of_lists, [])` concatenation.
    tokens = {
        token
        for ing in ing_list
        for token in re.sub(r'[^a-zâçèéíîú]', ' ', ing).split()
    }
    return sorted(tokens)

In [62]:
def extract_ngrams_some_symbols(old_df):
    """Build per-recipe n-gram records from the 'ingredients' column.

    Each recipe's ingredient list is tokenized with ``clean_some_symbols``
    and the tokens are expanded with ``get_ngrams(..., 3)``. The input frame
    is not modified.

    Args:
        old_df: DataFrame with an 'ingredients' column of string lists.

    Returns:
        A list with one ``{'some_symb_ngrams': set_of_grams}`` dict per row.
    """
    vocab = old_df['ingredients'].map(clean_some_symbols)
    gram_sets = vocab.map(lambda words: get_ngrams(words, 3))
    return [{'some_symb_ngrams': grams} for grams in gram_sets]

In [63]:
def extract_ngrams_letters_only(old_df):
    """Build per-recipe n-gram records from letters-only tokens.

    Each recipe's ingredient list is tokenized with ``clean_letters_only``
    and the tokens are expanded with ``get_ngrams(..., 3)``. The input frame
    is copied, so the caller's frame is not modified.

    Args:
        old_df: DataFrame with an 'ingredients' column of string lists.

    Returns:
        A list with one ``{'letters_only_ngrams': set_of_grams}`` dict per row.
    """
    df = old_df.copy()
    # Use the letters-only tokenizer here; calling clean_some_symbols would make
    # this function an exact duplicate of extract_ngrams_some_symbols.
    df['voc_letters_only'] = df['ingredients'].apply(clean_letters_only)
    df['letters_only_ngrams'] = df['voc_letters_only'].apply(lambda x: get_ngrams(x, 3))
    return df[['letters_only_ngrams']].to_dict(orient='records')

In [64]:
# Per-recipe n-gram records for the train and the test frame.
ngrams_train = extract_ngrams_letters_only(train)
ngrams_test = extract_ngrams_letters_only(test)

In [65]:
# Fit the vocabulary on the train n-grams only, then encode test with that same vocabulary.
# NOTE(review): this refits the `vectorizer` object used for the binary-ingredient features
# earlier, so its feature names no longer match that earlier matrix after this cell.
X = vectorizer.fit_transform(ngrams_train)
X_kaggle = vectorizer.transform(ngrams_test)

In [66]:
# Sanity check: both matrices must have the same number of feature columns.
print(X.shape)
print(X_kaggle.shape)

(39774, 4059)
(9944, 4059)


In [70]:
# Dense DataFrame copy of the train n-gram matrix, indexed by recipe id.
# NOTE(review): X_df is not referenced by the later cells visible here (they use X directly).
X_df = pd.DataFrame(X.toarray(), index=train.index)

здесь попробуем Catboost

In [71]:
# Hold out 6000 recipes of the n-gram matrix for validation (same seed as the earlier split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=6000, random_state=10)

In [73]:
from catboost import CatBoostClassifier, Pool, cv
# Wrap the fitting and validation splits into CatBoost Pool objects.
train_pool = Pool(X_train, y_train)
val_pool = Pool(X_test, y_test)

In [76]:
# Multiclass CatBoost classifier evaluated on accuracy, with a fixed seed.
cb = CatBoostClassifier(objective='MultiClass', eval_metric='Accuracy', random_state=10)

In [77]:
# Fit on the training pool, passing the hold-out pool as eval_set; per-iteration logging is silenced.
cb.fit(train_pool, eval_set=val_pool, verbose = False)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

<catboost.core.CatBoostClassifier at 0x125570612b0>

In [78]:
# Accuracy of the CatBoost model on the fitting split, then on the hold-out split.
pred_train = cb.predict(X_train)
pred_test = cb.predict(X_test)
print(accuracy_score(y_train, pred_train))
print(accuracy_score(y_test, pred_test))

0.8346361106176349
0.7778333333333334


Best cross-validation score: 0.786

Kaggle score: 0.78318

Работа со структурой tfidf

In [79]:
def lem_str (ingr_lst):
    """Lemmatize a recipe's ingredients and flatten them into one string of words.

    Each ingredient name is split on single spaces, every word is passed
    through ``lemma`` and the words are re-joined with spaces. A space is
    appended after every ingredient, so a non-empty result ends with a space.

    Args:
        ingr_lst: list of ingredient name strings.

    Returns:
        One space-separated string; empty if the list is empty.
    """
    pieces = []
    for ingredient in ingr_lst:
        lemmas = [lemma(word) for word in ingredient.split(' ')]
        pieces.append(' '.join(lemmas) + ' ')
    return ''.join(pieces)

In [80]:
# One lemmatized, space-separated text per training recipe.
train['words'] = train['ingredients'].map(lem_str)

In [81]:
# Preview the training frame with the new 'words' column.
train

Unnamed: 0_level_0,cuisine,ingredients,words
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10259,greek,"[romaine lettuce, black olives, grape tomatoes...",romaine lettuce black olive grape tomatoe garl...
25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g...",plain flour grind pepper salt tomatoe grind bl...
20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g...",egg pepper salt mayonaise cook oil green chily...
22213,indian,"[water, vegetable oil, wheat, salt]",water vegetable oil wheat salt
13162,indian,"[black pepper, shallots, cornflour, cayenne pe...",black pepper shallot cornflour cayenne pepper ...
...,...,...,...
29109,irish,"[light brown sugar, granulated sugar, butter, ...",light brown sugar granulate sugar butter warm ...
11462,italian,"[kraft zesty italian dressing, purple onion, b...",kraft zesty italian dress purple onion broccol...
2238,irish,"[eggs, citrus fruit, raisins, sourdough starte...",egg citru fruit raisin sourdough starter flour...
41882,chinese,"[boneless chicken skinless thigh, minced garli...",boneles chicken skinles thigh mince garlic ste...


In [83]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# The following vectorizer variants were tried; only the uncommented one is kept.
# NOTE(review): the pipeline cell below builds its own TfidfVectorizer(binary=True),
# so `tfidf_bin` itself is not referenced by the later cells visible here.
#c_vectorizer_counts = CountVectorizer()
#c_vectorizer_bin = CountVectorizer(binary=True)
#c_vectorizer_counts_bigram = CountVectorizer(token_pattern=r'(?u)\b\w+_?\w+\b')
#c_vectorizer_bin_bigram = CountVectorizer(token_pattern=r'(?u)\b\w+_?\w+\b', binary=True)
#tfidf = TfidfVectorizer()
tfidf_bin = TfidfVectorizer(binary=True)
#tfidf_bigram = TfidfVectorizer(token_pattern=r'(?u)\b\w+_?\w+\b')
#tfidf_bin_bigram = TfidfVectorizer(token_pattern=r'(?u)\b\w+_?\w+\b', binary=True)

In [86]:
# One lemmatized, space-separated text per test recipe.
test['words'] = test['ingredients'].map(lem_str)

In [88]:
# Hold out 8000 recipe texts for validation; vectorization happens later inside the pipeline.
X_train, X_test, y_train, y_test = train_test_split(train['words'], y, test_size=8000, random_state=10)

In [98]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Binary TF-IDF features followed by an SVC; these are the best parameters found after tuning.
pipe = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(binary=True)),
    ('svc', SVC(C=3, max_iter=10000, random_state=10)),
])

In [99]:
# Fit both pipeline steps (TF-IDF vocabulary and SVC) on the training texts only.
pipe.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer(binary=True)),
                ('svc', SVC(C=3, max_iter=10000, random_state=10))])

In [100]:
# Accuracy of the TF-IDF + SVC pipeline on the fitting split, then on the hold-out split.
pred_train, pred_test = pipe.predict(X_train), pipe.predict(X_test)
print(
    accuracy_score(y_train, pred_train),
    accuracy_score(y_test, pred_test),
    sep='\n',
)

0.9947126581481714
0.811875


In [101]:
# Mean accuracy over 5-fold cross-validation of the whole pipeline on all training texts.
cross_val_score(pipe, train['words'], train['cuisine'], cv=5, scoring='accuracy').mean()

0.8099512802005624

Kaggle score: 0.81114

работа в pyspark будет сдана Александром Зубковым в отдельном ноутбуке