# Import and Prepare Data

In [22]:
import pandas as pd
data = pd.read_parquet('data_with_categories.parquet').reset_index(drop = True)

In [23]:
data

Unnamed: 0,product_id,category_id,sale,shop_id,shop_title,rating,text_fields,category_name,cat_1,cat_2
0,325286,12171,False,9031,Aksik,5.00000,"{""title"": ""Зарядный кабель Borofone BX1 Lightn...",Аксессуары и запчасти->Зарядные устройства и к...,Электроника,Смартфоны и телефоны
1,888134,14233,False,18305,Sela,5.00000,"{""title"": ""Трусы Sela"", ""description"": ""Трусы-...",Белье и купальники->Трусы,Одежда,Женская одежда
2,1267173,13429,False,16357,ЮНЛАНДИЯ канцтовары,5.00000,"{""title"": ""Гуашь \""ЮНЫЙ ВОЛШЕБНИК\"", 12 цветов...","Краски, пигменты",Хобби и творчество,Рисование
3,1416943,2789,False,34666,вася-nicotine,4.00000,"{""title"": ""Колба для кальяна Крафт (разные цве...",Кальяны и аксессуары->Колбы,Хобби и творчество,Товары для курения
4,1058275,12834,False,26389,Lim Market,4.60000,"{""title"": ""Пижама женская, однотонная с шортам...",Домашняя одежда->Пижамы,Одежда,Женская одежда
...,...,...,...,...,...,...,...,...,...,...
91115,114402,14922,False,4955,СТЕКЛОФФ ПРО,3.62069,"{""title"": ""Прочное стекло 2D на Samsung Galaxy...",Аксессуары и запчасти->Защитные стекла и пленк...,Электроника,Смартфоны и телефоны
91116,1594500,13028,False,19626,Hobby room,5.00000,"{""title"": ""Алмазная мозаика \""Ромашки\"" 40*50с...",Алмазные мозаики,Хобби и творчество,"Пазлы, мозаика и фреска"
91117,790493,13407,False,22291,Море открыток,5.00000,"{""title"": ""Открытка \""Вместе навсегда\"" в краф...",Открытки и конверты->Открытки,Товары для дома,Товары для праздников
91118,114509,12100,False,2985,Oppa Market,5.00000,"{""title"": ""Пульт K10B-C1 для Rolsen"", ""descrip...",Оборудование для телевизоров->Пульты ДУ,Электроника,Телевизоры и видеотехника


In [4]:
import json
def parse_desc(text):
    titles = []
    desc = []
    for i, doc in enumerate(text):
        titles.append(json.loads(doc)['title'])
        desc.append(json.loads(doc)['description'])
    return titles, desc

In [5]:
data['title'], data['description'] = parse_desc(list(data['text_fields']))
data.drop(columns=['text_fields'], inplace = True)

In [6]:
data = data.drop(columns=['cat_2',
                   'category_name',
                   'rating',
                   'shop_title',
                   'sale',
                   'product_id',
                   'category_id'])

In [7]:
def clean_description(corpus): # функция для очистки значений как <что-то>
    
    to_output = True
    out_str = ''
    output = []
    
    for text in corpus:
        for ch in text:
            
            if ch == '<':
                to_output = False
                
            if ch == '>':
                to_output = True
                continue
                
            if to_output:
                out_str += ch
                
        output.append(out_str)
        out_str = ''
    
    return output
data['description'] = clean_description(list(data['description']))

In [8]:
data['cat_1'], _ = data['cat_1'].factorize()

In [9]:
import nltk
from nltk import word_tokenize
nltk.download("punkt")
nltk.download('stopwords')
from nltk.corpus import stopwords 
from nltk.stem import SnowballStemmer
from sklearn.pipeline import Pipeline
import string
import numpy as np
import pymorphy2

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vnvof\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vnvof\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
ru_sw = stopwords.words("russian") # ru stop words
snowball = SnowballStemmer(language = "russian")# stemming
morph = pymorphy2.MorphAnalyzer()

def token_all_proccesing(df: str, remove_stop_words: bool = True): # func
    output = ''
    tokens = word_tokenize(df, language="russian") # tokenize
    tokens = [i for i in tokens if i not in string.punctuation] # tokenize w/o punctuation
    if remove_stop_words:
        tokens = [i for i in tokens if i not in ru_sw]# remove ru stopword
   #tokens = [snowball.stem(i) for i in tokens] # snowball stemming
    tokens = [morph.parse(i)[0].normal_form for i in tokens] # lemmatization
    
    
    for i in tokens:
        output += i + ' '
    
    return output

In [11]:
data['description'] = data['description'].apply(func = lambda x: token_all_proccesing((x)))

# Models Traininig

## Catboost

In [12]:
from sklearn.model_selection import train_test_split
train, test = train_test_split (data, test_size= 0.2, random_state= 14)

In [24]:
train

Unnamed: 0,shop_id,cat_1,title,description
49942,11132,0,Штекер 3.5*1.1*9.5мм,размер 3.5х1.1х9.5ммспособ монтаж кабельфункци...
53915,13422,3,Прихватка Этель Серпента черный 18х18 100% хлопок,весь известно сколько время проводить кухня ка...
6553,22374,3,"Кухонное полотенце махровое 30х50 см ""Триада"" ...",махровый кухонный полотенце premier textile мя...
15668,19438,1,Джинсы женские белые,джинсы белые женский американка
69430,20731,2,"Набор для изготовления мыла ""Мыло Craft Dino"" ...",с помощь удивительный набор « мыло dino craft ...
...,...,...,...,...
89446,23185,4,"Ботинки женские, Юничел",материал верх текстиль натуральный кожавид кож...
44793,10580,2,"""Японская маска""-картина по номерам 50х40",набор укомплектовать весь необходимый начало р...
22855,16144,2,Альбом для монет,альбом предназначить монета обеспечивать сохра...
9484,35692,4,Кроссовки женские весна лето демисезонные белы...,кроссовок женский незаменимый вещь гардероб мо...


In [13]:
from catboost import CatBoostClassifier

text_data = ['description', 'title']

model_catboost = CatBoostClassifier(
                           task_type="GPU",
                           devices='0:1',
                           text_features= text_data,
                           depth = 12,
                           iterations = 1000)
model_catboost.fit(train.drop(columns = ['cat_1']), train['cat_1'])

  from pandas import MultiIndex, Int64Index


Learning rate set to 0.155695
0:	learn: 1.0923728	total: 346ms	remaining: 5m 45s
1:	learn: 0.8469029	total: 759ms	remaining: 6m 18s
2:	learn: 0.6852565	total: 1.12s	remaining: 6m 11s
3:	learn: 0.5690383	total: 1.43s	remaining: 5m 56s
4:	learn: 0.4826458	total: 1.8s	remaining: 5m 57s
5:	learn: 0.4143266	total: 2.1s	remaining: 5m 47s
6:	learn: 0.3600895	total: 2.42s	remaining: 5m 43s
7:	learn: 0.3173358	total: 2.88s	remaining: 5m 57s
8:	learn: 0.2817271	total: 3.21s	remaining: 5m 53s
9:	learn: 0.2525479	total: 3.66s	remaining: 6m 2s
10:	learn: 0.2283240	total: 4.03s	remaining: 6m 2s
11:	learn: 0.2075513	total: 4.43s	remaining: 6m 5s
12:	learn: 0.1904229	total: 4.91s	remaining: 6m 12s
13:	learn: 0.1761564	total: 5.43s	remaining: 6m 22s
14:	learn: 0.1637054	total: 5.9s	remaining: 6m 27s
15:	learn: 0.1529001	total: 6.35s	remaining: 6m 30s
16:	learn: 0.1438547	total: 6.83s	remaining: 6m 35s
17:	learn: 0.1355486	total: 7.19s	remaining: 6m 32s
18:	learn: 0.1286528	total: 7.57s	remaining: 6m 31

158:	learn: 0.0597450	total: 37.9s	remaining: 3m 20s
159:	learn: 0.0596636	total: 38.1s	remaining: 3m 20s
160:	learn: 0.0595943	total: 38.4s	remaining: 3m 19s
161:	learn: 0.0595159	total: 38.6s	remaining: 3m 19s
162:	learn: 0.0594225	total: 38.8s	remaining: 3m 19s
163:	learn: 0.0593482	total: 39s	remaining: 3m 18s
164:	learn: 0.0592776	total: 39.2s	remaining: 3m 18s
165:	learn: 0.0591244	total: 39.4s	remaining: 3m 17s
166:	learn: 0.0590696	total: 39.6s	remaining: 3m 17s
167:	learn: 0.0589706	total: 39.8s	remaining: 3m 17s
168:	learn: 0.0588184	total: 40s	remaining: 3m 16s
169:	learn: 0.0587230	total: 40.2s	remaining: 3m 16s
170:	learn: 0.0586597	total: 40.4s	remaining: 3m 15s
171:	learn: 0.0585529	total: 40.6s	remaining: 3m 15s
172:	learn: 0.0584940	total: 40.8s	remaining: 3m 15s
173:	learn: 0.0583830	total: 41s	remaining: 3m 14s
174:	learn: 0.0582801	total: 41.2s	remaining: 3m 14s
175:	learn: 0.0581332	total: 41.4s	remaining: 3m 14s
176:	learn: 0.0580261	total: 41.7s	remaining: 3m 13s

314:	learn: 0.0483051	total: 1m 10s	remaining: 2m 32s
315:	learn: 0.0482712	total: 1m 10s	remaining: 2m 31s
316:	learn: 0.0482092	total: 1m 10s	remaining: 2m 31s
317:	learn: 0.0481742	total: 1m 10s	remaining: 2m 31s
318:	learn: 0.0481411	total: 1m 10s	remaining: 2m 31s
319:	learn: 0.0481069	total: 1m 10s	remaining: 2m 30s
320:	learn: 0.0480641	total: 1m 11s	remaining: 2m 30s
321:	learn: 0.0480081	total: 1m 11s	remaining: 2m 30s
322:	learn: 0.0479639	total: 1m 11s	remaining: 2m 30s
323:	learn: 0.0479077	total: 1m 11s	remaining: 2m 29s
324:	learn: 0.0478546	total: 1m 12s	remaining: 2m 29s
325:	learn: 0.0478190	total: 1m 12s	remaining: 2m 29s
326:	learn: 0.0477716	total: 1m 12s	remaining: 2m 29s
327:	learn: 0.0476938	total: 1m 12s	remaining: 2m 28s
328:	learn: 0.0476379	total: 1m 12s	remaining: 2m 28s
329:	learn: 0.0475918	total: 1m 12s	remaining: 2m 28s
330:	learn: 0.0475284	total: 1m 13s	remaining: 2m 27s
331:	learn: 0.0474667	total: 1m 13s	remaining: 2m 27s
332:	learn: 0.0474150	total:

467:	learn: 0.0417471	total: 1m 40s	remaining: 1m 54s
468:	learn: 0.0417060	total: 1m 40s	remaining: 1m 54s
469:	learn: 0.0416852	total: 1m 41s	remaining: 1m 53s
470:	learn: 0.0416440	total: 1m 41s	remaining: 1m 53s
471:	learn: 0.0415948	total: 1m 41s	remaining: 1m 53s
472:	learn: 0.0415608	total: 1m 41s	remaining: 1m 53s
473:	learn: 0.0415143	total: 1m 41s	remaining: 1m 53s
474:	learn: 0.0414892	total: 1m 42s	remaining: 1m 52s
475:	learn: 0.0414483	total: 1m 42s	remaining: 1m 52s
476:	learn: 0.0414185	total: 1m 42s	remaining: 1m 52s
477:	learn: 0.0413783	total: 1m 42s	remaining: 1m 52s
478:	learn: 0.0413558	total: 1m 42s	remaining: 1m 51s
479:	learn: 0.0413299	total: 1m 43s	remaining: 1m 51s
480:	learn: 0.0413015	total: 1m 43s	remaining: 1m 51s
481:	learn: 0.0412738	total: 1m 43s	remaining: 1m 51s
482:	learn: 0.0412582	total: 1m 43s	remaining: 1m 50s
483:	learn: 0.0412170	total: 1m 43s	remaining: 1m 50s
484:	learn: 0.0411661	total: 1m 44s	remaining: 1m 50s
485:	learn: 0.0411249	total:

621:	learn: 0.0367460	total: 2m 11s	remaining: 1m 20s
622:	learn: 0.0367057	total: 2m 11s	remaining: 1m 19s
623:	learn: 0.0366802	total: 2m 12s	remaining: 1m 19s
624:	learn: 0.0366621	total: 2m 12s	remaining: 1m 19s
625:	learn: 0.0366303	total: 2m 12s	remaining: 1m 19s
626:	learn: 0.0365957	total: 2m 12s	remaining: 1m 18s
627:	learn: 0.0365624	total: 2m 12s	remaining: 1m 18s
628:	learn: 0.0365235	total: 2m 13s	remaining: 1m 18s
629:	learn: 0.0364979	total: 2m 13s	remaining: 1m 18s
630:	learn: 0.0364624	total: 2m 13s	remaining: 1m 18s
631:	learn: 0.0363959	total: 2m 13s	remaining: 1m 17s
632:	learn: 0.0363831	total: 2m 13s	remaining: 1m 17s
633:	learn: 0.0363626	total: 2m 14s	remaining: 1m 17s
634:	learn: 0.0363396	total: 2m 14s	remaining: 1m 17s
635:	learn: 0.0363164	total: 2m 14s	remaining: 1m 17s
636:	learn: 0.0362849	total: 2m 14s	remaining: 1m 16s
637:	learn: 0.0362726	total: 2m 14s	remaining: 1m 16s
638:	learn: 0.0362436	total: 2m 15s	remaining: 1m 16s
639:	learn: 0.0362261	total:

777:	learn: 0.0325253	total: 2m 43s	remaining: 46.5s
778:	learn: 0.0325092	total: 2m 43s	remaining: 46.3s
779:	learn: 0.0324916	total: 2m 43s	remaining: 46.1s
780:	learn: 0.0324751	total: 2m 43s	remaining: 45.9s
781:	learn: 0.0324515	total: 2m 43s	remaining: 45.7s
782:	learn: 0.0324357	total: 2m 43s	remaining: 45.4s
783:	learn: 0.0324174	total: 2m 44s	remaining: 45.2s
784:	learn: 0.0323850	total: 2m 44s	remaining: 45s
785:	learn: 0.0323613	total: 2m 44s	remaining: 44.8s
786:	learn: 0.0323459	total: 2m 44s	remaining: 44.6s
787:	learn: 0.0323174	total: 2m 44s	remaining: 44.4s
788:	learn: 0.0322979	total: 2m 45s	remaining: 44.2s
789:	learn: 0.0322039	total: 2m 45s	remaining: 44s
790:	learn: 0.0321752	total: 2m 45s	remaining: 43.8s
791:	learn: 0.0321502	total: 2m 45s	remaining: 43.6s
792:	learn: 0.0321296	total: 2m 46s	remaining: 43.3s
793:	learn: 0.0321154	total: 2m 46s	remaining: 43.1s
794:	learn: 0.0320875	total: 2m 46s	remaining: 42.9s
795:	learn: 0.0320689	total: 2m 46s	remaining: 42.

934:	learn: 0.0290654	total: 3m 14s	remaining: 13.5s
935:	learn: 0.0290299	total: 3m 14s	remaining: 13.3s
936:	learn: 0.0290119	total: 3m 14s	remaining: 13.1s
937:	learn: 0.0289982	total: 3m 14s	remaining: 12.9s
938:	learn: 0.0289601	total: 3m 15s	remaining: 12.7s
939:	learn: 0.0289373	total: 3m 15s	remaining: 12.5s
940:	learn: 0.0289136	total: 3m 15s	remaining: 12.3s
941:	learn: 0.0288972	total: 3m 15s	remaining: 12.1s
942:	learn: 0.0288814	total: 3m 15s	remaining: 11.8s
943:	learn: 0.0288509	total: 3m 16s	remaining: 11.6s
944:	learn: 0.0288434	total: 3m 16s	remaining: 11.4s
945:	learn: 0.0288235	total: 3m 16s	remaining: 11.2s
946:	learn: 0.0288144	total: 3m 16s	remaining: 11s
947:	learn: 0.0288054	total: 3m 16s	remaining: 10.8s
948:	learn: 0.0287878	total: 3m 17s	remaining: 10.6s
949:	learn: 0.0287784	total: 3m 17s	remaining: 10.4s
950:	learn: 0.0287559	total: 3m 17s	remaining: 10.2s
951:	learn: 0.0287382	total: 3m 17s	remaining: 9.97s
952:	learn: 0.0287189	total: 3m 17s	remaining: 9

<catboost.core.CatBoostClassifier at 0x27ff9dbbd90>

In [14]:
predicted_catboost = model_catboost.predict(test.drop(columns = ['cat_1']))

## Log. Reg.

In [15]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords 
from nltk.stem import SnowballStemmer
import string

ru_sw = stopwords.words("russian") # ru stop words
snowball = SnowballStemmer(language = "russian")# stemming

def token_all_proccesing(df: str, remove_stop_words: bool = True): # func
    tokens = word_tokenize(df, language="russian") # tokenize
    tokens = [i for i in tokens if i not in string.punctuation] # tokenize w/o punctuation
    if remove_stop_words:
        tokens = [i for i in tokens if i not in ru_sw]# remove ru stopword
    tokens = [snowball.stem(i) for i in tokens] # snowball stemming
    return tokens

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

model_pipeline = Pipeline([("vectorizer",TfidfVectorizer(tokenizer= lambda x:token_all_proccesing(x, remove_stop_words= True),
                                                        max_features=1000,))
                                                        ,("model",LogisticRegression(random_state = 228, max_iter = 1000) )])
model_pipeline.fit(train['description'], train['cat_1'])

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(max_features=1000,
                                 tokenizer=<function <lambda> at 0x0000027FFFE69820>)),
                ('model', LogisticRegression(max_iter=1000, random_state=228))])

In [17]:
predicted_logreg = model_pipeline.predict(test['description'])

# Scoring

In [18]:
from sklearn.metrics import f1_score
print ('Catboost F1 - ')
print (f1_score(predicted_catboost, test['cat_1'].values, average = 'weighted')) # 0.9795713810

Catboost F1 - 
0.9797209590213257


In [19]:
from sklearn.metrics import f1_score
print ('Log.Reg. F1 - ')
print (f1_score(predicted_logreg, test['cat_1'].values, average = 'weighted'))

Log.Reg. F1 - 
0.9065964772914754


In [20]:
model_catboost.save_model('models/model_catboost_big_cat',
           format="cbm",
           export_parameters=None,
           pool=None)