# Подключаем необходимые библиотеки

In [1]:
%matplotlib inline

import gc
import os
import sys
import pandas as pd
import numpy as np
import tqdm
import seaborn as sns
from scipy.stats import mode

from catboost import CatBoostClassifier, Pool
from catboost import Pool, EShapCalcType, EFeaturesSelectionAlgorithm
from sklearn.model_selection import cross_val_score
from sklearn.metrics import recall_score, precision_score
import platform
from collections import Counter
from sklearn.metrics import recall_score, matthews_corrcoef
from lightgbm import LGBMClassifier, log_evaluation
from lightgbm import early_stopping

import matplotlib
import matplotlib.pyplot as plt 

from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score, matthews_corrcoef, accuracy_score
from sklearn.metrics import recall_score, precision_score
from sklearn.metrics import classification_report
from scipy import stats

from sklearn.preprocessing import LabelEncoder

# если у вас есть CUDA, то она понадобится там для экспериментов в catboost
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

task_type = 'GPU'
if platform.node() == 'VLAD2016':
    task_type = 'CPU'
    
import warnings
warnings.simplefilter(action = 'ignore', category = FutureWarning)
warnings.simplefilter(action = 'ignore', category = DeprecationWarning)
warnings.simplefilter(action = 'ignore', category = UserWarning)
warnings.simplefilter(action = 'ignore', category = RuntimeWarning)
warnings.filterwarnings("ignore", message = "numpy.dtype size changed")
warnings.filterwarnings("ignore", message = "numpy.ufunc size changed")
pd.options.mode.chained_assignment = None

DEBUG = False
target = 'groups'

# Загружаю данные и вывожу первые 3 строки

In [2]:
dtrain = pd.read_csv('../input2/train_dataset_train.csv')
dtrain.head(3)

Unnamed: 0,id,name,groups
0,1034,ШОК-ЦЕНА Пена д/душа/бритья КУППЕР 200 мл АКС,10
1,1035,Мин.вода Нагутская №26 0.5л,0
2,1036,Пельмени Домашние вес ПО Прямицино.,10


In [3]:
dtest = pd.read_csv(
    '../input2/test_dataset_test.csv',
    na_values = 'NULL'
)
dtest.head(3)

Unnamed: 0,id,name
0,10529,"КАРАМЕЛЬ ""ЛАКОМКА-СУПЕРМОЛОЧНАЯ"" ВЕС (РОТ ФРОНТ)"
1,10530,"2: 3637233 РАЭ Масло ТРАДИЦ.слив.82,5% 1"
2,10531,960012-LG NS Бинт эластичный Classic лаймовый...


# Небольшая обработка - удаляю лишние пробелы и привожу в нижний регистр

In [4]:
for df in [dtrain, dtest]:
    df['name'] = df['name'].apply(lambda x: str(x).strip(' '))
    df['name'] = df['name'].apply(lambda x: str(x).lower())

# Удаляю дубликаты

In [5]:
print(dtrain.shape)

dtrain['hash'] = dtrain['name'].apply(lambda x: hash(x))

temp = dtrain.drop_duplicates(subset = ['hash'])

dublicates = dtrain[~dtrain.index.isin(temp.index)]
dtrain = dtrain.drop_duplicates(subset = ['hash'])
dtrain.reset_index(drop = True, inplace = True)
print(dtrain.shape)

#dublicates

(658064, 3)
(658041, 4)


# Добавляю колонку "len" (длина текста в колонке "name"). Сортирую по "len". Смотрю и удивляюсь.

In [6]:
dtrain['len'] = dtrain['name'].apply(lambda x: len(x))
dtrain.sort_values(by = ['len'], inplace = True)
dtrain

Unnamed: 0,id,name,groups,hash,len
33749,34784,"9""",10,1994261734662351582,2
113586,114621,".""",10,4596263415246832009,2
495140,496188,"3""",3,374543380145529213,2
438973,440014,"е""",10,7620594383579161725,2
10206,11240,"г""",10,638295317084937686,2
...,...,...,...,...,...
42212,43247,биг мак сет (с чикен макнаггетс 9 шт.): биг ма...,10,-3853305765383967215,128
350580,351621,прокладки бумажно-беловые. натурелла макси 7...,10,-1675504357224737276,128
304663,305704,(23224) l14221 лампа lynx p21/5w s25 12v21/5w ...,10,-6738428353102454581,128
509247,510296,",молоко,0.0,,10.0,1.0,2021-02-16,г.севастополь...",3,-7368307349825764719,135


# То же самое и для тестовых данных.

In [7]:
dtest['len'] = dtest['name'].apply(lambda x: len(str(x)))
dtest.sort_values(by = ['len'], inplace = True)
dtest

Unnamed: 0,id,name,len
16254,26783,1,1
70332,80861,"1""",2
140344,150873,")""",2
193446,203975,aa,2
252412,262941,"а""",2
...,...,...,...
78631,89160,лапша 70 гр стакан роллтон горячая порция лапш...,128
213517,224046,и.п.постников ментос меллер в ассортиментеф...,128
265881,276410,"и.п.постников карамель ""chupa chups xxl trio ""...",128
159833,170362,компливит аква д3 15000ме/мл. 10мл фл/кап капл...,128


# Кодирую целевцю переменую. Смотрю распределение по классам.

In [11]:
le_target = LabelEncoder()
dtrain[target] = le_target.fit_transform(dtrain[target].values.ravel())


for u in sorted(dtrain[target].unique()):
    print(u, dtrain[dtrain[target] == u].shape[0])

0 21566
1 17136
2 3902
3 21045
4 27591
5 10007
6 12051
7 42002
8 502741


# Разделяю на тренировочную и проверочную часть

In [12]:
train, valid = train_test_split(
    dtrain,
    test_size = 0.1, 
    random_state = 2,
    stratify = dtrain[target]
)

print(train.shape)
print(valid.shape)

(592236, 5)
(65805, 5)


# Описываю обработчик текстовой информации на основе n-грамм.

In [1]:
text_processing = {
    "tokenizers" : [{
        "tokenizer_id" : "Space",
        "separator_type" : "ByDelimiter",
        "delimiter" : " "
    }],

    "dictionaries" : [{
        "dictionary_id" : "BiGram",
        "token_level_type": "Letter",
        "max_dictionary_size" : "150000",
        "occurrence_lower_bound" : "1",
        "gram_order" : "2"
    },{
        "dictionary_id" : "Trigram",
        "max_dictionary_size" : "250000",
        "token_level_type": "Letter",
        "occurrence_lower_bound" : "1",
        "gram_order" : "3"
    },{
        "dictionary_id" : "Fourgram",
        "max_dictionary_size" : "150000",
        "token_level_type": "Letter",
        "occurrence_lower_bound" : "1",
        "gram_order" : "4"
    },{
        "dictionary_id" : "Fivegram",
        "max_dictionary_size" : "150000",
        "token_level_type": "Letter",
        "occurrence_lower_bound" : "1",
        "gram_order" : "5"
    },{
        "dictionary_id" : "Sixgram",
        "max_dictionary_size" : "150000",
        "token_level_type": "Letter",
        "occurrence_lower_bound" : "1",
        "gram_order" : "6"
        }
    ],

    "feature_processing" : {
        "default" : [{
            "dictionaries_names" : [
                "BiGram", 
                "Trigram", 
                "Fourgram", 
                "Fivegram", 
                "Sixgram"
            ],
            "feature_calcers" : ["BoW"],
            "tokenizers_names" : ["Space"]
        },{
            "dictionaries_names" : [
                "BiGram", 
                "Trigram", 
                "Fourgram", 
                "Fivegram", 
                "Sixgram"
            ],
            "feature_calcers" : ["NaiveBayes"],
            "tokenizers_names" : ["Space"]
        },{
            "dictionaries_names" : [
                "BiGram", 
                "Trigram", 
                "Fourgram", 
                "Fivegram", 
                "Sixgram"
            ],
            "feature_calcers" : ["BM25"],
            "tokenizers_names" : ["Space"]
        },
        ],
    }
}

The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.


In [13]:
use = ['name']

# Тренировка 3-х моделей

In [14]:
iterations = 28000
early_stopping_rounds = 500

SEEDS = 3
models = []

for seed in range(SEEDS):

    model = CatBoostClassifier(
        text_features = ['name'],
        iterations = iterations,
        depth = 5, 
        learning_rate = 0.1,
        reg_lambda = 0.3,
        loss_function = 'MultiClass',
        task_type = task_type,
        random_state = seed,
        early_stopping_rounds = early_stopping_rounds,
        verbose = 500,
        text_processing = text_processing
    )

    model.fit(
        train[use],
        train[target],
        eval_set = (valid[use], valid[target]),
    )
    models.append(model)

0:	learn: 1.5520291	test: 1.5519688	best: 1.5519688 (0)	total: 107ms	remaining: 49m 50s
500:	learn: 0.0417686	test: 0.0412566	best: 0.0412566 (500)	total: 43.9s	remaining: 40m 10s
1000:	learn: 0.0284204	test: 0.0304261	best: 0.0304261 (1000)	total: 1m 26s	remaining: 38m 42s
1500:	learn: 0.0223959	test: 0.0258557	best: 0.0258557 (1500)	total: 2m 7s	remaining: 37m 24s
2000:	learn: 0.0186691	test: 0.0233675	best: 0.0233675 (2000)	total: 2m 47s	remaining: 36m 21s
2500:	learn: 0.0158690	test: 0.0217251	best: 0.0217241 (2499)	total: 3m 29s	remaining: 35m 31s
3000:	learn: 0.0137159	test: 0.0205666	best: 0.0205666 (3000)	total: 4m 10s	remaining: 34m 43s
3500:	learn: 0.0119721	test: 0.0195488	best: 0.0195488 (3500)	total: 4m 51s	remaining: 34m
4000:	learn: 0.0105564	test: 0.0187354	best: 0.0187354 (4000)	total: 5m 33s	remaining: 33m 20s
4500:	learn: 0.0093773	test: 0.0181174	best: 0.0181174 (4500)	total: 6m 14s	remaining: 32m 36s
5000:	learn: 0.0084099	test: 0.0176286	best: 0.0176284 (4999)	tot

13500:	learn: 0.0020993	test: 0.0153727	best: 0.0153712 (13493)	total: 18m 32s	remaining: 19m 55s
14000:	learn: 0.0019699	test: 0.0153528	best: 0.0153446 (13971)	total: 19m 13s	remaining: 19m 13s
14500:	learn: 0.0018563	test: 0.0153440	best: 0.0153435 (14499)	total: 19m 55s	remaining: 18m 32s
15000:	learn: 0.0017410	test: 0.0153505	best: 0.0153334 (14728)	total: 20m 36s	remaining: 17m 51s
bestTest = 0.01533341047
bestIteration = 14728
Shrink model to first 14729 iterations.


# Смотрю на качество модели на проверочной части.

In [15]:
valid_preds = []

for model in models:
    valid_preds.append(model.predict(valid[use]).ravel())
    
valid_preds = stats.mode(valid_preds)[0][0]

print(classification_report(valid[target], valid_preds))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2157
           1       0.98      0.99      0.99      1714
           2       0.98      0.98      0.98       390
           3       0.99      1.00      1.00      2104
           4       1.00      1.00      1.00      2759
           5       0.96      0.97      0.96      1001
           6       0.97      0.98      0.97      1205
           7       1.00      1.00      1.00      4200
           8       1.00      1.00      1.00     50275

    accuracy                           1.00     65805
   macro avg       0.99      0.99      0.99     65805
weighted avg       1.00      1.00      1.00     65805



In [16]:
score = recall_score(valid[target], valid_preds, average = 'macro')
print("Recall score:", score)

Recall score: 0.9884951764356804


# Формируем и сохраняем файл решения

In [19]:
preds = []

for model in models:
    preds.append(model.predict(dtest[use]).ravel())
    
m = stats.mode(preds)[0][0]

In [20]:
dtest[target] = m
dtest[target] = dtest[target].astype(int)

In [21]:
dtest[target] = le_target.inverse_transform(dtest[target].values.ravel())
dtest[target] = dtest[target].astype(int)

In [22]:
dtest[['id', 'groups']].to_csv('final.csv', index = False)

In [23]:
dtest[['id', 'groups']].head()

Unnamed: 0,id,groups
16254,26783,10
70332,80861,10
140344,150873,10
193446,203975,10
252412,262941,10


In [24]:
for u in sorted(dtest[target].unique()):
    print(u, dtest[dtest[target] == u].shape[0])

0 9277
1 7553
2 1654
3 8989
4 11819
6 4383
7 5206
9 17843
10 215503


https://lk.hacks-ai.ru/758275/champ