In [1]:
%matplotlib inline

import gc
import os
import sys
import pandas as pd
import numpy as np
import tqdm
import seaborn as sns
from scipy.stats import mode

from catboost import CatBoostClassifier, Pool
from catboost import Pool, EShapCalcType, EFeaturesSelectionAlgorithm
from sklearn.model_selection import cross_val_score
from sklearn.metrics import recall_score, precision_score
import platform
from collections import Counter
from sklearn.metrics import recall_score, matthews_corrcoef
from lightgbm import LGBMClassifier, log_evaluation
from lightgbm import early_stopping

import matplotlib
import matplotlib.pyplot as plt 

from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score, matthews_corrcoef, accuracy_score
from sklearn.metrics import recall_score, precision_score
from sklearn.metrics import classification_report
from scipy import stats

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# If CUDA is available, the catboost experiments below will need it;
# only device 0 is exposed to the process.
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

# Train on the GPU everywhere except on the host named 'VLAD2016', which uses the CPU.
task_type = 'CPU' if platform.node() == 'VLAD2016' else 'GPU'

import warnings

# Ignore whole warning categories (registered in the same order as before).
for _category in (FutureWarning, DeprecationWarning, UserWarning, RuntimeWarning):
    warnings.simplefilter(action = 'ignore', category = _category)

# Ignore warnings whose message starts with one of these numpy notices.
for _message in ("numpy.dtype size changed", "numpy.ufunc size changed"):
    warnings.filterwarnings("ignore", message = _message)

# Turn off pandas' chained-assignment warning.
pd.options.mode.chained_assignment = None

# Debug switch; it is not read by any of the cells visible in this notebook.
DEBUG = False

# Label columns of the training set. They are excluded from the feature list.
targets = [
    'Релевантность', 'Таксономия релевантные', 'Таксономия не релевантные',
    'Длина отзыва', 'Ценности',
]

# The single label this notebook trains on and predicts.
target = 'Таксономия не релевантные'

In [2]:
# Load the labelled training set and preview its first three rows.
dtrain = pd.read_csv(
    '../input/train_dataset_train.csv'
)
dtrain.head(n = 3)

Unnamed: 0,RecordNo,Название книги,Автор,Ссылка на литрес,Рейтинг,Количество оценок,Количество отзывов,Имя читателя,Оценка книги читателем (из 5 баллов),Отзыв,Лайки на отзыв,Дислайки на отзыв,Релевантность,Таксономия релевантные,Таксономия не релевантные,Длина отзыва,Ценности
0,6145,Зулейха открывает глаза,Гузель Яхина,https://www.litres.ru/guzel-yahina/zuleyha-otk...,4.7,3922,408,Айгуль Ляпина,5.0,Рекомендую книгу в прочтению/прослушиванию. Ес...,0,3,0,0,0,0,0
1,7006,Зулейха открывает глаза,Гузель Яхина,https://www.litres.ru/guzel-yahina/zuleyha-otk...,4.6,24719,2103,Olga T,5.0,"Удивительно, что сейчас возникает ТАКАЯ литера...",0,1,0,0,0,0,1
2,1124,Дети мои,Гузель Яхина,https://www.litres.ru/guzel-yahina/deti-moi/,4.4,8032,702,Кирилл Чириков,5.0,"Душевно, жизненно, чувственно, проникновенно!!...",0,0,0,1,0,0,1


In [3]:
# Class balance of the selected target in the training set:
# one "<label> <number of rows>" line per distinct label, in sorted order.
for label in sorted(dtrain[target].unique()):
    n_rows = int((dtrain[target] == label).sum())
    print(label, n_rows)

0 4225
1 614


In [4]:
# Load the unlabelled test set (the literal string 'NULL' is read as missing)
# and preview its first three rows.
dtest = pd.read_csv('../input/test_dataset_test.csv', na_values = 'NULL')
dtest.head(n = 3)

Unnamed: 0,RecordNo,Название книги,Автор,Ссылка на литрес,Рейтинг,Количество оценок,Количество отзывов,Имя читателя,Оценка книги читателем (из 5 баллов),Отзыв,Лайки на отзыв,Дислайки на отзыв
0,3366,Дети мои,Гузель Яхина,https://www.litres.ru/guzel-yahina/deti-moi/,4.4,8032,702,Марина Ефимкина,5.0,"Настоящая глубокая книга, коих сейчас очень не...",2,0
1,3952,Текст,Дмитрий Глуховский,https://www.litres.ru/dmitriy-gluhovskiy/tekst...,4.5,1923,246,alexvarp,5.0,Одна из лучших книг прочитанных за последние г...,1,0
2,6852,Текст,Дмитрий Глуховский,https://www.litres.ru/dmitriy-gluhovskiy/tekst/,4.4,7276,622,fb_154207611938008,5.0,"Тот случай, когда невозможно пройти мимо, не о...",0,0


In [5]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# Encode the categorical columns of dtrain / dtest in place.
#
# Each column is encoded on train + test concatenated, so both frames share one
# vocabulary. Missing values become the string 'none' before encoding.
#
# Columns with fewer distinct training values than ONE_HOT_MAX_CARDINALITY are
# one-hot encoded (the source column is replaced by '<column>_<i>' indicator
# columns); all others are label-encoded. The value 1 reproduces the original
# cut-off, i.e. every non-empty column is label-encoded.
#
# The one-hot branch no longer touches the feature list `use`: that list is
# only created in a later cell (from dtrain.columns), so referencing it here
# raised NameError on a fresh kernel.
ONE_HOT_MAX_CARDINALITY = 1

enc = OneHotEncoder(handle_unknown = 'ignore')
le  = LabelEncoder()
L = dtrain.shape[0]  # number of training rows; splits the concatenated arrays back
categorical_columns = [
    'Название книги', 'Автор', 'Имя читателя'
]

for u in categorical_columns:
    n_unique = len(dtrain[u].unique())
    print(u, n_unique)

    # Train and test values stacked in that order, as strings without NaN.
    df = pd.DataFrame(pd.concat([dtrain[u], dtest[u]], axis = 0))
    df[u] = df[u].fillna('none').astype(str)

    if n_unique < ONE_HOT_MAX_CARDINALITY:
        # Suffix the values with the column name before one-hot encoding.
        df[u] = df[u] + '_' + u

        temp = enc.fit_transform(df[u].values.reshape(-1, 1)).toarray()
        for i in range(temp.shape[1]):
            n = u + '_' + str(i)
            dtrain[n] = temp[:L, i]
            dtest[n] = temp[L:, i]

        # The raw column is replaced by its indicator columns.
        dtrain.drop(u, axis = 1, inplace = True)
        dtest.drop(u, axis = 1, inplace = True)

    else:
        temp = le.fit_transform(df[u].values.ravel())
        dtrain[u] = temp[:L]
        dtest[u]  = temp[L:]

gc.collect()

Название книги 103
Автор 59
Имя читателя 4263


36

In [6]:
# Feature columns fed to the model: every training column except the label
# columns and the litres link.
use = [
    column
    for column in dtrain.columns
    if column not in targets and column not in ['Ссылка на литрес']
]

In [7]:
# Text-processing options passed to CatBoostClassifier(text_processing=...).
#
# * one tokenizer, "Space", splitting on the " " delimiter;
# * five dictionaries with token_level_type "Letter" and gram_order "2".."6";
# * three feature-processing entries (BoW, NaiveBayes, BM25), each using all
#   five dictionaries and the "Space" tokenizer.
_NGRAM_DICTIONARIES = [
    ("BiGram", "2"),
    ("Trigram", "3"),
    ("Fourgram", "4"),
    ("Fivegram", "5"),
    ("Sixgram", "6"),
]
_DICTIONARY_IDS = [dictionary_id for dictionary_id, _ in _NGRAM_DICTIONARIES]

text_processing = {
    "tokenizers" : [
        {
            "tokenizer_id" : "Space",
            "separator_type" : "ByDelimiter",
            "delimiter" : " ",
        },
    ],

    "dictionaries" : [
        {
            "dictionary_id" : dictionary_id,
            "token_level_type" : "Letter",
            "max_dictionary_size" : "150000",
            "occurrence_lower_bound" : "1",
            "gram_order" : gram_order,
        }
        for dictionary_id, gram_order in _NGRAM_DICTIONARIES
    ],

    "feature_processing" : {
        "default" : [
            {
                # A fresh list per entry, as in a plain literal.
                "dictionaries_names" : list(_DICTIONARY_IDS),
                "feature_calcers" : [calcer],
                "tokenizers_names" : ["Space"],
            }
            for calcer in ("BoW", "NaiveBayes", "BM25")
        ],
    },
}

In [8]:
# Train SEEDS x 5 CatBoost models (one per seed and stratified fold), predict
# the test set with each of them, and store the per-row majority vote in
# dtest[target].
SEEDS = 3                      # number of differently seeded 5-fold splits
iterations = 10000             # upper bound on boosting rounds per model
early_stopping_rounds = 500    # passed to CatBoost as early_stopping_rounds
preds = []                     # one 1-D vector of test predictions per trained model


for seed in range(SEEDS):

    skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = seed)

    for train_index, test_index in skf.split(dtrain, dtrain[target]):
        # split() yields positional indices, so select rows by position;
        # label-based .loc is only equivalent while the index is 0..n-1.
        X_train, X_valid = dtrain.iloc[train_index], dtrain.iloc[test_index]

        model = CatBoostClassifier(
            text_features = ['Отзыв'],
            iterations = iterations,
            depth = 5,
            learning_rate = 0.1,
            reg_lambda = 0.1,
            loss_function = 'MultiClass',
            task_type = task_type,
            random_state = seed,
            early_stopping_rounds = early_stopping_rounds,
            verbose = 500,
            text_processing = text_processing
        )
        model.fit(
            X_train[use],
            X_train[target],
            eval_set = (X_valid[use], X_valid[target]),
        )
        # predict() may return a column vector; flatten it so that every model
        # contributes exactly one row to the vote matrix.
        preds.append(np.ravel(model.predict(dtest[use])))

# Majority vote over models. Stacking to (n_models, n_test_rows) and taking the
# mode along axis 0 gives one label per test row. np.ravel makes the result 1-D
# whether or not the installed SciPy keeps the reduced axis (the default changed
# in SciPy 1.11, which broke the former `[0][0]` indexing).
votes = np.vstack(preds)
preds = np.ravel(stats.mode(votes, axis = 0)[0])
dtest[target] = preds

0:	learn: 0.6115204	test: 0.6121881	best: 0.6121881 (0)	total: 6.52ms	remaining: 1m 5s
500:	learn: 0.0088350	test: 0.1143097	best: 0.1084603 (292)	total: 1.89s	remaining: 35.9s
bestTest = 0.1084603239
bestIteration = 292
Shrink model to first 293 iterations.
0:	learn: 0.6121313	test: 0.6130781	best: 0.6130781 (0)	total: 6.04ms	remaining: 1m
500:	learn: 0.0068883	test: 0.1705520	best: 0.1372497 (110)	total: 1.87s	remaining: 35.5s
bestTest = 0.1372497338
bestIteration = 110
Shrink model to first 111 iterations.
0:	learn: 0.6128875	test: 0.6123801	best: 0.6123801 (0)	total: 6.52ms	remaining: 1m 5s
500:	learn: 0.0094194	test: 0.0970488	best: 0.0928393 (356)	total: 1.92s	remaining: 36.5s
bestTest = 0.0928392962
bestIteration = 356
Shrink model to first 357 iterations.
0:	learn: 0.6126733	test: 0.6120071	best: 0.6120071 (0)	total: 6.2ms	remaining: 1m 1s
500:	learn: 0.0091527	test: 0.1013390	best: 0.0985613 (243)	total: 1.85s	remaining: 35.2s
bestTest = 0.0985613421
bestIteration = 243
Shrink

In [9]:
# Preview the submission columns: record id and predicted label.
dtest[['RecordNo', target]].head()

Unnamed: 0,RecordNo,Таксономия не релевантные
0,3366,0
1,3952,0
2,6852,0
3,4586,0
4,4677,0


In [10]:
# Write the submission file (record id + predicted label, no index column).
submission = dtest[['RecordNo', target]]
submission.to_csv('03.csv', index = False)

In [11]:
# Distribution of the predicted labels on the test set:
# one "<label> <number of rows>" line per distinct label, in sorted order.
for label in sorted(dtest[target].unique()):
    n_rows = int((dtest[target] == label).sum())
    print(label, n_rows)

0 1837
1 238


https://lk.hacks-ai.ru/758289/champ