In [1]:
%matplotlib inline

import gc
import os
import sys
import pandas as pd
import numpy as np
import tqdm
import seaborn as sns
from scipy.stats import mode

from catboost import CatBoostClassifier, Pool
from catboost import Pool, EShapCalcType, EFeaturesSelectionAlgorithm
from sklearn.model_selection import cross_val_score
from sklearn.metrics import recall_score, precision_score
import platform
from collections import Counter
from sklearn.metrics import recall_score, matthews_corrcoef
from lightgbm import LGBMClassifier, log_evaluation
from lightgbm import early_stopping

import matplotlib
import matplotlib.pyplot as plt 

from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score, matthews_corrcoef, accuracy_score
from sklearn.metrics import recall_score, precision_score
from sklearn.metrics import classification_report
from scipy import stats

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer


#from skopt.space import Real, Categorical, Integer
#from skopt.utils import use_named_args
#from skopt import gp_minimize


# Render every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# If CUDA is available, the CatBoost experiments below need it; select device 0.
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

# CatBoost runs on the GPU everywhere except on the host named 'VLAD2016'.
task_type = 'CPU' if platform.node() == 'VLAD2016' else 'GPU'

import warnings

# Ignore these warning categories altogether.
for ignored_category in (FutureWarning, DeprecationWarning, UserWarning, RuntimeWarning):
    warnings.simplefilter(action = 'ignore', category = ignored_category)

# Ignore the numpy binary-compatibility notices by message.
for ignored_message in ("numpy.dtype size changed", "numpy.ufunc size changed"):
    warnings.filterwarnings("ignore", message = ignored_message)

# Turn off pandas' chained-assignment warning.
pd.options.mode.chained_assignment = None

DEBUG = False

# All label columns of the training file; none of them may be used as a feature.
targets = [
    'Релевантность',
    'Таксономия релевантные',
    'Таксономия не релевантные',
    'Длина отзыва',
    'Ценности',
]

# The label column this notebook trains for.
target = 'Таксономия релевантные'

In [2]:
# Read the labelled training set and preview its first three rows.
dtrain = pd.read_csv(filepath_or_buffer = '../input/train_dataset_train.csv')
dtrain.head(n = 3)

Unnamed: 0,RecordNo,Название книги,Автор,Ссылка на литрес,Рейтинг,Количество оценок,Количество отзывов,Имя читателя,Оценка книги читателем (из 5 баллов),Отзыв,Лайки на отзыв,Дислайки на отзыв,Релевантность,Таксономия релевантные,Таксономия не релевантные,Длина отзыва,Ценности
0,6145,Зулейха открывает глаза,Гузель Яхина,https://www.litres.ru/guzel-yahina/zuleyha-otk...,4.7,3922,408,Айгуль Ляпина,5.0,Рекомендую книгу в прочтению/прослушиванию. Ес...,0,3,0,0,0,0,0
1,7006,Зулейха открывает глаза,Гузель Яхина,https://www.litres.ru/guzel-yahina/zuleyha-otk...,4.6,24719,2103,Olga T,5.0,"Удивительно, что сейчас возникает ТАКАЯ литера...",0,1,0,0,0,0,1
2,1124,Дети мои,Гузель Яхина,https://www.litres.ru/guzel-yahina/deti-moi/,4.4,8032,702,Кирилл Чириков,5.0,"Душевно, жизненно, чувственно, проникновенно!!...",0,0,0,1,0,0,1


In [3]:
# Print how many training rows carry each distinct value of the target.
for label in sorted(dtrain[target].unique()):
    n_rows = (dtrain[target] == label).sum()
    print(label, n_rows)

0 2360
1 2479


In [4]:
# Read the unlabelled test set (the string 'NULL' is parsed as missing) and
# preview its first three rows.
dtest = pd.read_csv(filepath_or_buffer = '../input/test_dataset_test.csv', na_values = 'NULL')
dtest.head(n = 3)

Unnamed: 0,RecordNo,Название книги,Автор,Ссылка на литрес,Рейтинг,Количество оценок,Количество отзывов,Имя читателя,Оценка книги читателем (из 5 баллов),Отзыв,Лайки на отзыв,Дислайки на отзыв
0,3366,Дети мои,Гузель Яхина,https://www.litres.ru/guzel-yahina/deti-moi/,4.4,8032,702,Марина Ефимкина,5.0,"Настоящая глубокая книга, коих сейчас очень не...",2,0
1,3952,Текст,Дмитрий Глуховский,https://www.litres.ru/dmitriy-gluhovskiy/tekst...,4.5,1923,246,alexvarp,5.0,Одна из лучших книг прочитанных за последние г...,1,0
2,6852,Текст,Дмитрий Глуховский,https://www.litres.ru/dmitriy-gluhovskiy/tekst/,4.4,7276,622,fb_154207611938008,5.0,"Тот случай, когда невозможно пройти мимо, не о...",0,0


In [5]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# Encode the categorical columns of dtrain and dtest in place.
#
# Each encoder is fitted on the train and test values stacked together, so both
# frames share one mapping. Missing values become the category 'none'.
enc = OneHotEncoder(handle_unknown = 'ignore')
le  = LabelEncoder()
L = dtrain.shape[0]  # number of training rows; splits the stacked values back apart
categorical_columns = [
    'Название книги', 'Автор', 'Имя читателя'
]

# A column is one-hot encoded only when its number of distinct training values
# is below this threshold; otherwise it is label-encoded. The value 1 reproduces
# the original `< 1` condition, under which every non-empty column is
# label-encoded. Raise it to enable one-hot encoding for low-cardinality columns.
one_hot_below = 1

for u in categorical_columns:
    n_unique = len(dtrain[u].unique())
    print(u, n_unique)

    # Train and test values of this column, stacked, as strings without NaN.
    values = pd.concat([dtrain[u], dtest[u]], axis = 0).fillna('none').astype(str)

    if n_unique < one_hot_below:
        temp = enc.fit_transform((values + '_' + u).values.reshape(-1, 1)).toarray()
        for i in range(temp.shape[1]):
            n = u + '_' + str(i)
            dtrain[n] = temp[:L, i]
            dtest[n] = temp[L:, i]

        # The feature list `use` is derived from dtrain.columns in the next
        # cell, so no bookkeeping is needed here (the previous version touched
        # `use` before it was defined, which fails on a fresh kernel).
        dtrain.drop(u, axis = 1, inplace = True)
        dtest.drop(u, axis = 1, inplace = True)

    else:
        temp = le.fit_transform(values.values)
        dtrain[u] = temp[:L]
        dtest[u]  = temp[L:]

gc.collect()

Название книги 103
Автор 59
Имя читателя 4263


36

In [6]:
# Feature columns: every column of dtrain that is neither a label column nor
# the book URL.
excluded_columns = ['Ссылка на литрес']
use = [
    f for f in dtrain.columns
    if f not in targets and f not in excluded_columns
]

In [7]:
# Character-level TF-IDF over the review text: n-grams of 1 to 5 characters,
# vocabulary capped at 50000 features.
tfidf_settings = dict(
    analyzer = 'char',
    ngram_range = (1, 5),
    max_features = 50000,
    lowercase = True,
    max_df = 1.0,
    min_df = 1,
    preprocessor = None,
    tokenizer = None,
    stop_words = None,
    token_pattern = None,
)
vectorizer = TfidfVectorizer(**tfidf_settings)

# The vocabulary is learned from the training reviews only and then applied to
# the test reviews.
train = vectorizer.fit_transform(dtrain['Отзыв'].values)
test  = vectorizer.transform(dtest['Отзыв'].values)

In [8]:
# Display the (rows, features) shape of the TF-IDF training matrix.
train.shape

(4839, 50000)

In [9]:
from scipy.sparse import coo_matrix, vstack, hstack

# Append every feature column except the raw review text (already represented
# by its TF-IDF columns) to the right of the sparse matrices.
extra_columns = [u for u in use if u != 'Отзыв']

if extra_columns:
    # Take a 2-D float block straight from the frame: indexing a Series with
    # `[:, None]` raises on pandas >= 2.0.
    train = hstack([train, coo_matrix(dtrain[extra_columns].to_numpy(dtype = 'float64'))])
    test  = hstack([test, coo_matrix(dtest[extra_columns].to_numpy(dtype = 'float64'))])

In [10]:
# Stratified hold-out split: 10% of the rows go to validation, with class
# proportions of the target preserved.
labels = dtrain[target]
X_train, X_valid, y_train, y_valid = train_test_split(
    train, labels, test_size = 0.1, random_state = 2, stratify = labels,
)

# Show the shapes of the feature parts, then of the label parts.
for train_part, valid_part in ((X_train, X_valid), (y_train, y_valid)):
    print(train_part.shape, valid_part.shape)

(4355, 50010) (484, 50010)
(4355,) (484,)


In [11]:
# Convert both feature matrices to CSR so they can be indexed by row.
train, test = train.tocsr(), test.tocsr()

# Column-wise maximum of the training matrix, shown as the cell output.
cols = train.max(axis = 0)
cols

<1x50010 sparse matrix of type '<class 'numpy.float64'>'
	with 50010 stored elements in COOrdinate format>

In [12]:
SEEDS = 3                      # number of differently seeded 5-fold runs
iterations = 10000             # upper bound on boosting rounds per model
early_stopping_rounds = 500    # passed to CatBoost as early_stopping_rounds


def majority_vote(predictions):
    """Return the most frequent label per sample across several models.

    Parameters
    ----------
    predictions : sequence of array-like
        One array of predicted labels per model. Each array is flattened, so
        shapes ``(n_samples,)`` and ``(n_samples, 1)`` are both accepted.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``n_samples`` holding the modal label per sample,
        as computed by ``scipy.stats.mode``.

    Raises
    ------
    ValueError
        If ``predictions`` is empty or the arrays differ in length
        (raised by ``numpy.stack``).
    """
    votes = np.stack([np.ravel(p) for p in predictions])
    # Older SciPy keeps the reduced axis and newer SciPy drops it; flattening
    # the result gives one label per sample either way.
    return np.ravel(stats.mode(votes, axis = 0)[0])


preds = []

for seed in range(SEEDS):

    skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = seed)

    for train_index, test_index in skf.split(dtrain, dtrain[target]):

        # The splitter yields row positions, so select labels positionally;
        # `.loc` would only be correct for a default RangeIndex.
        X_train, X_valid = train[train_index], train[test_index]
        y_train = dtrain[target].iloc[train_index]
        y_valid = dtrain[target].iloc[test_index]

        model = CatBoostClassifier(
            iterations = iterations,
            depth = 5,
            learning_rate = 0.1,
            reg_lambda = 0.1,
            loss_function = 'MultiClass',
            task_type = task_type,
            random_state = seed,
            early_stopping_rounds = early_stopping_rounds,
            verbose = 500,
        )
        # The held-out fold is the evaluation set that drives early stopping.
        model.fit(
            X_train,
            y_train,
            eval_set = (X_valid, y_valid),
        )
        preds.append(model.predict(test))

# Combine the SEEDS * 5 models by majority vote and store the result as the
# predicted target of the test frame.
preds = majority_vote(preds)
dtest[target] = preds



0:	learn: 0.6327639	test: 0.6354899	best: 0.6354899 (0)	total: 166ms	remaining: 27m 39s
500:	learn: 0.0274626	test: 0.1325872	best: 0.1323698 (492)	total: 38.3s	remaining: 12m 6s
1000:	learn: 0.0067557	test: 0.1348814	best: 0.1305351 (585)	total: 1m 15s	remaining: 11m 20s
bestTest = 0.1305351415
bestIteration = 585
Shrink model to first 586 iterations.
0:	learn: 0.6338053	test: 0.6337294	best: 0.6337294 (0)	total: 149ms	remaining: 24m 52s
500:	learn: 0.0308338	test: 0.1082068	best: 0.1075954 (495)	total: 38.7s	remaining: 12m 13s
1000:	learn: 0.0076646	test: 0.0982942	best: 0.0978538 (974)	total: 1m 16s	remaining: 11m 26s
1500:	learn: 0.0028299	test: 0.1031865	best: 0.0970419 (1050)	total: 1m 53s	remaining: 10m 43s
bestTest = 0.09704187094
bestIteration = 1050
Shrink model to first 1051 iterations.
0:	learn: 0.6333555	test: 0.6350420	best: 0.6350420 (0)	total: 141ms	remaining: 23m 33s
500:	learn: 0.0297013	test: 0.1223240	best: 0.1223240 (500)	total: 38.6s	remaining: 12m 11s
1000:	learn

In [13]:
# Preview the submission columns: record id plus the predicted target.
dtest[['RecordNo', target]].head()

Unnamed: 0,RecordNo,Таксономия релевантные
0,3366,0
1,3952,1
2,6852,0
3,4586,0
4,4677,1


In [14]:
# Write the record id and the predicted target to the submission file,
# without the DataFrame index.
dtest[['RecordNo', target]].to_csv('02.csv', index = False)

In [15]:
# Print how many test rows were assigned to each predicted value of the target.
for label in sorted(dtest[target].unique()):
    n_rows = (dtest[target] == label).sum()
    print(label, n_rows)

0 1100
1 975


https://lk.hacks-ai.ru/758289/champ