In [1]:
import sys
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from pathlib import Path
import warnings
import config
import preprocess

from collections import defaultdict
from catboost import CatBoostClassifier, Pool

# Make the local ./cython_loss directory importable. The path is relative, so it
# resolves against the kernel's current working directory.
sys.path.append('./cython_loss')
# Install pyximport's import hook before importing cython1 so the Cython module
# can be built at import time (presumably cython_loss/cython1.pyx — confirm).
import pyximport; pyximport.install()
import cython1
# Custom objective class passed to CatBoost as `loss_function` further down.
from cython1 import LoglossObjective_cython

# Keep DeprecationWarning chatter out of the cell outputs.
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

# Notebook-wide display defaults: longer DataFrame previews, dark plot theme,
# wide figures and 50-bin histograms. The style is applied first so the explicit
# rcParams below are set on top of it.
pd.set_option('display.max_rows', 120)
plt.style.use("dark_background")
plt.rcParams.update({
    'figure.figsize': (12, 5),
    "hist.bins": 50,
})

# Show the working directory; the relative './cython_loss' import path above
# is resolved against it.
print('cur_dir', Path.cwd())

cur_dir /home/sergey/mnt/st1500/Usr/Sergey/TheJob/Otus/ML_advanced2020/final_project


In [2]:
# Location of the encoded input CSV. Defaults to the original author's local
# path; set the OTUS_DATA_CSV environment variable to run on another machine.
data_csv = Path(os.environ.get(
    "OTUS_DATA_CSV",
    "/home/sergey/mnt/4.5Tb/Downloads/otus_final_input/data_encoded.csv",
))
data = pd.read_csv(data_csv, low_memory=False).astype(config.dtypes)
data = preprocess.preprocess(data)

# The split below selects validation rows and drops them from the training set
# by index label, so duplicate labels would silently remove extra training rows.
assert data.index.is_unique, "train/validation split requires a unique index"

# Validation set: the last row of every client; training set: all other rows.
# `pop` removes the 'target' column from the feature frame and returns it.
X_val = data.groupby(data.Client_Id).tail(1)
y_val = X_val.pop('target')
X_train = data.drop(labels=X_val.index)
y_train = X_train.pop('target')

In [3]:
# Per-client aggregates (count / mean / median of every column), computed from
# the training rows only and then attached to both splits by Client_Id.
client_id_activity = X_train.groupby(data.Client_Id).agg(['count', 'mean', 'median'])
with warnings.catch_warnings():
    # The aggregate frame has two-level column labels while X_train / X_val have
    # one level; the UserWarning raised by that merge is presumably what is
    # being silenced here.
    warnings.filterwarnings("ignore", category=UserWarning)

    X_train = X_train.merge(client_id_activity, how='left', left_on='Client_Id', right_index=True)
    X_val = X_val.merge(client_id_activity, how='left', left_on='Client_Id', right_index=True)

# Drop incomplete training rows together with their labels. The original called
# .dropna() on X_train only, so any dropped row left y_train longer than X_train
# and the Pool below received mismatched features and labels.
assert len(X_train) == len(y_train), "left merge must keep one row per training label"
complete_rows = X_train.notna().all(axis=1).to_numpy()
X_train = X_train[complete_rows]
y_train = y_train[complete_rows]
# Validation rows are deliberately kept even when they contain NaN, so X_val and
# y_val stay aligned without filtering.

# Materialise the categorical feature names as a list instead of passing a
# dict view to CatBoost.
cat_features = list(config.categorical_dtypes_ft)
train_pool = Pool(data=X_train, label=y_train,
                  cat_features=cat_features)
val_pool = Pool(data=X_val, label=y_val,
                cat_features=cat_features)

## Встроенная (родная) Logloss, CatBoost

In [4]:
%%time

# Reference run: CatBoost with its built-in 'Logloss' objective, fitted for a
# handful of iterations on train_pool and evaluated on val_pool.
iterations = 5
# Build a fresh dict rather than calling config.params.update(...): the original
# mutated the dict owned by the config module, so these overrides leaked into
# every later use of config.params.
params = {
    **config.params,
    'loss_function': 'Logloss',
    'iterations': iterations,
    'verbose': iterations // 5,  # evaluates to 1 for iterations = 5
}
clf_ref_logloss = CatBoostClassifier(**params)
clf_ref_logloss.fit(X=train_pool,
        eval_set=val_pool,
        use_best_model=True)


0:	learn: 0.7196265	test: 0.6101332	best: 0.6101332 (0)	total: 223ms	remaining: 892ms
1:	learn: 0.7196265	test: 0.6101332	best: 0.6101332 (0)	total: 402ms	remaining: 603ms
2:	learn: 0.7196265	test: 0.6101332	best: 0.6101332 (0)	total: 598ms	remaining: 399ms
3:	learn: 0.7246614	test: 0.6145145	best: 0.6145145 (3)	total: 769ms	remaining: 192ms
4:	learn: 0.7246614	test: 0.6145145	best: 0.6145145 (3)	total: 974ms	remaining: 0us

bestTest = 0.6145144846
bestIteration = 3

Shrink model to first 4 iterations.
CPU times: user 3.98 s, sys: 244 ms, total: 4.23 s
Wall time: 1.64 s


<catboost.core.CatBoostClassifier at 0x7f76a5275c40>

## Внешняя (пользовательская) Logloss на Cython, CatBoost

In [5]:
%%time
# Same short fit, but with the objective supplied by LoglossObjective_cython
# (imported from the cython1 module) instead of CatBoost's built-in 'Logloss'.
iterations = 5
# Build a fresh dict rather than calling config.params.update(...): the original
# mutated the dict owned by the config module, leaving a custom objective
# instance stored in config.params for any later reader.
params = {
    **config.params,
    'loss_function': LoglossObjective_cython(),
    'iterations': iterations,
    'verbose': iterations // 5,  # evaluates to 1 for iterations = 5
}
# NOTE(review): this rebinds `clf_ref_logloss`, so the model fitted with the
# built-in objective is no longer reachable afterwards. The name is kept because
# later cells may rely on it; consider a separate name for side-by-side comparison.
# NOTE(review): the recorded output of this cell shows the same learn/test
# metric on all 5 iterations — worth checking the objective's derivatives.
clf_ref_logloss = CatBoostClassifier(**params)
clf_ref_logloss.fit(X=train_pool,
        eval_set=val_pool,
        use_best_model=True)


0:	learn: 0.7196265	test: 0.6101332	best: 0.6101332 (0)	total: 1.46s	remaining: 5.84s
1:	learn: 0.7196265	test: 0.6101332	best: 0.6101332 (0)	total: 2.92s	remaining: 4.38s
2:	learn: 0.7196265	test: 0.6101332	best: 0.6101332 (0)	total: 4.4s	remaining: 2.93s
3:	learn: 0.7196265	test: 0.6101332	best: 0.6101332 (0)	total: 5.8s	remaining: 1.45s
4:	learn: 0.7196265	test: 0.6101332	best: 0.6101332 (0)	total: 7.27s	remaining: 0us

bestTest = 0.6101332138
bestIteration = 0

Shrink model to first 1 iterations.
CPU times: user 10.7 s, sys: 1.13 s, total: 11.8 s
Wall time: 7.82 s


<catboost.core.CatBoostClassifier at 0x7f76706134f0>