In [1]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score

In [2]:
train = pd.read_parquet('../data/train.parquet')
test = pd.read_parquet('../data/test.parquet')

In [3]:
X_train = train.drop(['target'], axis=1)
y_train = train['target']

X_test = test.drop(['target'], axis=1)
y_test = test['target']

In [4]:
train_pool = Pool(X_train, y_train)
test_pool = Pool(X_test, y_test)

### Without class weights

In [7]:
gbm = CatBoostClassifier(random_seed=42, iterations=30, early_stopping_rounds=10, thread_count=-1, custom_metric=['F1', 'Precision', 'Recall', 'AUC', 'Accuracy', 'Logloss'])

In [8]:
gbm.fit(train_pool, eval_set=test_pool, verbose=True, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.5
0:	learn: 0.0124651	test: 0.0116394	best: 0.0116394 (0)	total: 4.47s	remaining: 2m 9s
1:	learn: 0.0079809	test: 0.0068756	best: 0.0068756 (1)	total: 8.24s	remaining: 1m 55s
2:	learn: 0.0078043	test: 0.0067787	best: 0.0067787 (2)	total: 11.9s	remaining: 1m 46s
3:	learn: 0.0063653	test: 0.0055160	best: 0.0055160 (3)	total: 15.7s	remaining: 1m 41s
4:	learn: 0.0059201	test: 0.0051519	best: 0.0051519 (4)	total: 19.5s	remaining: 1m 37s
5:	learn: 0.0057397	test: 0.0050048	best: 0.0050048 (5)	total: 23.3s	remaining: 1m 33s
6:	learn: 0.0056858	test: 0.0049631	best: 0.0049631 (6)	total: 27.1s	remaining: 1m 28s
7:	learn: 0.0056052	test: 0.0048915	best: 0.0048915 (7)	total: 31.3s	remaining: 1m 25s
8:	learn: 0.0055876	test: 0.0048799	best: 0.0048799 (8)	total: 35.3s	remaining: 1m 22s
9:	learn: 0.0055802	test: 0.0048715	best: 0.0048715 (9)	total: 39.4s	remaining: 1m 18s
10:	learn: 0.0055600	test: 0.0048578	best: 0.0048578 (10)	total: 43.6s	remaining: 1m 15s
11:	learn: 0.0055

<catboost.core.CatBoostClassifier at 0x7feb67961990>

In [9]:
gbm.save_model('../models/gbm.cbm', format="cbm")

In [10]:
y_pred = gbm.predict(X_train)

In [9]:
def calculate_user_roc_auc(user_id, y, y_pred):

    user_data = pd.DataFrame({'user_id': user_id})
    user_data['y'] = y
    user_data['y_pred'] = y_pred

    # Инициализируем список для хранения ROC AUC по каждому пользователю
    user_roc_aucs = []

    # Группируем данные по user_id и вычисляем ROC AUC для каждого пользователя
    for user_id, group in user_data.groupby('user_id'):
        if len(group['y'].unique()) > 1:  # Проверяем, чтобы в выборке было больше одной метки
            roc_auc = roc_auc_score(group['y'], group['y_pred'])
            user_roc_aucs.append(roc_auc)

    # Усредняем ROC AUC по пользователям
    average_roc_auc = sum(user_roc_aucs) / len(user_roc_aucs)

    return average_roc_auc

In [12]:
user_roc_auc = calculate_user_roc_auc(user_id=X_train['user_id'], y=y_train, y_pred=y_pred)

In [13]:
user_roc_auc

0.5906197812163069


### Balanced class weights

In [14]:
gbm = CatBoostClassifier(random_seed=42, iterations=30, early_stopping_rounds=10, thread_count=-1, auto_class_weights='Balanced', custom_metric=['F1', 'Precision', 'Recall', 'AUC', 'Accuracy', 'Logloss'],)

In [15]:
gbm.fit(train_pool, eval_set=test_pool, verbose=True, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.5
0:	learn: 0.0522307	test: 0.0518830	best: 0.0518830 (0)	total: 5.52s	remaining: 2m 39s
1:	learn: 0.0238666	test: 0.0202187	best: 0.0202187 (1)	total: 10.5s	remaining: 2m 27s
2:	learn: 0.0195673	test: 0.0158513	best: 0.0158513 (2)	total: 15.4s	remaining: 2m 18s
3:	learn: 0.0184049	test: 0.0144976	best: 0.0144976 (3)	total: 20.1s	remaining: 2m 10s
4:	learn: 0.0183071	test: 0.0144309	best: 0.0144309 (4)	total: 25s	remaining: 2m 4s
5:	learn: 0.0182761	test: 0.0144144	best: 0.0144144 (5)	total: 29.9s	remaining: 1m 59s
6:	learn: 0.0182760	test: 0.0144144	best: 0.0144144 (6)	total: 34.4s	remaining: 1m 52s
7:	learn: 0.0182755	test: 0.0144141	best: 0.0144141 (7)	total: 38.7s	remaining: 1m 46s
8:	learn: 0.0182753	test: 0.0144140	best: 0.0144140 (8)	total: 43.3s	remaining: 1m 40s
9:	learn: 0.0182006	test: 0.0143247	best: 0.0143247 (9)	total: 47.6s	remaining: 1m 35s
10:	learn: 0.0181967	test: 0.0143245	best: 0.0143245 (10)	total: 52.1s	remaining: 1m 29s
11:	learn: 0.018196

<catboost.core.CatBoostClassifier at 0x7feb67962f20>

In [16]:
gbm.save_model('../models/gbm_balanced.cbm', format="cbm")

In [17]:
y_pred = gbm.predict(X_train)

In [18]:
user_roc_auc_balanced = calculate_user_roc_auc(user_id=X_train['user_id'], y=y_train, y_pred=y_pred)

In [19]:
user_roc_auc_balanced

0.6851127782468215


### SqrtBalanced class weights

In [5]:
gbm = CatBoostClassifier(random_seed=42, iterations=30, early_stopping_rounds=10, thread_count=-1, auto_class_weights='SqrtBalanced', custom_metric=['F1', 'Precision', 'Recall', 'AUC', 'Accuracy', 'Logloss'],)

In [6]:
gbm.fit(train_pool, eval_set=test_pool, verbose=True, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.5
0:	learn: 0.0271151	test: 0.0255264	best: 0.0255264 (0)	total: 5.28s	remaining: 2m 33s
1:	learn: 0.0188931	test: 0.0160698	best: 0.0160698 (1)	total: 9.78s	remaining: 2m 16s
2:	learn: 0.0174347	test: 0.0146187	best: 0.0146187 (2)	total: 14.4s	remaining: 2m 10s
3:	learn: 0.0173366	test: 0.0145644	best: 0.0145644 (3)	total: 19.1s	remaining: 2m 4s
4:	learn: 0.0172890	test: 0.0145370	best: 0.0145370 (4)	total: 23.6s	remaining: 1m 57s
5:	learn: 0.0172711	test: 0.0145337	best: 0.0145337 (5)	total: 28s	remaining: 1m 52s
6:	learn: 0.0172652	test: 0.0145329	best: 0.0145329 (6)	total: 32.6s	remaining: 1m 47s
7:	learn: 0.0172596	test: 0.0145353	best: 0.0145329 (6)	total: 37.3s	remaining: 1m 42s
8:	learn: 0.0172516	test: 0.0145290	best: 0.0145290 (8)	total: 42.1s	remaining: 1m 38s
9:	learn: 0.0172487	test: 0.0145270	best: 0.0145270 (9)	total: 47.1s	remaining: 1m 34s
10:	learn: 0.0172363	test: 0.0145245	best: 0.0145245 (10)	total: 51.8s	remaining: 1m 29s
11:	learn: 0.017216

<catboost.core.CatBoostClassifier at 0x702baf52ceb0>

In [7]:
gbm.save_model('../models/gbm_sqrtbalanced.cbm', format="cbm")

In [8]:
y_pred = gbm.predict(X_train)

In [10]:
user_roc_auc_sqrtbalanced = calculate_user_roc_auc(user_id=X_train['user_id'], y=y_train, y_pred=y_pred)

In [14]:
user_roc_auc_sqrtbalanced

0.6845151901484958
