In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score

In [2]:
def get_xy(start, finish, data_dir='../input/riiid-public-prepare-xy'):
    """Load and stack the prepared feature/label chunks ``start`` .. ``finish - 1``.

    For every chunk index ``i`` in ``range(start, finish)`` this reads the
    parquet file ``<data_dir>/X_<i>`` (dropping its last two columns) and the
    label array ``<data_dir>/y_<i>.npy``.

    Parameters
    ----------
    start : int
        First chunk index (inclusive).
    finish : int
        Last chunk index (exclusive).
    data_dir : str, optional
        Directory holding the ``X_<i>`` / ``y_<i>.npy`` files. Defaults to the
        location the notebook originally hard-coded.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        The row-wise concatenation of the feature chunks (each chunk keeps its
        own index) and the concatenation of the label arrays. An empty range
        yields an empty DataFrame and an empty ``int8`` array.
    """
    feature_chunks = []
    # Seeding with an empty int8 array keeps the empty-range result (and the
    # dtype promotion rules) the same as concatenating chunk by chunk.
    label_chunks = [np.empty((0,), dtype=np.int8)]

    for i in tqdm(range(start, finish)):
        # NOTE(review): the last two columns are dropped positionally; what
        # they contain is not visible from this notebook.
        feature_chunks.append(pd.read_parquet(data_dir + '/X_' + str(i)).iloc[:, :-2])
        label_chunks.append(np.load(data_dir + '/y_' + str(i) + '.npy'))

    # Concatenate once at the end: calling pd.concat / np.concatenate inside
    # the loop recopies everything accumulated so far on every iteration.
    X_train = pd.concat(feature_chunks) if feature_chunks else pd.DataFrame()
    y_train = np.concatenate(label_chunks)

    return X_train, y_train

In [3]:
# Fold 1 training data: chunks 0-4 wrapped in a CatBoost Pool.
fold_features, fold_labels = get_xy(0, 5)
train = Pool(data=fold_features, label=fold_labels)
del fold_features, fold_labels  # keep only the Pool referenced, as before

100%|██████████| 5/5 [01:08<00:00, 13.63s/it]


In [4]:
# Classifier for fold 1 (fitted on chunks 0-4 in the next cell).
model1 = CatBoostClassifier(
    **{
        'random_state': 0,
        'auto_class_weights': 'Balanced',
        # Disabled option kept for reference:
        # 'eval_metric': 'AUC:hints=skip_train~false',
        'task_type': 'GPU',
        # Feature index 3 is quantized with its own border count.
        'per_float_feature_quantization': ['3:border_count=1000'],
        'iterations': 4000,
        'learning_rate': 0.7,
    }
)

In [5]:
%%time

# Fit on the fold-1 pool with no eval set (use_best_model / eval_set stay
# disabled); logging is silenced and the interactive metric plot is shown.
model1.fit(train, silent=True, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

CPU times: user 27min 5s, sys: 4min 48s, total: 31min 54s
Wall time: 22min 42s


<catboost.core.CatBoostClassifier at 0x7fe5b6d043d0>

In [6]:
%%time
# Score model1 on chunks 5-9, which were not part of its training pool.
del train  # drop the training pool before loading the evaluation chunks
X_holdout, y_holdout = get_xy(5, 10)
holdout_proba = model1.predict_proba(X_holdout)[:, 1]  # second probability column
auc1 = roc_auc_score(y_holdout, holdout_proba)
del X_holdout, y_holdout, holdout_proba
print(auc1)

# Save the fitted model to 'cb1', then remove it from the kernel namespace.
model1.save_model('cb1')
del model1

100%|██████████| 5/5 [01:07<00:00, 13.57s/it]


0.7877195447672836
CPU times: user 6min 14s, sys: 28.8 s, total: 6min 43s
Wall time: 4min 38s


In [7]:
# Fold 2 training data: chunks 5-9 wrapped in a CatBoost Pool.
fold_features, fold_labels = get_xy(5, 10)
train = Pool(data=fold_features, label=fold_labels)
del fold_features, fold_labels  # keep only the Pool referenced, as before

100%|██████████| 5/5 [01:03<00:00, 12.63s/it]


In [8]:
# Classifier for fold 2 (fitted on chunks 5-9 in the next cell); same
# hyper-parameters as model1.
model2 = CatBoostClassifier(
    **{
        'random_state': 0,
        'auto_class_weights': 'Balanced',
        # Disabled option kept for reference:
        # 'eval_metric': 'AUC:hints=skip_train~false',
        'task_type': 'GPU',
        # Feature index 3 is quantized with its own border count.
        'per_float_feature_quantization': ['3:border_count=1000'],
        'iterations': 4000,
        'learning_rate': 0.7,
    }
)

In [9]:
%%time

# Fit on the fold-2 pool with no eval set (use_best_model / eval_set stay
# disabled); logging is silenced and the interactive metric plot is shown.
model2.fit(train, silent=True, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

CPU times: user 25min 46s, sys: 4min 45s, total: 30min 32s
Wall time: 21min 19s


<catboost.core.CatBoostClassifier at 0x7fe5be4372d0>

In [10]:
%%time
# Score model2 on chunks 0-4, which were not part of its training pool.
del train  # drop the training pool before loading the evaluation chunks
X_holdout, y_holdout = get_xy(0, 5)
holdout_proba = model2.predict_proba(X_holdout)[:, 1]  # second probability column
auc2 = roc_auc_score(y_holdout, holdout_proba)
del X_holdout, y_holdout, holdout_proba
print(auc2)

# Save the fitted model to 'cb2', then remove it from the kernel namespace.
model2.save_model('cb2')
del model2

100%|██████████| 5/5 [01:09<00:00, 13.86s/it]


0.7883013366932489
CPU times: user 6min 22s, sys: 28 s, total: 6min 50s
Wall time: 4min 43s


In [11]:
# Average of the two cross-fold AUC scores.
mean_auc = (auc1 + auc2) / 2
print(mean_auc)

0.7880104407302662
