In [9]:
import sys
import numpy as np
import pandas as pd
import lightgbm as lg
from tqdm import tqdm
import pandas as pd
from pathlib import Path

# from importlib import reload


import catboost
from catboost.utils import get_gpu_device_count

sys.path.append("../../scorer/")
import orderbook_fast as ob

# from  solution import get_simple_features_from_orderbook, get_simple_deals_features
# from  solution import get_simple_deals_features

from my_orderbook import MyOrderBook
# Module-level feature extractor shared by the dataset-collection, training
# (cat_features) and scoring cells below. It is stateful: those cells call
# set_last_deal() / clear() on it while replaying events.
catboost_myob = MyOrderBook()

# Numeric side codes.
# NOTE(review): not referenced anywhere in the visible part of this notebook — confirm they are still needed.
SIDE_BID = 0 
SIDE_ASK = 1

# Show the working directory: the module and data paths used in this notebook are relative.
print(f"curdir: {Path.cwd()}")

curdir: /home/sergey/mnt/st1500/Usr/Sergey/TheJob/Challenges/wunder_summer/wunder_challenge/examples/catboost_local


## Собираем датасет для тренировки модели

In [10]:
def collect_dataset(data_path):
    '''
        Build a feature/target dataset by replaying the events stored in a file.

        Every event from ``ob.EventPlayer(data_path)`` is applied to a fresh
        ``ob.OrderBook``. DEAL events are forwarded to the shared
        ``catboost_myob`` extractor, NEW_CHUNK events reset it. For each event
        with ``need_prediction`` set, one feature row (from
        ``catboost_myob.get_features``) and one target (``ev.Y``) are recorded.

        Parameters
        ----------
        data_path : path of the event file handed to ``ob.EventPlayer``.

        Returns
        -------
        (X, Y) : tuple of two ``pandas.DataFrame`` objects — the collected
            feature rows and the matching targets, in event order.

        Side effects
        ------------
        Mutates the module-level ``catboost_myob`` and prints the number of
        collected rows.
    '''

    event_player = ob.EventPlayer(data_path)
    orderbook = ob.OrderBook()

    # The order book is brand new for every call, so the shared feature
    # extractor must start from a clean state as well. Without this reset,
    # whatever a previous call (or a re-run of the cell) left behind would
    # leak into the first features of this dataset.
    catboost_myob.clear()

    X = []
    Y = []

    for ev in tqdm(event_player.iget_events(),
                   total=len(event_player),
                   desc="collecting dataset"):

        if ev.action == ob.Action.DEAL:
            catboost_myob.set_last_deal(ev)
        elif ev.action == ob.Action.NEW_CHUNK:
            catboost_myob.clear()

        # Features are computed on the book state *after* the event is applied.
        orderbook.apply_event(ev)
        if ev.need_prediction:
            X.append(catboost_myob.get_features(ev, orderbook))
            Y.append(ev.Y)

    print(f"Dataset collected: len(X) = {len(X)}")
    return pd.DataFrame(X), pd.DataFrame(Y)

# Build the training set from file A and the test set from file B.
# Both calls go through the same module-level `catboost_myob` feature extractor.
X_train, Y_train = collect_dataset("../../data/very_small_A.npz")
X_test, Y_test = collect_dataset("../../data/very_small_B.npz")


# Sanity check of the collected feature frame's dimensions.
print("X_train.shape  == ", X_train.shape)

collecting dataset: 100%|██████████| 100000/100000 [00:37<00:00, 2655.29it/s]
collecting dataset: 100%|██████████| 100000/100000 [00:39<00:00, 2502.24it/s]


Dataset collected: len(X) = 2300
Dataset collected: len(X) = 2444
X_train.shape  ==  (2300, 82)


## Обучаем модель градиентного бустинга

In [11]:
%%time
def train_catboost(X_train, Y_train, X_test, Y_test):
    """Grid-search a CatBoost classifier on the training data.

    A ``catboost.Pool`` is built from ``X_train`` / ``Y_train`` using the
    categorical feature list exposed by the shared ``catboost_myob`` object,
    and ``CatBoostClassifier.grid_search`` is run over learning rate, depth
    and L2 leaf regularisation with 3-fold stratified cross-validation.

    Parameters
    ----------
    X_train, Y_train : training features and targets.
    X_test, Y_test : accepted for signature compatibility; this function
        does not use them.

    Returns
    -------
    (grid_search_result, model) : whatever ``grid_search`` returned, and the
        classifier object it was called on.

    Side effects
    ------------
    Prints the device, the training shape, the GPU count and, at the end,
    every ``(name, value)`` pair from ``model.get_params()``.
    """
    # The device is pinned to CPU; the GPU count is only reported below.
    device = "CPU"
    use_langevin = device != "GPU"

    print(f"catboost training with {device}...")
    print("X_train.shape = ", X_train.shape)
    print("get_gpu_device_count() = ", get_gpu_device_count())

    pool = catboost.Pool(X_train, Y_train, cat_features=catboost_myob.cat_features)

    # Hyper-parameter values explored by the grid search.
    search_space = dict(
        learning_rate=[0.03],
        depth=[4, 5, 6],
        l2_leaf_reg=[2, 3, 5, 8, 11],
    )

    classifier_options = dict(
        bootstrap_type="Bayesian",
        loss_function="Logloss",
        eval_metric="AUC:hints=skip_train~false",
        langevin=use_langevin,
        custom_metric=["Recall", "Precision", "Accuracy", "F1", "Kappa", "MCC"],
        iterations=5,
        od_type="Iter",
        od_wait=30,
        task_type=device,
        boosting_type="Ordered",
        logging_level="Verbose",
        train_dir="grid",
    )
    classifier = catboost.CatBoostClassifier(**classifier_options)

    search_outcome = classifier.grid_search(
        search_space,
        X=pool,
        stratified=True,
        cv=3,
        search_by_train_test_split=False,
        plot=False,
    )

    # Dump the classifier's parameters, one (name, value) tuple per line.
    for name_and_value in classifier.get_params().items():
        print(name_and_value)

    return search_outcome, classifier

# Run the grid search. `clf` is the classifier object used by the scoring and
# saving cells below. X_test / Y_test are passed along, but train_catboost
# does not use them.
grid_search_result, clf = train_catboost(X_train, Y_train, X_test, Y_test)
# clf = train_classifier(X_train, Y_train, X_test, Y_test)

catboost training with CPU...
X_train.shape =  (2300, 82)
get_gpu_device_count() =  1
0:	loss: 0.5356971	best: 0.5356971 (0)	total: 4.09s	remaining: 57.2s
1:	loss: 0.5357125	best: 0.5357125 (1)	total: 7.82s	remaining: 50.8s
2:	loss: 0.5322337	best: 0.5357125 (1)	total: 11.5s	remaining: 45.9s
3:	loss: 0.5244432	best: 0.5357125 (1)	total: 15.2s	remaining: 41.8s
4:	loss: 0.5386835	best: 0.5386835 (4)	total: 18.8s	remaining: 37.7s
5:	loss: 0.5484625	best: 0.5484625 (5)	total: 22.8s	remaining: 34.1s
6:	loss: 0.5450162	best: 0.5484625 (5)	total: 26.6s	remaining: 30.4s
7:	loss: 0.5421204	best: 0.5484625 (5)	total: 30.1s	remaining: 26.3s
8:	loss: 0.5489514	best: 0.5489514 (8)	total: 33.6s	remaining: 22.4s
9:	loss: 0.5458614	best: 0.5489514 (8)	total: 37.4s	remaining: 18.7s
10:	loss: 0.5752444	best: 0.5752444 (10)	total: 41.4s	remaining: 15s
11:	loss: 0.5784530	best: 0.5784530 (11)	total: 45.4s	remaining: 11.4s
12:	loss: 0.5820167	best: 0.5820167 (12)	total: 49.3s	remaining: 7.59s
13:	loss: 0.5

## Тестируем получившийся классификатор

In [12]:
def process_event_and_predict_proba(ev, orderbook):
    """Update the shared feature state for one event and, if asked, predict.

    DEAL events are forwarded to ``catboost_myob.set_last_deal`` and
    NEW_CHUNK events reset it via ``catboost_myob.clear``. This function
    does not modify ``orderbook``; it only passes it to the feature extractor.

    Parameters
    ----------
    ev : event object exposing ``action`` and ``need_prediction``.
    orderbook : order book passed through to ``catboost_myob.get_features``.

    Returns
    -------
    ``None`` when ``ev.need_prediction`` is falsy; otherwise element
    ``[0, 1]`` of ``clf.predict_proba`` for the single feature row built
    from this event.
    """
    action = ev.action
    if action == ob.Action.DEAL:
        catboost_myob.set_last_deal(ev)
    elif action == ob.Action.NEW_CHUNK:
        catboost_myob.clear()

    if ev.need_prediction:
        feature_row = catboost_myob.get_features(ev, orderbook)
        # One-row batch in; row 0, column 1 of the result out.
        return clf.predict_proba([feature_row])[0, 1]

    return None

from scorer import Scorer

# scoring = Scorer("../../data/train_small_C.npz")
# Score on the same file that X_test / Y_test were collected from.
scoring = Scorer("../../data/very_small_B.npz")
# NOTE(review): the callback reads the module-level `catboost_myob`, which still
# holds whatever state the last collect_dataset() run left in it. Presumably the
# scored file begins with a NEW_CHUNK event that resets it — TODO confirm.
# NOTE(review): Scorer.score presumably invokes the callback once per event as
# (ev, orderbook) and collects the non-None probabilities — verify in scorer.py.
roc_auc, (true_ys, pred_probas) = scoring.score(process_event_and_predict_proba)

scoring: 100%|██████████| 100000/100000 [00:42<00:00, 2369.21it/s]



roc_auc_score = 0.560


## Сохраним модель, и решение для отправки готово

In [13]:
# Save our model
# clf.booster_.save_model("wunder.model")
# Write the classifier `clf` to "wunder.model" using the "cbm" format.
clf.save_model("wunder.model", format="cbm")
# Print the working directory for reference: the output path above is relative.
print(f"curdir: {Path.cwd()}")
# The bare string below is the cell's last expression, so the notebook displays it
# as the cell output. English: "Have a look at solution.py. It uses the same
# functions as this notebook but is ready to be submitted to the server. Try
# creating an archive with solution.py and wunder.model and submit it for checking."
'''Посмотрите код файла solution.py. 
Он использует те же функции что и этот ноутбук, но уже готов к отправке на серверю. 
Попробуйте создать архив с файлами solution.py и wunder.model и отправить их на проверку.'''

curdir: /home/sergey/mnt/st1500/Usr/Sergey/TheJob/Challenges/wunder_summer/wunder_challenge/examples/catboost_local


'Посмотрите код файла solution.py. \nОн использует те же функции что и этот ноутбук, но уже готов к отправке на серверю. \nПопробуйте создать архив с файлами solution.py и wunder.model и отправить их на проверку.'