In [2]:
import sys
import numpy as np
import pandas as pd
import lightgbm as lg
from tqdm import tqdm

import plotly.graph_objs as go
import chart_studio.plotly as iplt

# Cufflinks wrapper on plotly
import cufflinks
from ipywidgets import IntProgress
from IPython.display import display

# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
cufflinks.go_offline()

pd.set_option("display.max_rows", 150)


sys.path.append("../../scorer/")
# import orderbook as ob
# Чтобы использовать быстрый ордербук раскомментируйте строку:
import orderbook_fast as ob



SIDE_BID = 0 
SIDE_ASK = 1

PRINT_EVENTS = 0
OB_EVENTS_TOTAL = 0
OB_LAST_TRY = 0
SNAPSHOTS = 0
OB_EVENTS_MEAN = 0

## Собираем датасет для тренировки модели

In [3]:
%%time
def get_simple_features_from_orderbook(orderbook, max_index=2 ):
    '''
        Считаем простые фичи по ордербуку:
    '''

    global PRINT_EVENTS
    global OB_EVENTS_TOTAL
    global OB_LAST_TRY
    global SNAPSHOTS
    global OB_EVENTS_MEAN

    PRINT_EVENTS += 1

    spread = orderbook.get_best_price(SIDE_ASK) - orderbook.get_best_price(SIDE_BID)
    features = [spread]
    for side in (SIDE_BID, SIDE_ASK):
        for ix in range(max_index):
            price_level = orderbook.get_price_level_at_ix(side, ix)
            if price_level is None:
                features += [-1, -1]
            else:
                features += [price_level.get_volume(), 
                             price_level.get_num_orders()]
    return features


def get_simple_deals_features(last_deals, orderbook):
    '''
        Считаем простые фичи по последним сделкам:
    '''
    cur_mean_price = orderbook.get_mean_price()
    cur_time = orderbook.get_time()

    features = []
    for side in (SIDE_BID, SIDE_ASK):
        deal_event = last_deals[side]
        if deal_event is None:
            features += [-1e9, -1e9, -1e9]
        else:
            features += [cur_mean_price - deal_event.price, 
                         cur_time - deal_event.time, 
                         deal_event.amount]
    return features


def collect_dataset(data_path):
    '''
        Собираем датасет
    '''
    event_player = ob.EventPlayer(data_path)
    orderbook = ob.OrderBook()

    X = []
    Y = []

    # Словарь со всеми типами action {action_type: occurrences}
    counter_dict = {}

    counter = 0
    deal_counter = 0
    newchunk_counter = 0
    last_deals = [None, None]

    for ev in tqdm(event_player.iget_events(),
                    total=len(event_player), 
                    desc="collecting dataset"):
        if ev.action == ob.Action.DEAL:
            deal_counter+=1
            last_deals[ev.side] = ev
        elif ev.action == ob.Action.NEW_CHUNK:
            newchunk_counter+=1
            last_deals = [None, None]

        counter+=1
        if ev.action in counter_dict:
            counter_dict[ev.action]+=1
        else:
            counter_dict[ev.action]=1

        orderbook.apply_event(ev)
        if ev.need_prediction:
            features = get_simple_features_from_orderbook(orderbook)
            features += get_simple_deals_features(last_deals, orderbook)

            X.append(features)
            Y.append(ev.Y)

    print(f"Dataset collected: len(X) = {len(X)}")


    print("******counter*******")
    print("ob.Action.NEW_CHUNK ", ob.Action.NEW_CHUNK)
    print("counter", counter)
    print("newchunk_counter", newchunk_counter)
    print("deal_counter", deal_counter)
    print("PRINT_EVENTS ", PRINT_EVENTS)
    print("OB_EVENTS_TOTAL", OB_EVENTS_TOTAL)
    print("OB_LAST_TRY", OB_LAST_TRY)


    print(counter_dict)
    print("******counter finished*******")
    return np.array(X), np.array(Y)


X_train, Y_train = collect_dataset("../../data/train_small_A.npz")
# X_test, Y_test = collect_dataset("../../data/train_small_B.npz")

collecting dataset: 100%|██████████| 10617618/10617618 [15:17<00:00, 11575.71it/s]


Dataset collected: len(X) = 234905
******counter*******
ob.Action.NEW_CHUNK  10
counter 10617618
newchunk_counter 161
deal_counter 663089
PRINT_EVENTS  234905
OB_EVENTS_TOTAL 0
OB_LAST_TRY 0
{10: 161, 1: 5228555, 2: 144862, 0: 4580951, 3: 663089}
******counter finished*******
CPU times: user 15min 18s, sys: 4.04 s, total: 15min 22s
Wall time: 15min 20s


In [None]:
# print(X_train.shape)

In [None]:
train = np.load("../../data/train_small_A.npz")
train_df = pd.DataFrame(data=train["events"], columns=train["columns"])
print(train_df.shape)
print(train_df.head())

In [None]:
print(len(train_df.id.unique()))
print(train_df.action.value_counts())
print(sum(train_df.action.value_counts()))

print("train_df.time.is_monotonic", train_df.time.is_monotonic)
print("train_df.id.is_monotonic", train_df.id.is_monotonic)

action10 = train_df.loc[train_df.action == 10]
print(action10)


In [None]:
chunk_sizes = pd.Series(action10.index[1:] - action10.index[:-1])
print(chunk_sizes)

chunk_sizes.iplot(kind="hist", bins=30)

In [None]:
train_df.value_counts(["is_snapshot", "Y"])

train_df.is_snapshot.value_counts()

print("train_df.Y.value_counts()", train_df.Y.value_counts())

In [None]:
print(" action.*********vs**************.Y")
print(train_df.value_counts(subset=["action", "Y"]))

## Обучаем модель градиентного бустинга

In [None]:
def train_classifier(X_train, Y_train, X_test, Y_test):
    '''
        Обучаем классификатор
    '''
    clf = lg.LGBMClassifier(num_leaves=31, n_estimators=1000, learning_rate=0.1)
    clf.fit(X_train, Y_train, eval_set=[(X_test, Y_test)], 
            eval_metric="auc", early_stopping_rounds=20)
    return clf
 
clf = train_classifier(X_train, Y_train, X_test, Y_test)

## Тестируем получившийся классификатор

In [None]:
last_deals = [None, None]

def process_event_and_predict_proba(event, orderbook):
    if event.action == ob.Action.DEAL:
        last_deals[event.side] = event
    elif event.action == ob.Action.NEW_CHUNK:
        last_deals[:] = [None, None]
        
    if not event.need_prediction:
        return None
    
    features = get_simple_features_from_orderbook(orderbook)
    features += get_simple_deals_features(last_deals, orderbook)    
    proba = clf.predict_proba([features])[0, 1]
    return proba

from scorer import Scorer

scoring = Scorer("../../data/train_small_C.npz")
roc_auc, (true_ys, pred_probas) = scoring.score(process_event_and_predict_proba)

## Сохраним модель, и решение для отправки готово

In [None]:
# Сохраним нашу модель
clf.booster_.save_model("wunder.model")

'''Посмотрите код файла solution.py. 
Он использует те же функции что и этот ноутбук, но уже готов к отправке на серверю. 
Попробуйте создать архив с файлами solution.py и wunder.model и отправить их на проверку.'''