In [2]:
import numpy as np
from sklearn.metrics import confusion_matrix
import lightgbm as lgb
from collections import deque
import enum
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import time


import random
from scipy.special import softmax
from sklearn.datasets import make_blobs
from sklearn.metrics import log_loss



import warnings

warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
from sklearn.metrics import accuracy_score



def metric(y_true, y_pred):
    """Weighted confusion-matrix score for the 3-class problem (labels 0, 1, 2).

    Every cell of the confusion matrix (rows = true label, columns = predicted
    label) is multiplied by a fixed weight:

    * correct prediction of class 0 or class 2: +1.9
    * class 0 confused with class 2 (either direction): -2
    * any error that involves class 1: -0.3
    * correct prediction of class 1: 0

    The weighted sum is divided by the square root of the number of samples
    whose true label is 0 or 2.

    Parameters
    ----------
    y_true : array-like
        True class labels, expected to take values in {0, 1, 2}.
    y_pred : array-like
        Predicted class labels, same length as ``y_true``.

    Returns
    -------
    float
        The normalised score; larger is better.
    """
    # Pin the label order: without `labels`, a class that is missing from both
    # inputs shrinks the matrix and the product with the 3x3 weights breaks.
    conf_matrix = confusion_matrix(y_true, y_pred, labels=[0, 1, 2])
    weight_matrix = np.array([
                [1.9, -0.3, -2],
                [-0.3,    0, -0.3],
                [-2, -0.3, 1.9]
    ])
    hit_matrix_sum = np.sum(conf_matrix * weight_matrix)

    # Rows 0 and 2 hold the samples whose true label is 0 or 2.
    action_count = np.sum(conf_matrix[0]) + np.sum(conf_matrix[2])

    # Clamp the normaliser so a batch with no true 0/2 samples returns the raw
    # weighted sum instead of inf/NaN.
    return hit_matrix_sum / np.sqrt(max(action_count, 1))




# 2. Грузим df фичей

In [3]:
def select_every_10th_row(df):
    """Return the rows of ``df`` at positions 0, 10, 20, ... (positional stride of 10)."""
    stride = slice(None, None, 10)
    return df.iloc[stride]

In [4]:
# Load the merged feature table from CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook cannot
# be re-run elsewhere without editing it. Consider a configurable data directory.
merged_data = pd.read_csv("/Users/rs/Documents/HFT project/merged_data.csv")

# 3. Обучим модель

In [5]:
def multiclass_custom_objective(y_pred, ds):
    """Softmax cross-entropy gradient and Hessian for a custom LightGBM objective.

    Parameters
    ----------
    y_pred : numpy.ndarray
        Raw scores. They are reshaped column-major into
        ``(n_samples, n_classes)``, so a flat input is read class by class.
    ds : object
        Dataset-like object whose ``get_label()`` returns the integer-valued
        class labels.

    Returns
    -------
    tuple of numpy.ndarray
        ``(grad, hess)``, each flattened column-major back to 1-D.
    """
    labels = ds.get_label()
    scores = y_pred.reshape((labels.size, -1), order='F')
    n_samples, n_classes = scores.shape

    probs = softmax(scores, axis=1)

    # One-hot encode the true class; labels are cast to int to be used as indices.
    one_hot = np.zeros_like(probs)
    one_hot[np.arange(n_samples), labels.astype(np.int32)] = 1.0

    # d(loss)/d(score) = p - onehot; Hessian diagonal p * (1 - p) scaled by K / (K - 1).
    gradient = probs - one_hot
    scale = n_classes / (n_classes - 1)
    hessian = scale * probs * (1 - probs)

    return gradient.ravel(order='F'), hessian.ravel(order='F')

def multiclass_metric(y_pred, ds):
    """LightGBM ``feval`` wrapper around the weighted confusion-matrix ``metric``.

    Parameters
    ----------
    y_pred : numpy.ndarray
        Raw scores; reshaped column-major into ``(n_samples, n_classes)``.
    ds : object
        Dataset-like object whose ``get_label()`` returns the true labels.

    Returns
    -------
    tuple
        ``(eval_name, eval_result, is_higher_better)`` with name ``'mt-c'``.
    """
    y_true = ds.get_label()
    y_pred = y_pred.reshape((y_true.size, -1), order='F')

    prob = softmax(y_pred, axis=1)
    # Predict the most probable class for every sample.
    pred_list = np.argmax(prob, axis=1)

    # The score rewards correct calls and penalises wrong ones, so larger is
    # better; reporting False here would make early stopping pick the worst
    # iteration.
    return 'mt-c', metric(y_true, pred_list), True
    
def accuracy(y_pred, ds):
    """LightGBM ``feval`` reporting plain classification accuracy.

    Parameters
    ----------
    y_pred : numpy.ndarray
        Raw scores; reshaped column-major into ``(n_samples, n_classes)``.
    ds : object
        Dataset-like object whose ``get_label()`` returns the true labels.

    Returns
    -------
    tuple
        ``('ac-cy', accuracy, True)``; the final flag marks the metric as
        higher-is-better.
    """
    labels = ds.get_label()
    scores = y_pred.reshape((labels.size, -1), order='F')

    # The predicted class is the one with the highest softmax probability.
    predicted = softmax(scores, axis=1).argmax(axis=1)

    return 'ac-cy', accuracy_score(labels, predicted), True



In [6]:
def pred_proc_df(df):
    """Split a frame into a feature matrix and 0-based class labels.

    The input frame is never modified, so the function can be called
    repeatedly on the same frame with the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'target'`` and ``'local_ts'``; every other
        column is treated as a feature.

    Returns
    -------
    tuple
        ``(X, y)``, where ``X`` is ``df`` without ``'target'`` and
        ``'local_ts'`` and ``y`` is ``df['target'] + 1``.
    """
    X = df.drop(['target', 'local_ts'], axis=1)

    # Infinite feature values are zeroed.
    # NOTE(review): NaNs are zero-filled only when the frame also contains an
    # infinity; otherwise they are passed through unchanged. Kept as-is to
    # preserve existing behaviour.
    if np.isinf(X).any().any():
        X = X.replace([np.inf, -np.inf], np.nan).fillna(0)

    # Build a new Series: an in-place `+=` on df['target'] would shift the
    # caller's column and double-shift the labels on a second call.
    y = df['target'] + 1

    return X, y

In [7]:
def train_dataset_log_loss(df_orig, df_orig_val,  n_estimators = 15, num_leaves = 6, num_tree_per_iteration = 3):
    """Train a 3-class LightGBM booster with the custom softmax objective.

    Both frames are preprocessed with ``pred_proc_df``. The model is evaluated
    on the training and validation sets after every round with
    ``multiclass_metric`` and ``accuracy``, then saved to
    ``'lgb_model_log_loss.txt'`` in the working directory.

    Parameters
    ----------
    df_orig : pandas.DataFrame
        Training frame (features plus ``'target'`` and ``'local_ts'``).
    df_orig_val : pandas.DataFrame
        Validation frame with the same columns.
    n_estimators : int, default 15
        Number of boosting rounds.
    num_leaves : int, default 6
        Maximum number of leaves per tree.
    num_tree_per_iteration : int, default 3
        Kept for backward compatibility and only echoed to stdout. It used to
        be forwarded as ``num_iterations``, which LightGBM treats as an alias
        of ``n_estimators``; passing two different values for one parameter
        left the effective number of rounds ambiguous.

    Returns
    -------
    lightgbm.Booster
        The trained model.
    """
    # Work on copies so preprocessing can never touch the caller's frames.
    df = df_orig.copy()
    df_val = df_orig_val.copy()

    X, y = pred_proc_df(df)
    X_val, y_val = pred_proc_df(df_val)

    train_dataset = lgb.Dataset(X, y)
    val_dataset = lgb.Dataset(X_val, y_val)

    print('num_tree_per_iteration ', num_tree_per_iteration)
    params = {
                'objective': 'multiclass',
                'num_leaves': num_leaves,
                'num_class': 3,
                'verbose': -1
            }
    # The round count is passed once, through `num_boost_round`, instead of via
    # two conflicting aliases inside `params`.
    lgb_model = lgb.train(params=params,
                      train_set=train_dataset,
                      num_boost_round=n_estimators,
                      valid_sets=(train_dataset, val_dataset),
                      valid_names=('train', 'val'),
                      fobj=multiclass_custom_objective,
                      feval=[multiclass_metric, accuracy],
                      callbacks=[lgb.log_evaluation(1)])

    lgb_model.save_model('lgb_model_log_loss.txt')

    return lgb_model

In [8]:
# Hold out the final 200,000 rows for validation; everything before them is
# used for training.
val_df = merged_data.iloc[-200000:]
train_df = merged_data.iloc[:-200000]

In [22]:
# Train the base model on the training split and evaluate it on the held-out
# rows each round (2500 boosting rounds requested, up to 12 leaves per tree).
model_2500_all_data = train_dataset_log_loss(train_df, val_df, n_estimators = 2500, num_leaves = 12)

num_tree_per_iteration  3
[1]	train's mt-c: -150.914	train's ac-cy: 0.607461	val's mt-c: -65.7033	val's ac-cy: 0.627465
[2]	train's mt-c: -145.954	train's ac-cy: 0.608947	val's mt-c: -65.9265	val's ac-cy: 0.62745
[3]	train's mt-c: -135.521	train's ac-cy: 0.609759	val's mt-c: -67.9881	val's ac-cy: 0.62642
[4]	train's mt-c: -114.628	train's ac-cy: 0.611103	val's mt-c: -62.2112	val's ac-cy: 0.625485
[5]	train's mt-c: -117.443	train's ac-cy: 0.611666	val's mt-c: -63.5065	val's ac-cy: 0.62642
[6]	train's mt-c: -113.997	train's ac-cy: 0.611742	val's mt-c: -61.9777	val's ac-cy: 0.62692
[7]	train's mt-c: -108.218	train's ac-cy: 0.612612	val's mt-c: -62.9252	val's ac-cy: 0.62683
[8]	train's mt-c: -98.3119	train's ac-cy: 0.613722	val's mt-c: -64.6507	val's ac-cy: 0.62647
[9]	train's mt-c: -92.6935	train's ac-cy: 0.614135	val's mt-c: -62.4597	val's ac-cy: 0.627605
[10]	train's mt-c: -88.959	train's ac-cy: 0.614929	val's mt-c: -63.3804	val's ac-cy: 0.628165
[11]	train's mt-c: -82.8646	train's ac-c

In [23]:
# Save the trained booster under the file name that
# `after_train_dataset_log_loss` uses as its default `init_model_name`.
model_2500_all_data.save_model('model_2500_all_data.txt')

<lightgbm.basic.Booster at 0x7ff61a376a90>

In [9]:
# Reload the saved booster from disk, presumably so later cells can run
# without repeating the training.
# NOTE(review): `model_2500` is not referenced by the cells visible here.
model_2500 = lgb.Booster(model_file='model_2500_all_data.txt')

# 4. Дообучение

In [22]:
def after_train_dataset_log_loss(df_orig, df_orig_val, num_iterations = 10, n_estimators = 12, init_model_name = 'model_2500_all_data.txt'):
    """Continue training a saved LightGBM booster for a few more rounds.

    The booster stored in ``init_model_name`` is loaded and boosted further
    with the custom softmax objective. It is evaluated on the training and
    validation sets after every round, then saved to
    ``'lgb_model_log_loss_after_trained.txt'`` in the working directory.

    Parameters
    ----------
    df_orig : pandas.DataFrame
        Training frame (features plus ``'target'`` and ``'local_ts'``).
    df_orig_val : pandas.DataFrame
        Validation frame with the same columns.
    num_iterations : int, default 10
        Number of additional boosting rounds to run on top of the loaded model.
    n_estimators : int, default 12
        Accepted for backward compatibility and ignored. LightGBM treats
        ``n_estimators`` as an alias of ``num_iterations``; forwarding both
        with different values left the effective number of rounds ambiguous,
        so ``num_iterations`` is now the single source of truth.
    init_model_name : str, default 'model_2500_all_data.txt'
        Path of the model file to continue from.

    Returns
    -------
    lightgbm.Booster
        The booster after the additional rounds.
    """
    # Work on copies so preprocessing can never touch the caller's frames.
    df = df_orig.copy()
    df_val = df_orig_val.copy()

    X, y = pred_proc_df(df)
    X_val, y_val = pred_proc_df(df_val)

    train_dataset = lgb.Dataset(X, y)
    val_dataset = lgb.Dataset(X_val, y_val)

    loaded_model = lgb.Booster(model_file=init_model_name)

    params = {
                'objective': 'multiclass',
                'num_leaves': 12,
                'num_class': 3,
                'verbose': -1
            }
    # The round count is passed once, through `num_boost_round`, instead of via
    # two conflicting aliases inside `params`.
    lgb_model_log_loss_after_trained = lgb.train(params=params,
                      train_set=train_dataset,
                      num_boost_round=num_iterations,
                      valid_sets=(train_dataset, val_dataset),
                      valid_names=('train', 'val'),
                      fobj=multiclass_custom_objective,
                      feval=[multiclass_metric, accuracy],
                      callbacks=[lgb.log_evaluation(1)],
                      init_model=loaded_model)

    lgb_model_log_loss_after_trained.save_model('lgb_model_log_loss_after_trained.txt')

    return lgb_model_log_loss_after_trained

In [23]:
# Continue training from the function's default init model file on the same
# train/validation split.
lgb_model_log_loss_after_trained = after_train_dataset_log_loss(train_df, val_df,\
                                                                num_iterations = 7 , n_estimators = 12)

[2501]	train's mt-c: 752.27	train's ac-cy: 0.723849	val's mt-c: -48.5658	val's ac-cy: 0.62618
[2502]	train's mt-c: 752.354	train's ac-cy: 0.723859	val's mt-c: -48.4779	val's ac-cy: 0.62623
[2503]	train's mt-c: 752.436	train's ac-cy: 0.723868	val's mt-c: -48.4592	val's ac-cy: 0.62623
[2504]	train's mt-c: 752.531	train's ac-cy: 0.72389	val's mt-c: -48.4658	val's ac-cy: 0.626225
[2505]	train's mt-c: 752.667	train's ac-cy: 0.723913	val's mt-c: -48.5823	val's ac-cy: 0.626205
[2506]	train's mt-c: 752.777	train's ac-cy: 0.723935	val's mt-c: -48.5739	val's ac-cy: 0.62618
[2507]	train's mt-c: 752.946	train's ac-cy: 0.723962	val's mt-c: -48.6173	val's ac-cy: 0.62617
