In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score 
import glob
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score 
import optuna
import numpy as np 
import os
from sklearn import metrics
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
# Which data set to run: "ask" or "bid" (selects the input folder and the model file names)
STATE = "bid"
# Training / validation split, expressed in tenths of the data set.
# The first TV tenths of the rows are used for training, the remainder for evaluation
# (TV = 1 means 1 part training, 9 parts validation).
TV = 1
# Whether to run the hyper-parameter search
FIND_PARAM =False
# The settings below matter when FIND_PARAM is False.
# Path of the previously saved model to load and re-run
MODEL_PATH = rf"C:\Users\yicheng\Desktop\model\xgb_best_TV{TV}{STATE}.model"
# Output path for the continued-training model; keep the trailing file name consistent
OUTPUT_MODEL_PATH ='./xgb_best_bid0.88.model'
# Whether to seed the search with the parameters of the model stored at MODEL_PATH
PRE_FIND_PARAM =False
# Number of trials the hyper-parameter search runs
TRIALS = 100
# Whether to continue training the model
PRE_TRAIN = False
# Number of continued-training rounds
PRE_TRAIN_TIMES = 1

In [3]:
# Evaluation report
def Score(m, x_train, y_train, x_test, y_test, train=True):
    """Print classification metrics for ``m`` on the training or the test split.

    Args:
        m: Fitted classifier exposing ``predict``.
        x_train: Training features, used when ``train`` is truthy.
        y_train: Training labels, used when ``train`` is truthy.
        x_test: Test features, used when ``train`` is falsy.
        y_test: Test labels, used when ``train`` is falsy.
        train: Selects the split to evaluate. Any falsy value selects the
            test split (previously a falsy non-bool such as ``None`` printed
            nothing at all).

    Returns:
        None. Accuracy, precision, recall, F1 and the confusion matrix are
        printed to stdout.
    """
    # Pick the split once so the reporting code below is written a single time.
    if train:
        header, features, labels = 'Train Result:\n', x_train, y_train
    else:
        header, features, labels = 'Test Result:\n', x_test, y_test

    pred = m.predict(features)
    print(header)
    print(f"Accuracy Score: {accuracy_score(labels, pred)*100:.2f}%")
    print(f"Precision Score: {precision_score(labels, pred)*100:.2f}%")
    print(f"Recall Score: {recall_score(labels, pred)*100:.2f}%")
    print(f"F1 score: {f1_score(labels, pred)*100:.2f}%")
    print(f"Confusion Matrix:\n {confusion_matrix(labels, pred)}")
        

In [4]:
# Load every labelled CSV for the selected STATE into a single DataFrame.

FILE_PATH = rf"C:\Users\yicheng\Desktop\high-frequency-trading-MCTS\stock_dataset_with_label\2330\{STATE}\*"

# Sorted so the row order (and therefore the positional train/test split that
# follows) does not depend on the order in which the filesystem lists the files.
files = sorted(glob.glob(FILE_PATH))

if not files:
    # Fail here with a clear message instead of a KeyError on 'label' later on.
    raise FileNotFoundError(f"No input files match {FILE_PATH}")

# Raw price columns that are removed from every file before modelling.
DROPPED_COLUMNS = ["matchPri",'bidPri1','bidPri2','bidPri3','bidPri4','bidPri5','askPri1','askPri2','askPri3','askPri4','askPri5',"openPri"]

# Read all files first and concatenate once; concatenating inside the loop
# re-copies the accumulated frame on every iteration.
frames = [pd.read_csv(f, index_col=None).drop(columns=DROPPED_COLUMNS) for f in files]
data = pd.concat(frames)

print(data.shape)

(527175, 12)


In [5]:
# Positional hold-out split: the first TV tenths of the rows are the training
# set, everything after that is the test set.
length = len(data)
train_length = int(length * TV *0.1)

# Cut the frame once at train_length
train_data, test_data = data[:train_length], data[train_length:]

# Inputs are every column except 'label'; labels are extracted as NumPy arrays
X_train, y_train = train_data.drop(columns=['label']), train_data['label'].to_numpy()
X_test, y_test = test_data.drop(columns=['label']), test_data['label'].to_numpy()



In [6]:
# Sanity check: feature and label counts must agree within each split
split_sizes = (
    f"Train : {len(X_train)} {len(y_train)}",
    f"Test  : {len(X_test)} {len(y_test)}",
)
print(*split_sizes, sep="\n")

Train : 52717 52717
Test  : 474458 474458


In [7]:
# Label distribution of the training set.
# `other` counts labels equal to 0, `change` counts every other label.
# Counted with a vectorised comparison instead of a Python loop over the array.
change = int(np.count_nonzero(y_train != 0))
other = len(y_train) - change

print(change,other)
print('change: {:%}'.format(change/(change+other)))
print('other : {:%}'.format(other/(change+other)))

8384 44333
change: 15.903788%
other : 84.096212%


In [8]:
# Best score seen so far across search trials; updated by Objective through `global`.
# Despite the name, the value stored by Objective is a rounded F1 score.
# -1 is a sentinel below any achievable score, so the first trial always wins.
best_recall = -1

In [9]:
# Optuna objective: one trial = one XGBoost fit
def Objective(trial):
    """Train an XGBoost classifier with sampled hyper-parameters and score it.

    The score is the F1 score on the training split, rounded to two decimals.
    Whenever a trial beats the best score so far, its model is saved as
    ``./xgb_best_{STATE}{score}.model`` and the file of the previous best is
    deleted.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        The rounded training F1 score, which the study maximises.

    NOTE(review): the score is computed on the data the model was fitted on,
    so the search rewards memorising the training set; consider scoring on a
    held-out validation split.
    """
    global best_recall
    # Search space. 'scale_pos_weight' = other/change is fixed rather than
    # sampled: it compensates for the class imbalance and must always be set.
    param = {
        'max_depth': trial.suggest_int('max_depth', 2, 15),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0,log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50,500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0,log=True),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0,log=True),
        'scale_pos_weight':other/change,
        # Sampled under its own name. It used to be requested as 'reg_lambda',
        # which made Optuna hand back the same value for both regularisers and
        # left 'reg_alpha' out of trial.params.
        'reg_alpha':  trial.suggest_float('reg_alpha', 1e-8, 1.0,log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0,log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 20),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.01, 1.0),
        # CPU threads
        'nthread':16
    }

    # Fit
    model = xgb.XGBClassifier(**param)
    model.fit(X_train, y_train,verbose=False)

    # Predict on the training split and score it (F1, two decimals)
    train_pred = model.predict(X_train)
    score = round(f1_score(y_train, train_pred),2)

    if score > best_recall:
        # Persist the new best model first, then drop the previous best file
        model.save_model(f'./xgb_best_{STATE}{score}.model')

        previous_best_path = f'./xgb_best_{STATE}{best_recall}.model'
        if os.path.isfile(previous_best_path):
            os.remove(previous_best_path)

        best_recall = score

    return score

In [10]:
if FIND_PARAM:
    # Hyper-parameter search with Optuna; this is resource intensive.
    # direction='maximize': the value returned by Objective is maximised.
    study = optuna.create_study(direction='maximize')

    # Optional warm start: evaluate the parameters of a previously saved model first
    if PRE_FIND_PARAM:
        model = xgb.XGBClassifier()
        model.load_model(MODEL_PATH)
        # get_params() reports unset estimator parameters as None. Those
        # entries are dropped so Optuna samples them normally instead of being
        # handed a None that it cannot convert into the suggested distribution.
        warm_start_params = {
            name: value
            for name, value in model.get_params().items()
            if value is not None
        }
        study.enqueue_trial(warm_start_params)

    # n_trials: how many trials to run
    study.optimize(Objective, n_trials = TRIALS,show_progress_bar=True)
    

In [11]:
if FIND_PARAM:
    # Summarise the search: trial count, best value and the best parameters
    trial = study.best_trial

    report_lines = [
        'Number of finished trials: {}'.format(len(study.trials)),
        'Best trial:',
        '  Value: {}'.format(trial.value),
        '  Params: ',
    ]
    report_lines.extend(
        '    {}: {}'.format(name, setting) for name, setting in trial.params.items()
    )
    print('\n'.join(report_lines))

In [12]:
# Check how the selected model performs.
# After a search, use the best model that Objective saved; otherwise use the configured one.
model_path = (
    f'./xgb_best_{STATE}{round(study.best_value,2)}.model'
    if FIND_PARAM
    else MODEL_PATH
)

model = xgb.XGBClassifier()
model.load_model(model_path)

# Report on the training split, then on the test split
Score(model, X_train, y_train, X_test, y_test, train=True)
Score(model, X_train, y_train, X_test, y_test, train=False)

print('特徵重要程度: ',model.feature_importances_)

Train Result:

Accuracy Score: 99.67%
Precision Score: 97.98%
Recall Score: 99.99%
F1 score: 98.97%
Confusion Matrix:
 [[44160   173]
 [    1  8383]]
Test Result:

Accuracy Score: 62.52%
Precision Score: 17.47%
Recall Score: 32.49%
F1 score: 22.72%
Confusion Matrix:
 [[270466 123519]
 [ 54325  26148]]
特徵重要程度:  [0.14059387 0.08418418 0.08594345 0.08518973 0.08558591 0.0889026
 0.08229991 0.08374292 0.0867191  0.08747877 0.08935965]


In [13]:
# Continued training: adjust PRE_TRAIN_TIMES and watch the reported scores.

if PRE_TRAIN:

    if FIND_PARAM:
        # Rebuild the estimator from the best trial. trial.params only holds
        # the sampled values, so the fixed settings that Objective applied
        # (class-imbalance weight and thread count) are added back explicitly.
        model = xgb.XGBClassifier(
            **{**trial.params, 'scale_pos_weight': other/change, 'nthread': 16}
        )
        # Nothing has been fitted yet, so the first round trains from scratch.
        # (Saving the unfitted estimator, as was done before, raises.)
        resume_from = None
    else:
        # Continue from the previously saved model
        model = xgb.XGBClassifier()
        model.load_model(model_path)
        resume_from = model_path

    loop = tqdm(range(PRE_TRAIN_TIMES))

    for i in loop:
        # xgb_model: checkpoint to continue boosting from (None = fresh start)
        model.fit(X_train, y_train,verbose=1,xgb_model=resume_from)
        # Persist this round; every later round resumes from this file
        model.save_model(OUTPUT_MODEL_PATH)
        resume_from = OUTPUT_MODEL_PATH

    Score(model,X_train, y_train, X_test,y_test)
    Score(model,X_train, y_train, X_test,y_test,train=False)