In [1]:
from FinancialMachineLearning.features.fracdiff import FractionalDifferentiatedFeatures
import pandas as pd
import numpy as np
import lightgbm as lgb

import ta
from ta.volatility import BollingerBands
from datetime import datetime

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import joblib

import warnings
warnings.filterwarnings('ignore')

In [13]:
class Preprocessor : 
    """
    This object helps to preprocess data for the model input.
    Input test data to this object and run model.
    """
    
    def __init__(self) :
        self.data = None
        
    def __add_fd_features(self) : 
        self.data['Close_FD'] = FractionalDifferentiatedFeatures.fracDiff_FFD(self.data[['Close']], 0.25)
        self.data['Volume_FD'] = FractionalDifferentiatedFeatures.fracDiff_FFD(self.data[['Volume']], 0.35)
        
    def __add_ta_features(self) : 
        ta.add_all_ta_features(
            self.data, open="Open", high="High", low="Low", close="Close", volume="Volume", fillna=True
        )
        indicator_bb = BollingerBands(close = self.data["Close"], window = 20, window_dev = 1)

        self.data['volatility_bbh'] = indicator_bb.bollinger_hband()
        self.data['volatility_bbl'] = indicator_bb.bollinger_lband()
        self.data['volatility_bbhi'] = indicator_bb.bollinger_hband_indicator()
        self.data['volatility_bbli'] = indicator_bb.bollinger_lband_indicator()
        
    def __add_labels(self) : 
        self.data['label'] = 1 * self.data['volatility_bbhi']- 1 * self.data['volatility_bbli']
        self.data['meta_label'] = self.data['label'].apply(abs)
        
        self.data = self.data.apply(lambda x : x.astype('category') if x.nunique() <= 4 else x)
    
        to_drop = ['label', 'Open','High','Low','Close','Volume'] + [col for col in self.data.columns if 'volatility_bb' in col]
        
        self.X = self.data.drop(columns = to_drop).astype('float64')
        self.y = self.X.pop('meta_label').to_frame().astype('int64')
        
    def __add_pd_features(self) : 
        self.X = pd.concat([self.X] + [self.X.diff(i).add_prefix('∆').add_suffix(f'({i})') for i in range(1, 4)], axis = 1)
        
    def __add_dt_features(self) :
        self.X['month'] = self.X.index.to_series().dt.month.astype('category')
        self.X['day'] = self.X.index.to_series().dt.day.astype('category')
        self.X['dayofweek'] = self.X.index.to_series().dt.dayofweek.astype('category')
    
    def __detect_num_cat_cols(self) : 
        self.numeric_columns=list(self.X.select_dtypes(['float64', 'int64']).columns)
        self.categorical_columns=list(self.X.select_dtypes('category').columns)
        
    
    def __minmax_scale(self) :        
        self.scaler = MinMaxScaler()
        X_num = self.X[self.numeric_columns]
        X_cat = self.X[self.categorical_columns]
        X_num_scaled = pd.DataFrame(self.scaler.fit_transform(X_num), index = X_num.index, columns = X_num.columns)
        
        self.X_scaled = pd.concat([X_num_scaled, X_cat], axis = 1)
        self.X_scaled = self.X_scaled.apply(lambda x : x.astype('int64') if x.dtype == 'category' else x)
        
    def fit(self, data) :
        print("=" * 60)
        print("### Start to preprocess data ...")
        print("=" * 60)
        self.data = data
        print('### Add fractionally differentiated features ...')
        self.__add_fd_features()
        print('### Add ta features ...')
        self.__add_ta_features()
        print('### Add labels ...')
        self.__add_labels() 
        print('### Add partially differentiated features ...')
        print(self.X)
        self.__add_pd_features()
        print('### Add datetime features ...')
        self.__add_dt_features()
        print('### Conduct scaling ...')
        self.__detect_num_cat_cols()
        self.__minmax_scale()
        print("=" * 60)
        print('### Finished ...')
        print("=" * 60)

다음과 같은 방식으로 사용.

In [14]:
daily_train = pd.read_csv('../Data/daily_train.csv', parse_dates = ['Date']).set_index('Date')
daily_train

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1962-01-02,0.000000,1.589844,1.578125,1.578125,902400
1962-01-03,0.000000,1.601563,1.578125,1.601563,1200000
1962-01-04,0.000000,1.613281,1.597656,1.605469,1088000
1962-01-05,0.000000,1.613281,1.566406,1.570313,1222400
1962-01-08,0.000000,1.582031,1.546875,1.566406,1388800
...,...,...,...,...,...
2019-12-24,70.349998,70.500000,69.910004,70.019997,3979400
2019-12-26,70.190002,70.500000,70.010002,70.129997,8840200
2019-12-27,70.199997,70.309998,69.879997,69.889999,10516100
2019-12-30,70.089996,70.440002,69.400002,69.480003,12689400


In [16]:
preprocessor = Preprocessor()
preprocessor.fit(daily_train)

### Start to preprocess data ...
### Add fractionally differentiated features ...
### Add ta features ...
### Add labels ...
### Add partially differentiated features ...
            Close_FD     Volume_FD    volume_adi    volume_obv  volume_cmf  \
Date                                                                         
1962-01-02       NaN           NaN -9.024000e+05  9.024000e+05   -1.000000   
1962-01-03       NaN           NaN  2.976000e+05  2.102400e+06    0.141553   
1962-01-04       NaN           NaN  2.976664e+05  3.190400e+06    0.093301   
1962-01-05       NaN           NaN -7.209630e+05  1.968000e+06   -0.163380   
1962-01-08       NaN           NaN -5.666602e+05  5.792000e+05   -0.097673   
...              ...           ...           ...           ...         ...   
2019-12-24  6.523990 -1.117368e+07  5.268262e+09  5.121628e+09    0.003027   
2019-12-26  6.650134 -1.714870e+06  5.263751e+09  5.130468e+09   -0.013851   
2019-12-27  6.376940 -1.679250e+05  5.253725e+09 

In [17]:
preprocessor.X_scaled.head()

Unnamed: 0_level_0,Close_FD,Volume_FD,volume_adi,volume_obv,volume_cmf,volume_fi,volume_em,volume_sma_em,volume_vpt,volume_vwap,...,∆momentum_pvo(3),∆momentum_pvo_signal(3),∆momentum_pvo_hist(3),∆momentum_kama(3),∆others_dr(3),∆others_dlr(3),∆others_cr(3),month,day,dayofweek
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1962-01-02,,,0.009107,0.000336,0.0,0.696778,0.463405,0.568667,0.008649,0.000386,...,,,,,,,,1,2,1
1962-01-03,,,0.009328,0.000518,0.704782,0.696891,0.463459,0.569162,0.009085,0.000452,...,,,,,,,,1,3,2
1962-01-04,,,0.009328,0.000683,0.674992,0.696877,0.463511,0.5694,0.00915,0.000508,...,,,,,,,,1,4,3
1962-01-05,,,0.00914,0.000498,0.51652,0.696838,0.463124,0.568292,0.008495,0.000478,...,0.373784,0.393633,0.479478,0.644368,0.394435,0.417678,0.520007,1,5,4
1962-01-08,,,0.009169,0.000287,0.557087,0.696826,0.463104,0.567692,0.00841,0.000416,...,0.373002,0.413489,0.466776,0.642512,0.404907,0.427762,0.518969,1,8,0


# Load Dataset

다음의 코드의 `test data` 자리에 넣고자 하는 test data를 넣어주시면 바로 돌아갑니다.


In [18]:
test_data = pd.read_csv('../Data/daily_test.csv', parse_dates = ['Date']).set_index('Date')
test_data

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02,70.239998,71.019997,70.239998,70.900002,12456400
2020-01-03,71.339996,71.370003,70.160004,70.330002,17386900
2020-01-06,70.320000,71.360001,70.230003,70.870003,20081900
2020-01-07,70.500000,70.519997,69.510002,70.290001,17387700
2020-01-08,70.110001,70.290001,69.169998,69.230003,15137700
...,...,...,...,...,...
2023-12-22,102.309998,102.940002,101.820000,101.910004,12921800
2023-12-26,102.739998,103.029999,102.120003,102.139999,16835100
2023-12-27,102.040001,102.550003,101.339996,101.660004,14558800
2023-12-28,101.389999,101.610001,100.129997,100.190002,16329300


In [19]:
preprocessor = Preprocessor()
preprocessor.fit(test_data)

X_test, y_test = preprocessor.X_scaled, preprocessor.y
test = lgb.Dataset(X_test, y_test, free_raw_data = False)
test.set_categorical_feature(preprocessor.categorical_columns)

### Start to preprocess data ...
### Add fractionally differentiated features ...
### Add ta features ...
### Add labels ...
### Add partially differentiated features ...
            Close_FD  Volume_FD    volume_adi   volume_obv  volume_cmf  \
Date                                                                     
2020-01-02       NaN        NaN  8.623811e+06   12456400.0    0.692320   
2020-01-03       NaN        NaN -3.877562e+06   -4930500.0   -0.129931   
2020-01-06       NaN        NaN -1.211790e+06   15151400.0   -0.024272   
2020-01-07       NaN        NaN  8.256864e+06   -2236300.0    0.122664   
2020-01-08       NaN        NaN -5.258803e+06  -17374000.0   -0.063781   
...              ...        ...           ...          ...         ...   
2023-12-22       NaN        NaN -2.462627e+08 -186605100.0   -0.051879   
2023-12-26       NaN        NaN -2.623579e+08 -169770000.0   -0.095817   
2023-12-27       NaN        NaN -2.692161e+08 -184328800.0   -0.069785   
2023-12-28     

<lightgbm.basic.Dataset at 0x14fd683a0>

## Load Model and Evaluate

In [20]:
filename = 'lgbm_model.pkl'
clf = joblib.load(filename)

In [21]:
def print_metric(model, data : lgb.Dataset) : 
    y_true = data.label
    X = data.data
    y_pred = (clf.predict(X).reshape(-1,1) >= 0.5) * 1
    print(f'Confusion Matrix \n {confusion_matrix(y_true, y_pred)}')
    print(f"Accuracy score: {accuracy_score(y_true, y_pred) :3%}")
    print(f'Precision score: {precision_score(y_true, y_pred) :3%}')
    print(f'Recall score: {recall_score(y_true, y_pred):3%}')
    print(f"F1 score: {f1_score(y_true, y_pred):3%}")
    print(f"ROC-AUC score: {roc_auc_score(y_true, y_pred) :3%}")

In [22]:
print_metric(clf, test)

Confusion Matrix 
 [[385  78]
 [183 360]]
Accuracy score: 74.055666%
Precision score: 82.191781%
Recall score: 66.298343%
F1 score: 73.394495%
ROC-AUC score: 74.725845%
