In [3]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-1.6.0-py3-none-win_amd64.whl (126.1 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.6.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import xgboost as xgb
from matplotlib.patches import Patch
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.multioutput import MultiOutputRegressor

In [2]:
# Load the feature table and prepare it for modelling:
#   1. drop the excluded time_ids,
#   2. order rows by (time_id, stock_id),
#   3. discard rows with any missing value.
# NOTE(review): the reason these three time_ids are excluded is not recorded here — confirm.
EXCLUDED_TIME_IDS = [34, 32, 4]

# A single chain builds a fresh frame at every step, so nothing is mutated in
# place on a filtered slice and re-running the cell is idempotent.
df = (
    pd.read_csv("basic_feature.csv")
    .loc[lambda frame: ~frame["time_id"].isin(EXCLUDED_TIME_IDS)]
    .sort_values(by=["time_id", "stock_id"])
    .dropna()
)
# The original row labels are kept on purpose; df is the full processed data.

In [3]:
# Keep only the last 50,000 rows of the sorted frame to make experiments faster.
# reset_index() (without drop=True) returns a new frame and moves the old row
# labels into a leading "index" column; later positional column slices rely on
# that column being present.
small_data = df.tail(50000).reset_index()

small_data.head()

Unnamed: 0,index,stock_id,time_id,target,book.seconds_in_bucket.count,book.wap1.sum,book.wap1.mean,book.wap1.std,book.wap2.sum,book.wap2.mean,...,trade_300.order_count.mean,trade_150.log_return.realized_volatility,trade_150.seconds_in_bucket.count,trade_150.size.sum,trade_150.order_count.mean,tick_size,trade.tau,trade_150.tau,book.tau,real_price
0,397843,116,28527,0.003425,289.0,288.890259,0.99962,0.000797,288.842896,0.999456,...,2.3125,0.001587,22.0,1016.0,2.045455,5e-05,0.174078,0.213201,0.058824,199.254347
1,401673,118,28527,0.002233,271.0,270.716705,0.998955,0.000588,270.72644,0.998991,...,2.571429,0.001482,38.0,4262.0,2.710526,3.6e-05,0.145865,0.162221,0.060746,281.025394
2,405503,119,28527,0.001944,578.0,579.627319,1.002815,0.000327,579.623962,1.00281,...,2.666667,0.001269,40.0,11573.0,2.925,0.000199,0.121268,0.158114,0.041595,50.321584
3,409333,120,28527,0.001775,579.0,578.270325,0.99874,0.000471,578.26062,0.998723,...,2.073171,0.001527,68.0,9710.0,2.132353,0.000122,0.101015,0.121268,0.041559,81.680701
4,413163,122,28527,0.001986,467.0,466.298248,0.998497,0.001005,466.296143,0.998493,...,1.810811,0.001452,65.0,5065.0,2.030769,9.3e-05,0.110432,0.124035,0.046274,107.546256


In [4]:
# Sanity check: number of rows in the subsample built with df.tail(50000).
len(small_data)

50000

In [5]:
# train and test split
TRAIN_FRACTION = 0.8  # leading share of rows used for training; the rest is held out

# Features are selected by position: everything from column 4 onwards except the
# last four columns. Per the small_data preview, the first four columns are
# index / stock_id / time_id / target.
X = small_data.iloc[:, 4:-4]
# The target is selected by name so a change in column order cannot silently
# swap it for another column.
y = small_data["target"]
assert "target" not in X.columns, "target column leaked into the feature matrix"

# Ordered (unshuffled) split: the first TRAIN_FRACTION of the rows is used for
# training and the trailing rows for testing, preserving the row order.
split = int(len(small_data) * TRAIN_FRACTION)

# .iloc makes the positional intent explicit for both the frame and the series.
X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]
assert len(X_train) + len(X_test) == len(small_data)

In [17]:
# Preview the feature matrix produced by the positional column slice.
X.head()

Unnamed: 0,book.seconds_in_bucket.count,book.wap1.sum,book.wap1.mean,book.wap1.std,book.wap2.sum,book.wap2.mean,book.wap2.std,book.log_return1.sum,book.log_return1.realized_volatility,book.log_return1.mean,...,trade_450.order_count.mean,trade_300.log_return.realized_volatility,trade_300.seconds_in_bucket.count,trade_300.size.sum,trade_300.order_count.mean,trade_150.log_return.realized_volatility,trade_150.seconds_in_bucket.count,trade_150.size.sum,trade_150.order_count.mean,tick_size
0,289.0,288.890259,0.99962,0.000797,288.842896,0.999456,0.000889,-0.001521,0.002882,-5.281345e-06,...,1.714286,0.001402,16.0,995.0,2.3125,0.001587,22.0,1016.0,2.045455,5e-05
1,271.0,270.716705,0.998955,0.000588,270.72644,0.998991,0.000676,0.001137,0.003899,4.209816e-06,...,2.857143,0.001154,28.0,2381.0,2.571429,0.001482,38.0,4262.0,2.710526,3.6e-05
2,578.0,579.627319,1.002815,0.000327,579.623962,1.00281,0.00036,-0.000579,0.001889,-1.003962e-06,...,2.615385,0.000953,21.0,6886.0,2.666667,0.001269,40.0,11573.0,2.925,0.000199
3,579.0,578.270325,0.99874,0.000471,578.26062,0.998723,0.000492,-9.9e-05,0.002227,-1.71315e-07,...,2.136364,0.001059,41.0,6353.0,2.073171,0.001527,68.0,9710.0,2.132353,0.000122
4,467.0,466.298248,0.998497,0.001005,466.296143,0.998493,0.001038,-0.003534,0.002505,-7.583502e-06,...,2.117647,0.001075,37.0,2480.0,1.810811,0.001452,65.0,5065.0,2.030769,9.3e-05


In [18]:
# Preview the regression target to confirm the right column was selected.
y.head()

0    0.003425
1    0.002233
2    0.001944
3    0.001775
4    0.001986
Name: target, dtype: float64

In [19]:
# define the time series block function
class BlockingTimeSeriesSplit():
    """Cross-validation splitter that uses disjoint, consecutive blocks of rows.

    The samples are cut into ``n_splits`` contiguous blocks of
    ``n_samples // n_splits`` rows each; trailing rows that do not fill a whole
    block are not used. Inside every block the first half is the training fold
    and the rest of the block, after skipping ``margin`` rows, is the test fold.
    Blocks never overlap, so no row appears in more than one split.

    Parameters
    ----------
    n_splits : int
        Number of blocks (and therefore train/test pairs) to generate. Must be >= 1.
    margin : int, default 0
        Number of rows to skip between the end of the training fold and the
        start of the test fold inside each block.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1 or ``margin`` is negative.
    """

    def __init__(self, n_splits, margin=0):
        if n_splits < 1:
            raise ValueError(f"n_splits must be at least 1, got {n_splits}")
        if margin < 0:
            raise ValueError(f"margin must be non-negative, got {margin}")
        self.n_splits = n_splits
        self.margin = margin

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of splits; all arguments are accepted and ignored."""
        return self.n_splits

    def split(self, X, y=None, groups=None):
        """Yield ``(train_indices, test_indices)`` integer arrays, one pair per block.

        Only ``len(X)`` is used; ``y`` and ``groups`` are accepted and ignored.

        Raises
        ------
        ValueError
            If the blocks are too small to hold at least one training row and
            one test row (raised when iteration starts).
        """
        n_samples = len(X)
        block_size = n_samples // self.n_splits
        train_size = block_size // 2
        test_size = block_size - train_size - self.margin
        if train_size < 1 or test_size < 1:
            # Fail loudly instead of handing empty folds to the estimator.
            raise ValueError(
                f"Cannot build {self.n_splits} blocks with margin={self.margin} "
                f"from {n_samples} samples: a fold would be empty."
            )

        indices = np.arange(n_samples)
        for block in range(self.n_splits):
            start = block * block_size
            mid = start + train_size
            stop = start + block_size
            yield indices[start:mid], indices[mid + self.margin:stop]
      

In [7]:
def rmspe(y_true, y_pred):
    '''
    Compute Root Mean Square Percentage Error between two arrays.

    Both inputs are converted to float NumPy arrays first, so values are always
    compared position by position. This lets lists be passed in and prevents
    two pandas Series with different indexes from being aligned by label
    (which would turn the result into NaN).

    Parameters
    ----------
    y_true : array-like
        Ground-truth values. Entries equal to zero make the relative error
        undefined and propagate inf/nan into the result.
    y_pred : array-like
        Predicted values, same shape as ``y_true``.

    Returns
    -------
    A scalar for 1-D input; for 2-D input, one value per column (the mean is
    taken along axis 0).
    '''
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    relative_error = (y_true - y_pred) / y_true
    loss = np.sqrt(np.mean(np.square(relative_error), axis=0))

    return loss

## time series one time test

In [20]:
# Baseline XGBoost regressor, cross-validated on the training split with
# scikit-learn's TimeSeriesSplit.
model = xgb.XGBRegressor(eta=0.3, max_depth=6, gamma=0, subsample=1)  # change estimator here

tscv = TimeSeriesSplit(n_splits=5)

# A single cross_validate call fits every fold once and scores both metrics,
# instead of running the whole cross-validation twice.
cv_scores = cross_validate(
    model, X_train, y_train, cv=tscv,
    scoring=["neg_mean_squared_error", "r2"],
)
# The scorer returns negated MSE, so flip the sign before taking the root.
rmse = np.sqrt(-cv_scores["test_neg_mean_squared_error"])
R2 = cv_scores["test_r2"]

print(f"RMSE: {rmse.mean()} (+/- {rmse.std()})")
print(f"\nR2: {R2.mean()} (+/- {R2.std()})")


RMSE: 0.0012526102374669859 (+/- 5.896185699818328e-05

R2: 0.7987505238556124 (+/- 0.022303482828491965


In [10]:
# Fit an XGBoost regressor with hand-entered hyper-parameters on the training
# split and report its RMSPE on that same training data (an in-sample error,
# since the predictions are made on X_train).
best_model = xgb.XGBRegressor(  # change estimator here
    eta=0.11,
    max_depth=7,
    colsample_bytree=0.7,
    subsample=0.89,
)
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_train)
error = rmspe(y_train, y_pred)

print('RMSPE:', round(error, 4))

RMSPE: 0.2402


## blocking time series one time test


In [21]:
# Baseline XGBoost regressor, cross-validated on the training split with the
# custom BlockingTimeSeriesSplit (disjoint blocks instead of expanding windows).
model = xgb.XGBRegressor(eta=0.3, max_depth=6, gamma=0, subsample=1)  # change estimator here

btscv = BlockingTimeSeriesSplit(n_splits=5)

# A single cross_validate call fits every fold once and scores both metrics,
# instead of running the whole cross-validation twice.
cv_scores = cross_validate(
    model, X_train, y_train, cv=btscv,
    scoring=["neg_mean_squared_error", "r2"],
)
# The scorer returns negated MSE, so flip the sign before taking the root.
rmse = np.sqrt(-cv_scores["test_neg_mean_squared_error"])
R2 = cv_scores["test_r2"]

print(f"RMSE: {rmse.mean()} (+/- {rmse.std()})")
print(f"\nR2: {R2.mean()} (+/- {R2.std()})")



RMSE: 0.001243044774014148 (+/- 9.116085993569667e-05

R2: 0.7807936016315631 (+/- 0.015429926416364877


In [11]:
# Fit an XGBoost regressor with hand-entered hyper-parameters on the training
# split and report its RMSPE on that same training data (an in-sample error,
# since the predictions are made on X_train).
best_model_1 = xgb.XGBRegressor(  # change estimator here
    eta=0.11,
    max_depth=3,
    colsample_bytree=0.89999,
    subsample=0.7,
)
best_model_1.fit(X_train, y_train)

y_pred = best_model_1.predict(X_train)
error1 = rmspe(y_train, y_pred)

print('RMSPE:', round(error1, 4))

RMSPE: 0.2811


## Cross Validation for time series split

In [45]:
from sklearn.ensemble import RandomForestRegressor
from tqdm import trange

# Hyper-parameter grid search for XGBoost, validated with the TimeSeriesSplit
# splitter (tscv).
scores = []

# Candidate values per hyper-parameter; GridSearchCV tries every combination.
# np.arange excludes the stop value, so e.g. subsample is tried at 0.5, 0.7, ~0.9.
param_grid = {
    "eta": np.arange(0.01, 0.2, 0.05),
    "max_depth": np.arange(3, 10, 4),
    "subsample": np.arange(0.5, 1, 0.2),
    "colsample_bytree": np.arange(0.5, 1, 0.2),
}

xgb_r = xgb.XGBRegressor(random_state=42)

# No `scoring` is passed, so candidates are ranked by the estimator's own
# score method.
CV_xgb_r = GridSearchCV(estimator=xgb_r, param_grid=param_grid, cv=tscv)
CV_xgb_r.fit(X_train, y_train)

# Keep the winning combination and its mean cross-validated score (4 decimals).
best_params1 = CV_xgb_r.best_params_
best_score1 = round(CV_xgb_r.best_score_, 4)

### Cross Validation for time series split Parameters

In [46]:
# Hyper-parameter combination selected by the grid search that used tscv.
best_params1

{'colsample_bytree': 0.7,
 'eta': 0.11,
 'max_depth': 7,
 'subsample': 0.8999999999999999}

In [47]:
# Best mean cross-validated score of the tscv grid search, rounded to 4 decimals.
best_score1

0.8184

### Time series split Test

In [61]:
# accuracy on test set
# Hold-out evaluation of the grid-search result: CV_xgb_r.predict delegates to
# the best estimator found by the search.
y_pred = CV_xgb_r.predict(X_test)
error1 = rmspe(y_test, y_pred)
print('RMSPE:', round(error1, 4))

# "Accuracy" is defined here as 100 minus the mean absolute percentage error.
errors = abs(y_pred - y_test)
mape = 100 * (errors / y_test)
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

# NOTE(review): the RMSE / R2 below come from cross-validating a *fresh* model on
# folds of the test split only; they do not score the model trained on X_train.
# The hyper-parameters are typed in by hand (subsample=0.89 vs. the ~0.9 reported
# by the grid search) — confirm this is intended.
best_model = xgb.XGBRegressor(eta=0.11, max_depth=7, colsample_bytree=0.7, subsample=0.89)  # change estimator here

# A single cross_validate call fits every fold once and scores both metrics.
cv_scores = cross_validate(
    best_model, X_test, y_test, cv=tscv,
    scoring=["neg_mean_squared_error", "r2"],
)
# The scorer returns negated MSE, so flip the sign before taking the root.
rmse = np.sqrt(-cv_scores["test_neg_mean_squared_error"])
R2 = cv_scores["test_r2"]

print(f"RMSE: {rmse.mean()} (+/- {rmse.std()})")
print(f"\nR2: {R2.mean()} (+/- {R2.std()})")



RMSPE: 0.2659
Accuracy: 80.38 %.
RMSE: 0.0012990500144779813 (+/- 0.0003015199402314642

R2: 0.7867757386284778 (+/- 0.03369785355775719


## Cross Validation for blocking time series split


In [29]:
# Hyper-parameter grid search for XGBoost, validated with the blocking
# time-series splitter (btscv).
scores0 = []

# Candidate values per hyper-parameter; GridSearchCV tries every combination.
# np.arange excludes the stop value, so e.g. subsample is tried at 0.5, 0.7, ~0.9.
param_grid = {
    "eta": np.arange(0.01, 0.2, 0.05),
    "max_depth": np.arange(3, 10, 4),
    "subsample": np.arange(0.5, 1, 0.2),
    "colsample_bytree": np.arange(0.5, 1, 0.2),
}

xgb_r0 = xgb.XGBRegressor(random_state=42)

# No `scoring` is passed, so candidates are ranked by the estimator's own
# score method.
CV_xgb_r0 = GridSearchCV(estimator=xgb_r0, param_grid=param_grid, cv=btscv)
CV_xgb_r0.fit(X_train, y_train)

# Keep the winning combination and its mean cross-validated score (4 decimals).
best_params0 = CV_xgb_r0.best_params_
best_score0 = round(CV_xgb_r0.best_score_, 4)

### Blocking time series parameters

In [43]:
# Hyper-parameter combination selected by the grid search that used btscv.
best_params0

{'colsample_bytree': 0.8999999999999999,
 'eta': 0.11,
 'max_depth': 3,
 'subsample': 0.7}

In [44]:
# Best mean cross-validated score of the btscv grid search, rounded to 4 decimals.
best_score0

0.8051

### Blocking time series Test

In [63]:

# Hold-out evaluation for the blocking-split configuration: refit on the full
# training split with hand-entered hyper-parameters, then predict the test split.
best_model_1 = xgb.XGBRegressor(eta=0.11, max_depth=3, colsample_bytree=0.89999, subsample=0.7)  # change estimator here

best_model_1.fit(X_train, y_train)

y_pred = best_model_1.predict(X_test)
error = rmspe(y_test, y_pred)
# Label corrected from "RMSEP" so it matches the metric name used elsewhere.
print('RMSPE:', round(error, 4))

# "Accuracy" is defined here as 100 minus the mean absolute percentage error.
errors1 = abs(y_pred - y_test)
mape1 = 100 * (errors1 / y_test)
accuracy1 = 100 - np.mean(mape1)
print('Accuracy:', round(accuracy1, 2), '%.')

# NOTE(review): cross-validation clones the estimator and re-fits it on folds of
# the test split only, so the RMSE / R2 below do not score the model fitted on
# X_train above — confirm this is intended.
# A single cross_validate call fits every fold once and scores both metrics.
cv_scores = cross_validate(
    best_model_1, X_test, y_test, cv=btscv,
    scoring=["neg_mean_squared_error", "r2"],
)
# The scorer returns negated MSE, so flip the sign before taking the root.
rmse_1 = np.sqrt(-cv_scores["test_neg_mean_squared_error"])
R2_1 = cv_scores["test_r2"]

print(f"RMSE: {rmse_1.mean()} (+/- {rmse_1.std()})")
print(f"\nR2: {R2_1.mean()} (+/- {R2_1.std()})")

RMSEP: 0.2781
Accuracy: 79.67 %.
RMSE: 0.0015777406786129408 (+/- 0.0001987526294595541

R2: 0.686232728652745 (+/- 0.06793016477808517
