In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from matplotlib.patches import Patch
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.multioutput import MultiOutputRegressor
from tqdm import trange

In [None]:
# Load the engineered feature table and prepare it for modelling.

# time_id values removed before modelling. The reason is not recorded in this
# notebook -- TODO(review): document why these ids are excluded.
EXCLUDED_TIME_IDS = [34, 32, 4]

df = (
    pd.read_csv("basic_feature.csv")
    .loc[lambda frame: ~frame["time_id"].isin(EXCLUDED_TIME_IDS)]
    # Order rows by time_id, then stock_id; the positional splits used by the
    # later cells depend on this row order.
    .sort_values(by=["time_id", "stock_id"])
    # Drop every row that still contains a missing value.
    .dropna()
)

In [None]:
# NOTE(review): `small_data` is not assigned in any cell visible in this
# notebook, so unless it is defined elsewhere this cell fails with NameError on
# a fresh kernel. Presumably it is a subsample of the feature table created in
# a since-deleted cell -- confirm, then restore its definition or drop this cell.
small_data

Unnamed: 0,index,stock_id,time_id,target,book.seconds_in_bucket.count,book.wap1.sum,book.wap1.mean,book.wap1.std,book.wap2.sum,book.wap2.mean,...,trade_300.order_count.mean,trade_150.log_return.realized_volatility,trade_150.seconds_in_bucket.count,trade_150.size.sum,trade_150.order_count.mean,tick_size,trade.tau,trade_150.tau,book.tau,real_price
0,0,0,5,0.004136,302.0,303.125061,1.003725,0.000693,303.105530,1.003661,...,2.571429,0.001701,30.0,2069.0,2.433333,0.000052,0.158114,0.182574,0.057544,193.732286
1,3830,1,5,0.006340,575.0,577.061646,1.003585,0.001174,577.083130,1.003623,...,3.244898,0.002754,78.0,7404.0,3.141026,0.000066,0.103142,0.113228,0.041703,152.520145
2,7660,2,5,0.001848,583.0,583.834351,1.001431,0.000543,583.848999,1.001456,...,3.500000,0.001092,114.0,21407.0,3.447368,0.000081,0.079305,0.093659,0.041416,123.543564
3,11490,3,5,0.005300,510.0,509.902588,0.999809,0.000804,509.936371,0.999875,...,3.972973,0.003087,79.0,16865.0,4.620253,0.000044,0.091670,0.112509,0.044281,226.108032
4,15320,4,5,0.004468,395.0,396.180298,1.002988,0.000630,396.152283,1.002917,...,2.862069,0.001834,44.0,2829.0,2.659091,0.000016,0.125988,0.150756,0.050315,621.378370
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,356206,105,381,0.001830,261.0,261.079132,1.000303,0.000250,261.075745,1.000290,...,5.650000,0.001149,29.0,8510.0,5.310345,0.000202,0.158114,0.185695,0.061898,49.431986
4996,360036,107,381,0.001719,298.0,298.106903,1.000359,0.000664,298.072266,1.000243,...,2.764706,0.001086,30.0,3239.0,3.066667,0.000110,0.162221,0.182574,0.057928,91.279739
4997,363866,108,381,0.001952,586.0,585.958801,0.999930,0.000656,585.941040,0.999899,...,6.750000,0.001178,39.0,34979.0,5.897436,0.000298,0.121268,0.160128,0.041310,33.541016
4998,367696,109,381,0.003325,229.0,228.869080,0.999428,0.000452,228.847916,0.999336,...,2.842105,0.000962,25.0,3232.0,3.000000,0.000034,0.169031,0.200000,0.066082,293.307972


In [None]:
# Feature matrix and target.
#
# Columns are selected by name instead of by position. Positional selection
# silently picks the wrong column whenever the CSV layout shifts: the stored
# hold-out output further down shows integer, count-like values for y rather
# than the small volatility-scale values of `target`.
# Assumes basic_feature.csv has a `target` column, as in the `small_data`
# preview -- TODO(review): confirm.
TARGET_COLUMN = "target"
FIRST_FEATURE_COLUMN = "book.wap1.sum"
LAST_FEATURE_COLUMN = "tick_size"

# Label slices include both end points, so this keeps every column from
# book.wap1.sum through tick_size. The drop is a guard: it is a no-op unless
# the target happens to sit inside that range.
X = (
    df.loc[:, FIRST_FEATURE_COLUMN:LAST_FEATURE_COLUMN]
    .drop(columns=[TARGET_COLUMN], errors="ignore")
)
y = df[TARGET_COLUMN]

In [None]:
# define the time series block function
class BlockingTimeSeriesSplit():
    """Cross-validation splitter that cuts the samples into consecutive blocks.

    The samples are divided, in their given order, into ``n_splits`` blocks of
    ``len(X) // n_splits`` rows each. Inside every block the leading part is
    the training set and the trailing part is the test set, so a fold never
    trains on rows that belong to another block. Rows left over when the
    number of samples is not divisible by ``n_splits`` are not used by any
    fold.

    Parameters
    ----------
    n_splits : int
        Number of blocks (and therefore folds). Must be at least 1.
    margin : int, default=0
        Number of rows skipped between the end of the training part and the
        start of the test part of each block.
    train_fraction : float, default=0.8
        Fraction of each block (rounded down to whole rows) used for training.
    """

    def __init__(self, n_splits, margin=0, train_fraction=0.8):
        self.n_splits = n_splits
        self.margin = margin
        self.train_fraction = train_fraction

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds; all arguments are accepted and ignored."""
        return self.n_splits

    def split(self, X, y=None, groups=None):
        """Yield ``(train_indices, test_indices)`` integer arrays per block.

        Only ``len(X)`` is used; ``y`` and ``groups`` are ignored.

        Raises
        ------
        ValueError
            If ``n_splits`` is smaller than 1 or larger than the number of
            samples (which would give empty blocks). Because this method is a
            generator, the error is raised when iteration starts.
        """
        if self.n_splits < 1:
            raise ValueError(
                f"n_splits must be at least 1, got {self.n_splits}."
            )

        n_samples = len(X)
        if n_samples < self.n_splits:
            raise ValueError(
                f"Cannot have number of splits n_splits={self.n_splits} "
                f"greater than the number of samples: n_samples={n_samples}."
            )

        k_fold_size = n_samples // self.n_splits
        indices = np.arange(n_samples)

        for i in range(self.n_splits):
            start = i * k_fold_size
            stop = start + k_fold_size
            # Boundary between the training and test parts of this block.
            mid = int(self.train_fraction * (stop - start)) + start
            yield indices[start:mid], indices[mid + self.margin:stop]

In [None]:
def rmspe_score(estimator, X, y):
    """Root mean square percentage error of ``estimator`` on ``(X, y)``.

    Takes ``(estimator, X, y)`` so it can be passed as a callable ``scoring``
    argument. The returned value is an error, so lower is better. There is
    no guard against zeros in ``y``, which is used as the divisor.
    """
    relative_error = (y - estimator.predict(X)) / y
    mean_squared_relative_error = np.mean(np.square(relative_error), axis=0)
    return np.sqrt(mean_squared_relative_error)

In [None]:
# Cross-validate once with scikit-learn's TimeSeriesSplit.
model = LinearRegression()  # change estimator here
tscv = TimeSeriesSplit(n_splits=5)

# A single cross_validate call fits each fold once and scores all three
# metrics on that fit, instead of refitting the model for every metric.
cv_scores = cross_validate(
    model,
    X,
    y,
    cv=tscv,
    scoring={
        "neg_mse": "neg_mean_squared_error",
        "r2": "r2",
        "rmspe": rmspe_score,
    },
)

# neg_mean_squared_error is negated MSE, so flip the sign before the root.
rmse = np.sqrt(-cv_scores["test_neg_mse"])
R2 = cv_scores["test_r2"]
rmspe = cv_scores["test_rmspe"]

print(f"RMSE: {rmse.mean()} (+/- {rmse.std()})")
print(f"\nR2: {R2.mean()} (+/- {R2.std()})")
print(f"\nRMSPE: {rmspe.mean()} (+/- {rmspe.std()})")

RMSE: 0.001282879591966683 (+/- 5.4634028272414716e-05

R2: 0.8054444059780085 (+/- 0.014998377267878569

RMSPE: 0.2979117952224922 (+/- 0.006018571238386129


In [None]:
# Cross-validate once with the blocking time-series splitter defined above.
model = LinearRegression()  # change estimator here
bscv = BlockingTimeSeriesSplit(n_splits=5)

# A single cross_validate call fits each fold once and scores all three
# metrics on that fit, instead of refitting the model for every metric.
cv_scores = cross_validate(
    model,
    X,
    y,
    cv=bscv,
    scoring={
        "neg_mse": "neg_mean_squared_error",
        "r2": "r2",
        "rmspe": rmspe_score,
    },
)

# neg_mean_squared_error is negated MSE, so flip the sign before the root.
rmse = np.sqrt(-cv_scores["test_neg_mse"])
R2 = cv_scores["test_r2"]
rmspe = cv_scores["test_rmspe"]

print(f"RMSE: {rmse.mean()} (+/- {rmse.std()})")
print(f"\nR2: {R2.mean()} (+/- {R2.std()})")
print(f"\nRMSPE: {rmspe.mean()} (+/- {rmspe.std()})")

RMSE: 0.0012787908209658406 (+/- 8.241158828527712e-05

R2: 0.8083180597305333 (+/- 0.01446440733806704

RMSPE: 0.2996188702050132 (+/- 0.012622552587340926


In [None]:
# Ordered hold-out: the first 80% of rows (in df's current row order) are used
# for fitting, the remaining rows for testing.
split = int(len(df) * 0.8)

X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]

# `model` is the estimator created in the cross-validation cell above.
lm = model.fit(X_train, y_train)
y_pred = lm.predict(X_test)

# Row 0 holds the predictions, row 1 the observed values.
pd.DataFrame([y_pred, y_test])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,85437,85438,85439,85440,85441,85442,85443,85444,85445,85446
0,566.004295,291.006971,203.004389,253.965895,377.866945,583.063664,479.941535,319.932624,367.375738,594.953007,...,220.011193,291.002119,246.013742,521.005976,542.990979,316.983982,508.993885,590.972939,383.997752,216.991556
1,566.0,291.0,203.0,254.0,378.0,583.0,480.0,320.0,367.0,595.0,...,220.0,291.0,246.0,521.0,543.0,317.0,509.0,591.0,384.0,217.0


In [None]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

Unnamed: 0,book.wap1.sum,book.wap1.mean,book.wap1.std,book.wap2.sum,book.wap2.mean,book.wap2.std,book.log_return1.sum,book.log_return1.realized_volatility,book.log_return1.mean,book.log_return1.std,...,trade_450.order_count.mean,trade_300.log_return.realized_volatility,trade_300.seconds_in_bucket.count,trade_300.size.sum,trade_300.order_count.mean,trade_150.log_return.realized_volatility,trade_150.seconds_in_bucket.count,trade_150.size.sum,trade_150.order_count.mean,tick_size
0,303.125061,1.003725,0.000693,303.10553,1.003661,0.000781,0.002292,0.004499,8e-06,0.00026,...,2.642857,0.001308,21.0,1587.0,2.571429,0.001701,30.0,2069.0,2.433333,5.2e-05
3830,577.061646,1.003585,0.001174,577.08313,1.003623,0.001213,0.002342,0.006245,4e-06,0.000261,...,3.5,0.00213,49.0,3163.0,3.244898,0.002754,78.0,7404.0,3.141026,6.6e-05
7660,583.834351,1.001431,0.000543,583.848999,1.001456,0.000556,0.001688,0.001787,3e-06,7.4e-05,...,3.761905,0.000971,80.0,15847.0,3.5,0.001092,114.0,21407.0,3.447368,8.1e-05
11490,509.902588,0.999809,0.000804,509.936371,0.999875,0.000814,-0.00103,0.006113,-2e-06,0.000271,...,4.611111,0.002469,37.0,6146.0,3.972973,0.003087,79.0,16865.0,4.620253,4.4e-05
15320,396.180298,1.002988,0.00063,396.152283,1.002917,0.000758,0.002126,0.00564,5e-06,0.000284,...,2.684211,0.001589,29.0,2259.0,2.862069,0.001834,44.0,2829.0,2.659091,1.6e-05
