In [2]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import seaborn as sns 
from tqdm import trange
import sklearn
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, LassoCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor


from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler  

import datetime

import warnings
warnings.filterwarnings('ignore')

In [3]:
# import data and data processing
df = pd.read_csv("basic_feature.csv")
df.head()
df = df[df.time_id != 34]
df = df[df.time_id != 32]
df = df[df.time_id != 4]
df.sort_values(by=["time_id", "stock_id"], inplace=True)
df = df.dropna() 

In [4]:
df.head(5)

Unnamed: 0,stock_id,time_id,target,book.seconds_in_bucket.count,book.wap1.sum,book.wap1.mean,book.wap1.std,book.wap2.sum,book.wap2.mean,book.wap2.std,...,trade_300.order_count.mean,trade_150.log_return.realized_volatility,trade_150.seconds_in_bucket.count,trade_150.size.sum,trade_150.order_count.mean,tick_size,trade.tau,trade_150.tau,book.tau,real_price
0,0,5,0.004136,302.0,303.125061,1.003725,0.000693,303.10553,1.003661,0.000781,...,2.571429,0.001701,30.0,2069.0,2.433333,5.2e-05,0.158114,0.182574,0.057544,193.732286
3830,1,5,0.00634,575.0,577.061646,1.003585,0.001174,577.08313,1.003623,0.001213,...,3.244898,0.002754,78.0,7404.0,3.141026,6.6e-05,0.103142,0.113228,0.041703,152.520145
7660,2,5,0.001848,583.0,583.834351,1.001431,0.000543,583.848999,1.001456,0.000556,...,3.5,0.001092,114.0,21407.0,3.447368,8.1e-05,0.079305,0.093659,0.041416,123.543564
11490,3,5,0.0053,510.0,509.902588,0.999809,0.000804,509.936371,0.999875,0.000814,...,3.972973,0.003087,79.0,16865.0,4.620253,4.4e-05,0.09167,0.112509,0.044281,226.108032
15320,4,5,0.004468,395.0,396.180298,1.002988,0.00063,396.152283,1.002917,0.000758,...,2.862069,0.001834,44.0,2829.0,2.659091,1.6e-05,0.125988,0.150756,0.050315,621.37837


In [5]:
# train and test split
X = df.iloc[:, 3:-4]
y = df.iloc[:,2]

split = int(len(df) * 0.8)

X_train = X[:split]
y_train = y[:split]

X_test = X[split:]
y_test = y[split:]

# ss=StandardScaler() 
# ss.fit(X_train)
# X_train = ss.transform(X_train)
# X_test = ss.transform(X_test)

In [6]:
# define the time series block function
class BlockingTimeSeriesSplit():
    def __init__(self, n_splits):
        self.n_splits = n_splits
    
    def get_n_splits(self, X, y, groups):
        return self.n_splits
    
    def split(self, X, y=None, groups=None):
        n_samples = len(X)
        k_fold_size = n_samples // self.n_splits
        indices = np.arange(n_samples)

        margin = 0
        for i in range(self.n_splits):
            start = i * k_fold_size
            stop = start + k_fold_size
            mid = int(0.8 * (stop - start)) + start
            yield indices[start: mid], indices[mid + margin: stop]
            
def rmspe_score(estimator, X, y):
    '''
    Compute Root Mean Square Percentage Error between two arrays.
    '''
    y_pred = estimator.predict(X)
    loss = np.sqrt(np.mean(np.square(((y - y_pred) / y)), axis=0))

    return loss

In [10]:
# time series one time test
# model = Lasso() # change estimator here
tscv = TimeSeriesSplit(n_splits=5)
# rmse = np.sqrt(-cross_val_score(model, X_train, y_train, cv=tscv, scoring='neg_mean_squared_error'))
# R2 = cross_val_score(model, X_train, y_train, cv=tscv, scoring='r2')
# rmspe1 = cross_val_score(model, X_train, y_train, cv=tscv, scoring=rmspe_score)

# print(f"RMSE: {rmse.mean()} (+/- {rmse.std()}")
# print(f"RMSPE: {rmspe1.mean()} (+/- {rmspe1.std()}")
# print(f"R2: {R2.mean()} (+/- {R2.std()}")

In [11]:
# blocking time series one time test
# model = Lasso(alpha=0.1) # change estimator here
btscv = BlockingTimeSeriesSplit(n_splits=5)
# rmse = np.sqrt(-cross_val_score(model, X_train, y_train, cv=btscv, scoring='neg_mean_squared_error'))
# R2 = cross_val_score(model, X_train, y_train, cv=btscv, scoring='r2')
# rmspe1 = cross_val_score(model, X_train, y_train, cv=btscv, scoring=rmspe_score)

# print(f"RMSE: {rmse.mean()} (+/- {rmse.std()}")
# print(f"RMSPE: {rmspe1.mean()} (+/- {rmspe1.std()}")
# print(f"R2: {R2.mean()} (+/- {R2.std()}")

In [21]:
# cross validation for time series split
starttime = datetime.datetime.now()

lasso_alphas = np.linspace(0, 0.05, 21)
lasso = LassoCV(alphas=lasso_alphas, cv=tscv, n_jobs=-1)
lasso.fit(X_train, y_train)
best_params0 = lasso.alpha_
endtime = datetime.datetime.now()
print((endtime - starttime).seconds)



263


In [22]:
best_params0

0.0

In [23]:
# accuracy on test set
pred=lasso.predict(X_test)
errors = abs(pred - y_test)
mape = 100 * (errors / y_test)
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

rmse = np.sqrt(-cross_val_score(lasso, X_test, y_test, scoring='neg_mean_squared_error'))
R2 = cross_val_score(lasso, X_test, y_test, scoring='r2')
rmspe1 = cross_val_score(lasso, X_test, y_test, scoring=rmspe_score)

print(f"RMSE: {rmse.mean()} (+/- {rmse.std()}")
print(f"RMSPE: {rmspe1.mean()} (+/- {rmspe1.std()}")
print(f"R2: {R2.mean()} (+/- {R2.std()}")

Accuracy: 78.13 %.
RMSE: 0.001197522313830859 (+/- 8.1448201411084e-05
RMSPE: 0.2916903996290261 (+/- 0.005126228215021463
R2: 0.8124597510002607 (+/- 0.010088335796412773


In [24]:
# cross validation for blocking time series split
starttime = datetime.datetime.now()
lasso_alphas = np.linspace(0, 0.05, 21)
lasso = LassoCV(alphas=lasso_alphas, cv=btscv, n_jobs=-1)
lasso.fit(X_train, y_train)
best_params0 = lasso.alpha_
endtime = datetime.datetime.now()
print((endtime - starttime).seconds)

222


In [25]:
best_params0

0.0

In [26]:
pred = lasso.predict(X_test)
errors = abs(pred - y_test)
mape = 100 * (errors / y_test)
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

rmse = np.sqrt(-cross_val_score(lasso, X_test, y_test, scoring='neg_mean_squared_error'))
R2 = cross_val_score(lasso, X_test, y_test, scoring='r2')
rmspe1 = cross_val_score(lasso, X_test, y_test, scoring=rmspe_score)

print(f"RMSE: {rmse.mean()} (+/- {rmse.std()}")
print(f"RMSPE: {rmspe1.mean()} (+/- {rmspe1.std()}")
print(f"R2: {R2.mean()} (+/- {R2.std()}")

Accuracy: 78.13 %.
RMSE: 0.001197522313830859 (+/- 8.1448201411084e-05
RMSPE: 0.2916903996290261 (+/- 0.005126228215021463
R2: 0.8124597510002607 (+/- 0.010088335796412773
