In [47]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import seaborn as sns 
from tqdm import trange
import sklearn
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, LassoCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor



from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler  

import datetime

import warnings
warnings.filterwarnings('ignore')

In [2]:
# import data and data processing
df = pd.read_csv("basic_feature.csv")
df.head()
df = df[df.time_id != 34]
df = df[df.time_id != 32]
df = df[df.time_id != 4]
df.sort_values(by=["time_id", "stock_id"], inplace=True)
df = df.dropna() 

In [23]:
# # train and test split
X = df.iloc[:, 3:-4]
y = df.iloc[:,2]

split = int(len(df) * 0.8)

X_train = X[:split]
y_train = y[:split]

X_test = X[split:]
y_test = y[split:]

In [5]:
# define the time series block function
class BlockingTimeSeriesSplit():
    def __init__(self, n_splits):
        self.n_splits = n_splits
    
    def get_n_splits(self, X, y, groups):
        return self.n_splits
    
    def split(self, X, y=None, groups=None):
        n_samples = len(X)
        k_fold_size = n_samples // self.n_splits
        indices = np.arange(n_samples)

        margin = 0
        for i in range(self.n_splits):
            start = i * k_fold_size
            stop = start + k_fold_size
            mid = int(0.8 * (stop - start)) + start
            yield indices[start: mid], indices[mid + margin: stop]
            
def rmspe_score(estimator, X, y):
    '''
    Compute Root Mean Square Percentage Error between two arrays.
    '''
    y_pred = estimator.predict(X)
    loss = np.sqrt(np.mean(np.square(((y - y_pred) / y)), axis=0))

    return loss

In [6]:
# time series one time test
model = AdaBoostRegressor(base_estimator = DecisionTreeRegressor(max_depth=10), n_estimators=15, learning_rate = 0.04) # change estimator here
tscv = TimeSeriesSplit(n_splits=5)
rmse = np.sqrt(-cross_val_score(model, X_train, y_train, cv=tscv, scoring='neg_mean_squared_error'))
R2 = cross_val_score(model, X_train, y_train, cv=tscv, scoring='r2')
rmspe1 = cross_val_score(model, X_train, y_train, cv=tscv, scoring=rmspe_score)

print(f"RMSE: {rmse.mean()} (+/- {rmse.std()}")
print(f"RMSPE: {rmspe1.mean()} (+/- {rmspe1.std()}")
print(f"R2: {R2.mean()} (+/- {R2.std()}")

In [7]:
# blocking time series one time test
model = AdaBoostRegressor(n_estimators=50) # change estimator here
btscv = BlockingTimeSeriesSplit(n_splits=5)
rmse = np.sqrt(-cross_val_score(model, X_train, y_train, cv=btscv, scoring='neg_mean_squared_error'))
R2 = cross_val_score(model, X_train, y_train, cv=btscv, scoring='r2')
rmspe1 = cross_val_score(model, X_train, y_train, cv=btscv, scoring=rmspe_score)

print(f"RMSE: {rmse.mean()} (+/- {rmse.std()}")
print(f"RMSPE: {rmspe1.mean()} (+/- {rmspe1.std()}")
print(f"R2: {R2.mean()} (+/- {R2.std()}")

In [8]:
# cross validation for time series split
starttime = datetime.datetime.now()
scores = []
adaboost = AdaBoostRegressor()
param_grid = { 
    'n_estimators': [13, 15, 17, 20,25,30,50],
    'learning_rate': [0.02, 0.03, 0.04, 0.05, 0.08,1]
}

CV_adaboost = GridSearchCV(estimator=adaboost, param_grid=param_grid, cv=tscv, n_jobs=-1)
    

CV_adaboost.fit(X_train, y_train)
best_params0 = CV_adaboost.best_params_
best_score0 = round(CV_adaboost.best_score_,4)
endtime = datetime.datetime.now()
print((endtime - starttime).seconds)



7


In [9]:
best_params0

{'n_estimators': 5}

In [10]:
# accuracy on test set
pred=CV_adaboost.predict(X_test)
errors = abs(pred - y_test)
mape = 100 * (errors / y_test)
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 77.93 %.


In [11]:
rmse = np.sqrt(-cross_val_score(CV_adaboost, X_test, y_test, scoring='neg_mean_squared_error'))
R2 = cross_val_score(CV_adaboost, X_test, y_test, scoring='r2')
rmspe1 = cross_val_score(CV_adaboost, X_test, y_test, scoring=rmspe_score)

print(f"RMSE: {rmse.mean()} (+/- {rmse.std()}")
print(f"RMSPE: {rmspe1.mean()} (+/- {rmspe1.std()}")
print(f"R2: {R2.mean()} (+/- {R2.std()}")

RMSE: 0.0019355492614362535 (+/- 0.0004101998101725369
RMSPE: 0.3797029947463796 (+/- 0.11876952871714072
R2: 0.6362143666743376 (+/- 0.13475469686056257


In [43]:
# cross validation for blocking time series split
starttime = datetime.datetime.now()
scores = []
adaboost = AdaBoostRegressor()
# param_grid = { 
#     'n_estimators': [3, 4, 5, 7, 10, 15],
#     'learning_rate': [0.02,0.04,0.08,0.1]
# }

param_grid = { 
    'n_estimators': [20, 30, 50, 100, 150, 200],
    'learning_rate': [0.001, 0.01, 0.04, 0.08, 0.1, 0.5, 1, 2]
}

CV_adaboost = GridSearchCV(estimator=adaboost, param_grid=param_grid, cv=btscv, n_jobs=-1)
    

CV_adaboost.fit(X_train, y_train)
best_params0 = CV_adaboost.best_params_
best_score0 = round(CV_adaboost.best_score_,4)
endtime = datetime.datetime.now()
print((endtime - starttime).seconds)


384


In [44]:
best_params0

{'learning_rate': 0.04, 'n_estimators': 50}

In [61]:
CV_adaboost = AdaBoostRegressor(base_estimator = DecisionTreeRegressor(max_depth=10), n_estimators=15, learning_rate = 0.04)
tscv = TimeSeriesSplit(n_splits=5)
rmse = np.sqrt(-cross_val_score(CV_adaboost, X_train, y_train, cv=tscv, scoring='neg_mean_squared_error'))
R2 = cross_val_score(CV_adaboost, X_train, y_train, cv=tscv, scoring='r2')
rmspe1 = cross_val_score(CV_adaboost, X_train, y_train, cv=tscv, scoring=rmspe_score)

print(f"RMSE: {rmse.mean()} (+/- {rmse.std()}")
print(f"RMSPE: {rmspe1.mean()} (+/- {rmspe1.std()}")
print(f"R2: {R2.mean()} (+/- {R2.std()}")


CV_adaboost.fit(X_train, y_train)

pred=CV_adaboost.predict(X_test)
errors = abs(pred - y_test)
mape = 100 * (errors / y_test)
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

rmse = np.sqrt(-cross_val_score(CV_adaboost, X_test, y_test,scoring='neg_mean_squared_error'))
R2 = cross_val_score(CV_adaboost, X_test, y_test, scoring='r2')
rmspe1 = cross_val_score(CV_adaboost, X_test, y_test, scoring=rmspe_score)

print(f"RMSE: {rmse.mean()} (+/- {rmse.std()}")
print(f"RMSPE: {rmspe1.mean()} (+/- {rmspe1.std()}")
print(f"R2: {R2.mean()} (+/- {R2.std()}")

RMSE: 0.0012181915478959708 (+/- 5.950669579260648e-05
RMSPE: 0.2691257449087742 (+/- 0.013129589881275142
R2: 0.8107992451137311 (+/- 0.01873333407209977
Accuracy: 80.33 %.
RMSE: 0.0012940195004618308 (+/- 0.00027013676363123597
RMSPE: 0.26115908329468673 (+/- 0.009574116879417414
R2: 0.7901220136419506 (+/- 0.028360895362474013


In [63]:
CV_adaboost = AdaBoostRegressor(base_estimator = DecisionTreeRegressor(max_depth=11), n_estimators=20, learning_rate = 0.08)
btscv = BlockingTimeSeriesSplit(n_splits=5)
rmse = np.sqrt(-cross_val_score(CV_adaboost, X_train, y_train, cv=btscv, scoring='neg_mean_squared_error'))
R2 = cross_val_score(CV_adaboost, X_train, y_train, cv=btscv, scoring='r2')
rmspe1 = cross_val_score(CV_adaboost, X_train, y_train, cv=btscv, scoring=rmspe_score)

print(f"RMSE: {rmse.mean()} (+/- {rmse.std()}")
print(f"RMSPE: {rmspe1.mean()} (+/- {rmspe1.std()}")
print(f"R2: {R2.mean()} (+/- {R2.std()}")


CV_adaboost.fit(X_train, y_train)

pred=CV_adaboost.predict(X_test)
errors = abs(pred - y_test)
mape = 100 * (errors / y_test)
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

rmse = np.sqrt(-cross_val_score(CV_adaboost, X_test, y_test, scoring='neg_mean_squared_error'))
R2 = cross_val_score(CV_adaboost, X_test, y_test, scoring='r2')
rmspe1 = cross_val_score(CV_adaboost, X_test, y_test, scoring=rmspe_score)

print(f"RMSE: {rmse.mean()} (+/- {rmse.std()}")
print(f"RMSPE: {rmspe1.mean()} (+/- {rmspe1.std()}")
print(f"R2: {R2.mean()} (+/- {R2.std()}")

RMSE: 0.0011291532774531268 (+/- 0.00018365996623222173
RMSPE: 0.2744289061161088 (+/- 0.055783087631993605
R2: 0.7985891779026139 (+/- 0.031258922412361285
Accuracy: 80.45 %.
RMSE: 0.0012800806623624063 (+/- 0.0002636545292355239
RMSPE: 0.2595231451304533 (+/- 0.011548983006674821
R2: 0.7941980664724797 (+/- 0.02266933151162883
