In [1]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import seaborn as sns 
from tqdm import trange
import sklearn
from sklearn.linear_model import ElasticNet
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import datetime

import warnings
warnings.filterwarnings('ignore')

In [2]:
# import data and data processing
df = pd.read_csv("basic_feature.csv")

# Drop the three excluded time buckets in one pass.
# NOTE(review): the reason for excluding time_id 34, 32 and 4 is not recorded here -- confirm.
df = df[~df["time_id"].isin([34, 32, 4])]

# Sort by time_id, then stock_id, and discard every row containing a missing
# value. No in-place mutation, so re-running the cell is safe.
df = df.sort_values(by=["time_id", "stock_id"]).dropna()  # df is the full processed data

# reset_index() (without drop=True) keeps the previous index as a leading
# "index" column; the positional column slicing used for the train/test split
# depends on that layout, so it is preserved on purpose.
small_data = df.reset_index()

In [3]:
# train and test split
# Features: columns from position 4 up to (not including) the last four.
# Target: the column at position 3.
# NOTE(review): these offsets assume a fixed column layout in small_data -- confirm.
X, y = small_data.iloc[:, 4:-4], small_data.iloc[:, 3]

# The first 80% of the rows (in their current order) are used for training,
# the remaining rows are held out for testing. No shuffling is applied.
split = int(0.8 * len(small_data))

X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]

In [4]:
# define the blocking time-series cross-validation splitter and the RMSPE scorer
class BlockingTimeSeriesSplit():
    """Cross-validation splitter that cuts the samples into disjoint consecutive blocks.

    The first ``n_splits * (n_samples // n_splits)`` samples are divided into
    ``n_splits`` equally sized, non-overlapping blocks; any remaining trailing
    samples are not used. Inside each block the leading ``train_fraction`` of
    the rows forms the training fold and the rest (after skipping ``margin``
    rows) forms the test fold.

    Parameters
    ----------
    n_splits : int
        Number of blocks / folds to generate.
    margin : int, default 0
        Number of rows skipped between the end of the training fold and the
        start of the test fold within each block.
    train_fraction : float, default 0.8
        Share of each block assigned to the training fold.
    """

    def __init__(self, n_splits, margin=0, train_fraction=0.8):
        self.n_splits = n_splits
        self.margin = margin
        self.train_fraction = train_fraction

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds; the arguments are accepted but ignored."""
        return self.n_splits

    def split(self, X, y=None, groups=None):
        """Yield ``(train_indices, test_indices)`` integer arrays, one pair per block.

        Only ``len(X)`` is used; ``y`` and ``groups`` are ignored.

        Raises
        ------
        ValueError
            If ``n_splits`` is smaller than 1 or larger than the number of
            samples (which would otherwise produce empty folds).
        """
        n_samples = len(X)
        if not 1 <= self.n_splits <= n_samples:
            raise ValueError(
                f"n_splits={self.n_splits} must be between 1 and the number "
                f"of samples ({n_samples})."
            )

        k_fold_size = n_samples // self.n_splits
        indices = np.arange(n_samples)

        for i in range(self.n_splits):
            start = i * k_fold_size
            stop = start + k_fold_size
            # Boundary between the training part and the test part of this block.
            mid = int(self.train_fraction * (stop - start)) + start
            yield indices[start: mid], indices[mid + self.margin: stop]
            
def rmspe_score(estimator, X, y):
    '''
    Root Mean Square Percentage Error of an estimator's predictions.

    Takes a fitted ``estimator``, calls ``estimator.predict(X)`` and compares
    the result against ``y``. The return value is
    ``sqrt(mean(((y - y_pred) / y) ** 2))`` reduced along axis 0, so lower is
    better. Because every error is divided by ``y``, zero targets produce
    non-finite values.
    '''
    predicted = estimator.predict(X)
    relative_error = (y - predicted) / y
    mean_squared_relative_error = np.mean(np.square(relative_error), axis=0)
    return np.sqrt(mean_squared_relative_error)

In [53]:
# time series one time test
# A fixed random_state makes the cell reproducible and ensures that the two
# cross_val_score calls below evaluate identically built forests.
model = RandomForestRegressor(n_estimators=50, random_state=42) # change estimator here
tscv = TimeSeriesSplit(n_splits=5)
# cross_val_score returns the negated MSE per fold; negate and take the root to get RMSE.
rmse = np.sqrt(-cross_val_score(model, X_train, y_train, cv=tscv, scoring='neg_mean_squared_error'))
R2 = cross_val_score(model, X_train, y_train, cv=tscv, scoring='r2')

# Mean and standard deviation across the folds (closing parenthesis was missing before).
print(f"RMSE: {rmse.mean()} (+/- {rmse.std()})")
print(f"\nR2: {R2.mean()} (+/- {R2.std()})")

RMSE: 0.0012888884966440934 (+/- 5.607313682553805e-05

R2: 0.806965683043155 (+/- 0.016942264267262787


In [54]:
# blocking time series one time test
# A fixed random_state makes the cell reproducible and ensures that the two
# cross_val_score calls below evaluate identically built forests.
model = RandomForestRegressor(n_estimators=50, random_state=42)
btscv = BlockingTimeSeriesSplit(n_splits=5)
# cross_val_score returns the negated MSE per fold; negate and take the root to get RMSE.
rmse = np.sqrt(-cross_val_score(model, X_train, y_train, cv=btscv, scoring='neg_mean_squared_error'))
R2 = cross_val_score(model, X_train, y_train, cv=btscv, scoring='r2')

# Mean and standard deviation across the folds (closing parenthesis was missing before).
print(f"RMSE: {rmse.mean()} (+/- {rmse.std()})")
print(f"\nR2: {R2.mean()} (+/- {R2.std()})")




RMSE: 0.0013747790340590204 (+/- 0.00018207947611048532

R2: 0.8073503461818203 (+/- 0.013294027030116803


In [5]:
# Cross-validation splitters used by the grid searches: scikit-learn's
# TimeSeriesSplit and the custom blocking splitter, five folds each.
tscv = TimeSeriesSplit(n_splits=5)
btscv = BlockingTimeSeriesSplit(n_splits=5)

In [6]:
# cross validation for time series split
starttime = datetime.datetime.now()
scores = []
rfc = RandomForestRegressor(random_state=42)

# Hyper-parameter grid searched exhaustively by GridSearchCV.
# 'auto' was removed from max_features: for a regressor forest it meant
# "use all features" (the same as None), so it only duplicated work, and newer
# scikit-learn releases reject the value.
param_grid = {
    'n_estimators': [5, 10, 20],
    'max_depth': [None, 3, 6, 10],
    'max_features': ['sqrt', None],
    'max_leaf_nodes': [None, 10],
    'min_samples_split': [2],
    'bootstrap': [True, False]
}

# scoring is left at its default, i.e. the estimator's own score method.
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=tscv, n_jobs=-1)

CV_rfc.fit(X_train, y_train)
best_params0 = CV_rfc.best_params_
best_score0 = round(CV_rfc.best_score_,4)
endtime = datetime.datetime.now()
# total_seconds() covers the whole duration; .seconds is only the seconds
# component of the timedelta and wraps after one day.
print(int((endtime - starttime).total_seconds()))



11555


In [7]:
# Display the best hyper-parameter combination stored from the most recently fitted grid search.
best_params0

{'bootstrap': False,
 'max_depth': 10,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'min_samples_split': 2,
 'n_estimators': 20}

In [8]:
# Display the best mean cross-validated score (rounded to 4 decimals) from the most recently fitted grid search.
best_score0

0.8063

In [9]:
# accuracy on test set
# "Accuracy" here is 100 minus the mean percentage error of the grid search's
# predictions on the held-out rows.
pred = CV_rfc.predict(X_test)
errors = np.abs(pred - y_test)
mape = (errors / y_test) * 100
accuracy = 100 - mape.mean()
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 79.6 %.


In [6]:
# cross validation for blocking time series split
starttime = datetime.datetime.now()
scores0 = []
rfc = RandomForestRegressor(random_state=42)

# Hyper-parameter grid searched exhaustively by GridSearchCV.
# 'auto' was removed from max_features: for a regressor forest it meant
# "use all features" (the same as None), so it only duplicated work, and newer
# scikit-learn releases reject the value.
param_grid = {
    'n_estimators': [5, 10, 20],
    'max_depth': [None, 3, 6, 10],
    'max_features': ['sqrt', None],
    'max_leaf_nodes': [None, 10],
    'min_samples_split': [2],
    'bootstrap': [True, False]
}

# scoring is left at its default, i.e. the estimator's own score method.
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=btscv, n_jobs=-1)

CV_rfc.fit(X_train, y_train)
best_params0 = CV_rfc.best_params_
best_score0 = round(CV_rfc.best_score_,4)
endtime = datetime.datetime.now()
# total_seconds() covers the whole duration; .seconds is only the seconds
# component of the timedelta and wraps after one day.
print(int((endtime - starttime).total_seconds()))



3130


In [7]:
# Display the best hyper-parameter combination stored from the most recently fitted grid search.
best_params0

{'bootstrap': True,
 'max_depth': 10,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'min_samples_split': 2,
 'n_estimators': 20}

In [8]:
# Display the best mean cross-validated score (rounded to 4 decimals) from the most recently fitted grid search.
best_score0

0.8077

In [9]:
# "Accuracy" here is 100 minus the mean percentage error of the grid search's
# predictions on the held-out rows.
pred = CV_rfc.predict(X_test)
errors = np.abs(pred - y_test)
mape = (errors / y_test) * 100
accuracy = 100 - mape.mean()
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 79.62 %.


In [11]:
# time series one time test
# Forest configured with the hyper-parameters selected by the TimeSeriesSplit
# grid search. A fixed random_state makes the cell reproducible and ensures
# that all three metrics below are computed on identically built forests.
model = RandomForestRegressor(
    n_estimators=20,
    bootstrap=False,
    max_depth=10,
    max_features='sqrt',
    max_leaf_nodes=None,
    min_samples_split=2,
    random_state=42,
) # change estimator here
tscv = TimeSeriesSplit(n_splits=5)
# cross_val_score returns the negated MSE per fold; negate and take the root to get RMSE.
rmse = np.sqrt(-cross_val_score(model, X_train, y_train, cv=tscv, scoring='neg_mean_squared_error'))
R2 = cross_val_score(model, X_train, y_train, cv=tscv, scoring='r2')
# rmspe_score is a loss (lower is better); it is only reported here, not optimised.
rmspe1 = cross_val_score(model, X_train, y_train, cv=tscv, scoring=rmspe_score)

# Mean and standard deviation across the folds (closing parenthesis was missing before).
print(f"RMSE: {rmse.mean()} (+/- {rmse.std()})")
print(f"RMSPE: {rmspe1.mean()} (+/- {rmspe1.std()})")
print(f"R2: {R2.mean()} (+/- {R2.std()})")

RMSE: 0.0012903429922028553 (+/- 6.573430638641215e-05
RMSPE: 0.2871875217121766 (+/- 0.011678126551927147
R2: 0.806037781550508 (+/- 0.01521774466403363


In [12]:
# blocking time series one time test
# Forest configured with the hyper-parameters selected by the blocking-split
# grid search. A fixed random_state makes the cell reproducible and ensures
# that all three metrics below are computed on identically built forests.
model = RandomForestRegressor(
    n_estimators=20,
    bootstrap=True,
    max_depth=10,
    max_features='sqrt',
    max_leaf_nodes=None,
    min_samples_split=2,
    random_state=42,
)
btscv = BlockingTimeSeriesSplit(n_splits=5)
# cross_val_score returns the negated MSE per fold; negate and take the root to get RMSE.
rmse = np.sqrt(-cross_val_score(model, X_train, y_train, cv=btscv, scoring='neg_mean_squared_error'))
R2 = cross_val_score(model, X_train, y_train, cv=btscv, scoring='r2')
# rmspe_score is a loss (lower is better); it is only reported here, not optimised.
rmspe1 = cross_val_score(model, X_train, y_train, cv=btscv, scoring=rmspe_score)

# Mean and standard deviation across the folds (closing parenthesis was missing before).
print(f"RMSE: {rmse.mean()} (+/- {rmse.std()})")
print(f"RMSPE: {rmspe1.mean()} (+/- {rmspe1.std()})")
print(f"R2: {R2.mean()} (+/- {R2.std()})")

RMSE: 0.0013800998520051148 (+/- 0.00019534878769453018
RMSPE: 0.30623768653244676 (+/- 0.038053357428005086
R2: 0.8056468988118383 (+/- 0.015499334149333589
