In [None]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import seaborn as sns
import lightgbm as lgb
from lightgbm import LGBMRegressor

import sklearn
from sklearn.linear_model import ElasticNet
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [None]:
# import data and data processing
# Rows with time_id 34, 32 or 4 are excluded, the frame is ordered by
# (time_id, stock_id) and rows containing missing values are removed.
# NOTE: reset_index() is called without drop=True on purpose here: it keeps the
# previous index as a leading "index" column, and the positional column slices
# used for X and y further down rely on that layout.
# For quick experiments a subset can be taken with df.head(5000).copy().
df = pd.read_csv("basic_feature.csv")
df = df[~df.time_id.isin([34, 32, 4])]
df = df.sort_values(by=["time_id", "stock_id"])
df = df.dropna()
df = df.reset_index()  # df is the full processed data

In [None]:
# Display the processed frame as this cell's output.
df

In [None]:
# train and test split
# Chronological hold-out: df is sorted by time_id, so the first 80% of the rows
# are used for training and the remaining 20% for testing (no shuffling).
split = int(len(df) * 0.8)

# Features and target are picked by column position.
# NOTE(review): assumes column 3 is the target and columns 4..-5 are the
# features -- confirm against the CSV layout.
X = df.iloc[:, 4:-4]
y = df.iloc[:, 3]

X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]

In [None]:
# define the blocking time series splitter
class BlockingTimeSeriesSplit():
    """Cross-validation splitter that cuts the samples into disjoint, ordered blocks.

    The first ``n_splits * (n_samples // n_splits)`` samples are divided into
    ``n_splits`` contiguous blocks of equal size; any remainder at the end is
    not used. Inside each block the leading ``train_fraction`` of the rows is
    the training fold and the rest (after skipping ``margin`` rows) is the
    test fold, so a fold never contains rows from another block.

    Parameters
    ----------
    n_splits : int
        Number of blocks (folds) to generate.
    margin : int, default=0
        Number of rows skipped between the end of the training part and the
        start of the test part of each block.
    train_fraction : float, default=0.8
        Share of each block assigned to the training fold.
    """

    def __init__(self, n_splits, margin=0, train_fraction=0.8):
        self.n_splits = n_splits
        self.margin = margin
        self.train_fraction = train_fraction

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds.

        The arguments are ignored; they are optional so the method can be
        called the same way as on scikit-learn's own splitters.
        """
        return self.n_splits

    def split(self, X, y=None, groups=None):
        """Yield ``(train_indices, test_indices)`` integer arrays, one pair per block.

        Only ``len(X)`` is used; ``y`` and ``groups`` are accepted for API
        compatibility and ignored.
        """
        n_samples = len(X)
        k_fold_size = n_samples // self.n_splits
        indices = np.arange(n_samples)

        for i in range(self.n_splits):
            start = i * k_fold_size
            stop = start + k_fold_size
            # Boundary between the training and test part of this block.
            mid = int(self.train_fraction * (stop - start)) + start
            yield indices[start: mid], indices[mid + self.margin: stop]

In [None]:
def rmspe_score(estimator, X, y):
    """Root Mean Square Percentage Error of ``estimator`` on ``(X, y)``.

    Uses the ``scorer(estimator, X, y)`` calling convention: the estimator's
    predictions for ``X`` are compared with the true targets ``y`` and the
    square root of the mean squared relative error is returned.
    """
    predictions = estimator.predict(X)
    relative_error = (y - predictions) / y
    mean_squared_relative = np.mean(np.square(relative_error), axis=0)
    return np.sqrt(mean_squared_relative)

In [None]:
# time series one time test
# Baseline LightGBM with default hyper-parameters, scored with an
# expanding-window TimeSeriesSplit on the training data.
model = LGBMRegressor(random_state=0) # change estimator here
tscv = TimeSeriesSplit(n_splits=5)
# The scorer returns negative MSE, so flip the sign before taking the root.
rmse = np.sqrt(-cross_val_score(model, X_train, y_train, cv=tscv, scoring='neg_mean_squared_error'))
R2 = cross_val_score(model, X_train, y_train, cv=tscv, scoring='r2')

# RMSPE is computed and printed in the following cell; printing it here would
# raise a NameError on a fresh kernel because `rmspe` does not exist yet.
print(f"RMSE: {rmse.mean()} (+/- {rmse.std()})")
print(f"\nR2: {R2.mean()} (+/- {R2.std()})")

In [None]:
# RMSPE of the baseline model, using the custom scorer on the same time-series folds.
rmspe = cross_val_score(model, X_train, y_train, cv=tscv, scoring=rmspe_score)
print(f"RMSPE: {rmspe.mean()} (+/- {rmspe.std()})")

In [None]:
# time series one time test
# LightGBM with hand-set hyper-parameters, scored with TimeSeriesSplit.
CV_lgb = LGBMRegressor(random_state=0, n_estimators = 200, learning_rate = 0.05, num_leaves = 100 ) # change estimator here
tscv = TimeSeriesSplit(n_splits=5)
# The scorer returns negative MSE, so flip the sign before taking the root.
rmse = np.sqrt(-cross_val_score(CV_lgb, X_train, y_train, cv=tscv, scoring='neg_mean_squared_error'))
R2 = cross_val_score(CV_lgb, X_train, y_train, cv=tscv, scoring='r2')
rmspe = cross_val_score(CV_lgb, X_train, y_train, cv=tscv, scoring=rmspe_score)

print(f"RMSE: {rmse.mean()} (+/- {rmse.std()})")
print(f"\nR2: {R2.mean()} (+/- {R2.std()})")
print(f"RMSPE: {rmspe.mean()} (+/- {rmspe.std()})")

In [None]:
# cross_val_score only fits clones of the estimator, so CV_lgb itself is still
# unfitted at this point; fit it on the training data before reading the importances.
CV_lgb.fit(X_train, y_train)
CV_lgb.feature_importances_

In [None]:
# blocking time series one time test
# Baseline LightGBM with default hyper-parameters, scored on disjoint time blocks.
model =  LGBMRegressor(random_state=0)
btscv = BlockingTimeSeriesSplit(n_splits=5)
# The scorer returns negative MSE, so flip the sign before taking the root.
rmse = np.sqrt(-cross_val_score(model, X_train, y_train, cv=btscv, scoring='neg_mean_squared_error'))
R2 = cross_val_score(model, X_train, y_train, cv=btscv, scoring='r2')

print(f"RMSE: {rmse.mean()} (+/- {rmse.std()})")
print(f"\nR2: {R2.mean()} (+/- {R2.std()})")



In [None]:
# blocking time series one time test
# LightGBM with hand-set hyper-parameters, scored on disjoint time blocks.
CV_lgb2 =  LGBMRegressor(random_state=0, n_estimators = 200, learning_rate = 0.04, num_leaves = 120, min_data_in_leaf = 200)
btscv = BlockingTimeSeriesSplit(n_splits=5)
# The scorer returns negative MSE, so flip the sign before taking the root.
rmse = np.sqrt(-cross_val_score(CV_lgb2, X_train, y_train, cv=btscv, scoring='neg_mean_squared_error'))
R2 = cross_val_score(CV_lgb2, X_train, y_train, cv=btscv, scoring='r2')
rmspe = cross_val_score(CV_lgb2, X_train, y_train, cv=btscv, scoring=rmspe_score)

print(f"RMSE: {rmse.mean()} (+/- {rmse.std()})")
print(f"\nR2: {R2.mean()} (+/- {R2.std()})")
print(f"RMSPE: {rmspe.mean()} (+/- {rmspe.std()})")

In [None]:
# cross validation for time series split
# Exhaustive grid search over LightGBM hyper-parameters, validated with
# TimeSeriesSplit on the training data.
tscv = TimeSeriesSplit(n_splits=5)
model = LGBMRegressor(random_state=0, verbose=0)
param_grid = dict(
    n_estimators=[10, 100, 150, 200],
    max_depth=[-1, 20, 50, 100],
    learning_rate=[0.01, 0.05, 0.08, 0.1],
    num_leaves=[50, 100, 120, 150],
)

# No `scoring` is passed, so candidates are ranked by the estimator's default score.
CV_lgb = GridSearchCV(estimator=model, param_grid=param_grid, cv=tscv)
CV_lgb.fit(X_train, y_train)

best_params0 = CV_lgb.best_params_
best_score0 = round(CV_lgb.best_score_, 4)




In [None]:
# Best hyper-parameter combination found by the TimeSeriesSplit grid search.
best_params0

In [None]:
# Rebind CV_lgb from the grid-search object to a plain regressor with fixed
# hyper-parameters. NOTE(review): presumably these mirror best_params0 -- confirm.
CV_lgb = LGBMRegressor(
    random_state=0,
    n_estimators=200,
    learning_rate=0.05,
    num_leaves=100,
)

In [None]:
# accuracy on test set
# "Accuracy" is defined here as 100 minus the mean absolute percentage error
# of the predictions on the held-out rows.
CV_lgb.fit(X_train, y_train)
pred = CV_lgb.predict(X_test)
errors = np.abs(pred - y_test)
mape = errors / y_test * 100
accuracy = 100 - mape.mean()
print('Accuracy:', round(accuracy, 2), '%.')

In [None]:
# performance on test set
# Score the model that was fitted on the training split (see the accuracy cell
# before this one) directly on the held-out rows. Calling cross_val_score on
# X_test / y_test would instead refit fresh clones on the test data itself and
# never evaluate the trained model.
pred = CV_lgb.predict(X_test)
rmse = np.sqrt(np.mean(np.square(y_test - pred)))
R2 = CV_lgb.score(X_test, y_test)  # a regressor's .score() is the R^2 coefficient
rmspe = rmspe_score(CV_lgb, X_test, y_test)
print(f"RMSE: {rmse}")
print(f"\nR2: {R2}")
print(f"RMSPE: {rmspe}")

In [None]:
# cross validation for blocking time series split
# Exhaustive grid search over LightGBM hyper-parameters, validated on disjoint
# time blocks of the training data.
model=LGBMRegressor(random_state=0,verbose= 0)
# Define the splitter in this cell so it does not depend on an earlier cell's state.
btscv = BlockingTimeSeriesSplit(n_splits=5)

param_grid = {
    'n_estimators': [50,100,150,200],
    'max_depth':[-1,20,50,100],
    'learning_rate': [0.02,0.04,0.08,0.1],
    'num_leaves': [50,100,120,150],
    'min_data_in_leaf': [100,200,500]  # was written with '=', which is a syntax error in a dict literal
}

# No `scoring` is passed, so candidates are ranked by the estimator's default score.
CV_lgb = GridSearchCV(estimator=model, param_grid=param_grid, cv= btscv)

CV_lgb.fit(X_train, y_train)
best_params = CV_lgb.best_params_
best_score = round(CV_lgb.best_score_,4)


In [None]:
# Best hyper-parameter combination found by the blocking-split grid search.
best_params

In [None]:
# Regressor with fixed hyper-parameters for the blocking-split experiment.
# NOTE(review): presumably these mirror best_params -- confirm.
CV_lgb2 = LGBMRegressor(
    random_state=0,
    n_estimators=200,
    learning_rate=0.04,
    num_leaves=120,
    min_data_in_leaf=200,
)

In [None]:
# "Accuracy" is defined here as 100 minus the mean absolute percentage error
# of the predictions on the held-out rows.
CV_lgb2.fit(X_train, y_train)
pred = CV_lgb2.predict(X_test)
errors = np.abs(pred - y_test)
mape = errors / y_test * 100
accuracy = 100 - mape.mean()
print('Accuracy:', round(accuracy, 2), '%.')

In [None]:
# performance on test set
# Score the model that was fitted on the training split (see the accuracy cell
# before this one) directly on the held-out rows. Calling cross_val_score on
# X_test / y_test would instead refit fresh clones on the test data itself and
# never evaluate the trained model.
pred = CV_lgb2.predict(X_test)
rmse = np.sqrt(np.mean(np.square(y_test - pred)))
R2 = CV_lgb2.score(X_test, y_test)  # a regressor's .score() is the R^2 coefficient
rmspe = rmspe_score(CV_lgb2, X_test, y_test)
print(f"RMSE: {rmse}")
print(f"\nR2: {R2}")
print(f"RMSPE: {rmspe}")