In [34]:
# Dataframe manipulation
import pandas as pd
# Linear algebra
import numpy as np
# Data visualization with plotnine
#from plotnine import *
#import plotnine
# Autocorrelation
import matplotlib.pyplot as plt
import statsmodels.api as sm
# Table styling
import seaborn as sns
cm = sns.light_palette('green', as_cmap = True)
# Cross validation and data partitioning
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import KFold
# Grid-search
from sklearn.model_selection import GridSearchCV
# Data modelling
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
# Evaluation
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error

In [68]:
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA


In [85]:
# Load the supervised dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='data/20days_supervised.csv')

In [86]:
# Weighted sum of the three target columns (weights 1, 2 and 3).
# NOTE(review): presumably target_1..target_3 are mutually exclusive 0/1 flags — confirm.
_target_code = df['target_1'] + df['target_2'].mul(2) + df['target_3'].mul(3)
# A code of exactly 3 is folded back into class 0; every other code is kept unchanged.
df['target'] = np.where(_target_code == 3.0, 0.0, _target_code)

In [87]:

# Feature matrix: every column of df except the listed non-feature columns.
X = df[[col for col in df.columns if col not in
        ('datecol', 'target', 'trade_date', 'target_1', 'target_2', 'target_3')]]
# Label vector: the combined 'target' column.
y = df['target']


In [88]:

# Positional hold-out split: the last 100 rows become the test set,
# all rows before them the training set.
split_at = len(df) - 100
X_train, X_test = X.iloc[:split_at, :], X.iloc[split_at:, :]
y_train, y_test = y.iloc[:split_at], y.iloc[split_at:]


In [89]:
# Rescale every raw feature to [0, 1] before PCA.
# NOTE(review): the scalers and the PCA below are fit on ALL rows of X, including rows
# that are later used for validation/testing. That leaks information into any
# evaluation done on X_scale_pca — consider fitting them inside a Pipeline per CV fold.
scaler = MinMaxScaler(feature_range=(0, 1))
X_scale = scaler.fit_transform(X)

# PCA: a float n_components in (0, 1) keeps the number of components needed to
# explain that fraction of the variance (here 95%).
pca = PCA(n_components=0.95)

# Use a dedicated scaler for the PCA scores. Re-fitting `scaler` here (as was done
# before) overwrote its raw-feature statistics, so it could no longer be used to
# transform held-out raw data consistently with X_scale.
pca_scaler = MinMaxScaler(feature_range=(0, 1))
X_scale_pca = pca_scaler.fit_transform(pca.fit_transform(X_scale))

In [90]:
# (rows, retained components) of the scaled PCA feature matrix
X_scale_pca.shape

(1943, 376)

In [91]:
# Cross-validation splitter for non-anchored walk-forward optimization
class NonAnchoredTimeSeriesSplit():
    """Split samples into consecutive, non-overlapping walk-forward windows.

    The samples are cut into ``n_splits`` equally sized consecutive blocks
    (``len(X) // n_splits`` samples each; any remainder at the end is unused).
    Inside each block the leading ``train_fraction`` of the samples is the
    training set and the trailing part is the test set, optionally separated
    by a gap of ``margin`` samples. Training windows never grow or overlap.

    Parameters
    ----------
    n_splits : int
        Number of blocks (folds). Must be at least 1.
    train_fraction : float, default 0.8
        Fraction of each block used for training. Must be in (0, 1).
    margin : int, default 0
        Number of samples skipped between the training and the test part
        of each block.

    Raises
    ------
    ValueError
        If a constructor argument is out of range.
    """

    def __init__(self, n_splits, train_fraction = 0.8, margin = 0):
        if n_splits < 1:
            raise ValueError('n_splits must be at least 1, got {0}'.format(n_splits))
        if not 0 < train_fraction < 1:
            raise ValueError(
                'train_fraction must be in (0, 1), got {0}'.format(train_fraction))
        if margin < 0:
            raise ValueError('margin must be non-negative, got {0}'.format(margin))
        self.n_splits = n_splits
        self.train_fraction = train_fraction
        self.margin = margin

    def get_n_splits(self, X = None, y = None, groups = None):
        """Return the number of folds; all arguments are ignored.

        The arguments are optional so the method can be called without them.
        """
        return self.n_splits

    def split(self, X, y = None, groups = None):
        """Yield ``(train_indices, test_indices)`` for every block.

        Parameters
        ----------
        X : sized
            Only ``len(X)`` is used.
        y, groups : ignored
            Accepted but not used.

        Yields
        ------
        train, test : numpy.ndarray
            Integer position arrays for the training and test part of a block.

        Raises
        ------
        ValueError
            If the blocks are too small to hold at least one training and one
            test sample (raised when iteration starts, as this is a generator).
        """
        n_samples = len(X)
        k_fold_size = n_samples // self.n_splits
        train_size = int(self.train_fraction * k_fold_size)
        # Fail loudly instead of silently yielding empty train/test arrays.
        if train_size < 1 or k_fold_size - train_size - self.margin < 1:
            raise ValueError(
                'Cannot build {0} splits from {1} samples: each block of {2} '
                'samples must contain at least one training and one test '
                'sample.'.format(self.n_splits, n_samples, k_fold_size))
        indices = np.arange(n_samples)
        for i in range(self.n_splits):
            start = i * k_fold_size
            stop = start + k_fold_size
            mid = start + train_size
            yield indices[start: mid], indices[mid + self.margin: stop]



In [92]:
# Cross-validate XGBoost on the scaled PCA features using non-anchored
# walk-forward splits (50 consecutive blocks).
ntscv = NonAnchoredTimeSeriesSplit(n_splits = 50)
scores = cross_val_score(
    estimator = XGBClassifier(),
    # `X_pca` is not defined in any visible cell of this notebook (it would raise
    # NameError on a fresh kernel); use the PCA output produced by the PCA cell.
    X = X_scale_pca,
    y = y,
    cv = ntscv
)

# Result - non-anchored walk-forward optimization.
# No `scoring` is passed, so cross_val_score uses the classifier's default
# score, which is mean accuracy (higher is better) — not MSE.
print('Accuracy: {0:.3f} (+/- {1:.3f})'.format(
    scores.mean(),
    scores.std())
)

MSE: 0.395 (+/- 0.181)


In [93]:
# Display the per-fold scores held in `scores` (set by the most recently run cross_val_score cell).
scores

array([0.25 , 0.5  , 0.375, 0.375, 0.375, 0.25 , 0.375, 0.5  , 0.625,
       0.25 , 0.375, 0.875, 0.375, 0.25 , 0.5  , 0.125, 0.75 , 0.625,
       0.5  , 0.5  , 0.25 , 0.125, 0.75 , 0.25 , 0.375, 0.375, 0.125,
       0.375, 0.375, 0.375, 0.5  , 0.25 , 0.625, 0.25 , 0.5  , 0.75 ,
       0.625, 0.125, 0.25 , 0.125, 0.25 , 0.375, 0.125, 0.375, 0.5  ,
       0.625, 0.375, 0.25 , 0.375, 0.375])

In [96]:
# Cross-validate XGBoost on the scaled PCA features using scikit-learn's
# TimeSeriesSplit (50 splits).
tscv = TimeSeriesSplit(n_splits = 50)
scores = cross_val_score(
    estimator = XGBClassifier(),
    # `X_pca` is not defined in any visible cell of this notebook (it would raise
    # NameError on a fresh kernel); use the PCA output produced by the PCA cell.
    X = X_scale_pca,
    y = y,
    cv = tscv
)

# Result - anchored (expanding-window) walk-forward validation: with
# TimeSeriesSplit each training set is a superset of the previous one.
# No `scoring` is passed, so cross_val_score uses the classifier's default
# score, which is mean accuracy (higher is better) — not MSE.
print('Accuracy: {0:.3f} (+/- {1:.3f})'.format(
    scores.mean(),
    scores.std())
)

MSE: 0.394 (+/- 0.075)


In [97]:
# Display the per-fold scores held in `scores` (set by the most recently run cross_val_score cell).
scores

array([0.34210526, 0.42105263, 0.28947368, 0.34210526, 0.23684211,
       0.42105263, 0.5       , 0.39473684, 0.34210526, 0.34210526,
       0.42105263, 0.31578947, 0.23684211, 0.44736842, 0.47368421,
       0.47368421, 0.31578947, 0.47368421, 0.36842105, 0.39473684,
       0.31578947, 0.39473684, 0.28947368, 0.34210526, 0.52631579,
       0.42105263, 0.36842105, 0.42105263, 0.47368421, 0.44736842,
       0.47368421, 0.23684211, 0.42105263, 0.36842105, 0.57894737,
       0.47368421, 0.39473684, 0.44736842, 0.52631579, 0.34210526,
       0.36842105, 0.36842105, 0.44736842, 0.34210526, 0.36842105,
       0.39473684, 0.39473684, 0.47368421, 0.39473684, 0.31578947])

In [98]:
# Cross-validate XGBoost on the raw (unscaled, no PCA) features using
# scikit-learn's TimeSeriesSplit (50 splits).
tscv = TimeSeriesSplit(n_splits = 50)
scores = cross_val_score(
    estimator = XGBClassifier(),
    X = X,
    y = y,
    cv = tscv
)

# Result - anchored (expanding-window) walk-forward validation: with
# TimeSeriesSplit each training set is a superset of the previous one.
# No `scoring` is passed, so cross_val_score uses the classifier's default
# score, which is mean accuracy (higher is better) — not MSE.
print('Accuracy: {0:.3f} (+/- {1:.3f})'.format(
    scores.mean(),
    scores.std())
)

MSE: 0.443 (+/- 0.085)
