## Cross-Validation Strategies for Time Series Forecasting  

https://medium.com/sci-net/cross-validation-strategies-for-time-series-forecasting-9e6cfab91f60

In [1]:
import numpy as np
import pandas as pd

I have used a lag of 64 days for regressors and a target of 8 days for responses. That is given the past 64 days closing prices forecast the next 8 days. 

In [6]:
df = pd.read_csv('data/Gemini_ETHUSD_d.csv', skiprows=1)

STEPS= 60

for i in range(1, STEPS):
    
    col_name = 'd{}'.format(i)
    df[col_name] = df['d0'].shift(periods=-1 * i)

df = df.dropna()
df

KeyError: 'd0'

In [5]:
col_name = 'd{}'.format(1)
col_name

'd1'

In [None]:
X = df.iloc[:, :TRAIN_STEPS]
y = df.iloc[:, TRAIN_STEPS:]

X_train = X.iloc[:SPLIT_IDX, :]
y_train = y.iloc[:SPLIT_IDX, :]

X_test = X.iloc[SPLIT_IDX:, :]
y_test = y.iloc[SPLIT_IDX:, :]

In [None]:
def build_model(_alpha, _l1_ratio):
    
    estimator = ElasticNet(
        alpha=_alpha,
        l1_ratio=_l1_ratio,
        fit_intercept=True,
        normalize=False,
        precompute=False,
        max_iter=16,
        copy_X=True,
        tol=0.1,
        warm_start=False,
        positive=False,
        random_state=None,
        selection='random'
    )

    return MultiOutputRegressor(estimator, n_jobs=4)

In [None]:
model = build_model(_alpha=1.0, _l1_ratio=0.3)
kfcv = KFold(n_splits=5)
scores = cross_val_score(model, X_train, y_train, cv=kfcv, scoring=r2)
print("Loss: {0:.3f} (+/- {1:.3f})".format(scores.mean(), scores.std()))

In [None]:
model = build_model(_alpha=1.0, _l1_ratio=0.3)
tscv = TimeSeriesSplit(n_splits=5)
scores = cross_val_score(model, X_train, y_train, cv=tscv, scoring=r2)
print("Loss: {0:.3f} (+/- {1:.3f})".format(scores.mean(), scores.std()))

In [None]:
class BlockingTimeSeriesSplit():
    
    def __init__(self, n_splits):
        
        self.n_splits = n_splits
    
    def get_n_splits(self, X, y, groups):
        
        return self.n_splits
    
    def split(self, X, y=None, groups=None):
        
        n_samples = len(X)
        k_fold_size = n_samples // self.n_splits
        indices = np.arange(n_samples)

        margin = 0
        
        for i in range(self.n_splits):
            
            start = i * k_fold_size
            stop = start + k_fold_size
            mid = int(0.8 * (stop - start)) + start
            yield indices[start: mid], indices[mid + margin: stop]

In [None]:
model = build_model(_alpha=1.0, _l1_ratio=0.3)
btscv = BlockingTimeSeriesSplit(n_splits=5)
scores = cross_val_score(model, X_train, y_train, cv=btscv, scoring=r2)
print("Loss: {0:.3f} (+/- {1:.3f})".format(scores.mean(), scores.std()))

In [None]:
params = {
    'estimator__alpha':(0.1, 0.3, 0.5, 0.7, 0.9),
    'estimator__l1_ratio':(0.1, 0.3, 0.5, 0.7, 0.9)
}

for i in range(100):
    
    model = build_model(_alpha=1.0, _l1_ratio=0.3)

    finder = GridSearchCV(
        estimator=model,
        param_grid=params,
        scoring=r2,
        fit_params=None,
        n_jobs=None,
        iid=False,
        refit=False,
        cv=kfcv,  # change this to the splitter subject to test
        verbose=1,
        pre_dispatch=8,
        error_score=-999,
        return_train_score=True
    )

    finder.fit(X_train, y_train)

    best_params = finder.best_params_

In [None]:
# optimal model
model = build_model(_alpha=0.1, _l1_ratio=0.1)

# train model
model.fit(X_train, y_train)

# test score
y_predicted = model.predict(X_test)
score = r2_score(y_test, y_predicted, multioutput='uniform_average')

print("Test Loss: {0:.3f}".format(score))