In [24]:
from math import sqrt
from numpy import mean
from numpy import median
from multiprocessing import cpu_count
from joblib import Parallel
from joblib import delayed
from warnings import catch_warnings
from warnings import filterwarnings
from sklearn.metrics import mean_squared_error
from pandas import read_csv

In [25]:
# one-step simple forecast
def simple_forecast(history, config):
    """Make a one-step naive forecast from the most recent observations.

    Parameters
    ----------
    history : sequence
        Observations seen so far; indexed from the end, so the last item
        is the most recent one.
    config : sequence of (n, offset, avg_type)
        ``n`` is the look-back length, ``offset`` the stride between the
        observations that are collected (1 means the last ``n`` contiguous
        values), and ``avg_type`` selects the forecast: ``'persist'``
        returns a single past value, ``'mean'`` averages the collected
        values, and any other value falls through to the median.

    Returns
    -------
    ``history[-n]`` for ``'persist'``; otherwise the numpy ``mean`` or
    ``median`` of the collected values.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1, or (for averaging configs) ``offset``
        is smaller than 1.
    Exception
        If an averaging config needs more observations than ``history``
        holds, or if fewer than two values were collected.
    IndexError
        If ``'persist'`` is requested with ``n`` larger than ``len(history)``.
    """
    n, offset, avg_type = config
    # n == 0 would turn history[-n] into history[0] and history[-n:] into the
    # whole series, i.e. look at the oldest data instead of the newest
    if n < 1:
        raise ValueError('Look-back n must be at least 1: %d' % n)
    # persist value, ignore other config
    if avg_type == 'persist':
        return history[-n]
    # a zero or negative stride would index from the front of history
    if offset < 1:
        raise ValueError('Offset must be at least 1: %d' % offset)
    # skip bad configs; checked for every offset so that a contiguous window
    # longer than the history is rejected instead of silently shortened
    if n * offset > len(history):
        raise Exception('Config beyond end of data: %d %d' % (n,offset))
    # collect values to average
    if offset == 1:
        values = history[-n:]
    else:
        # walk backwards through history in steps of `offset`
        values = [history[-(i * offset)] for i in range(1, n + 1)]
    # check if we can average
    if len(values) < 2:
        raise Exception('Cannot calculate average')
    # mean of last n values
    if avg_type == 'mean':
        return mean(values)
    # median of last n values
    return median(values)

In [26]:
# root mean squared error or rmse
def measure_rmse(actual, predicted):
    """Return the square root of the mean squared error.

    ``actual`` and ``predicted`` are passed, in that order, straight to
    ``mean_squared_error``; the result is its square root.
    """
    mse = mean_squared_error(actual, predicted)
    return sqrt(mse)

In [27]:
# split a univariate dataset into train/test sets
def train_test_split(data, n_test):
    """Split ``data`` into a leading train part and a trailing test part.

    Parameters
    ----------
    data : sliceable sequence supporting ``len``
        Observations in time order.
    n_test : int
        Number of trailing observations to place in the test part.

    Returns
    -------
    tuple
        ``(train, test)`` where ``test`` holds the last ``n_test`` items and
        ``train`` everything before them. With ``n_test == 0`` the test part
        is empty; with ``n_test`` larger than ``len(data)`` the train part is
        empty.

    Raises
    ------
    ValueError
        If ``n_test`` is negative.
    """
    if n_test < 0:
        raise ValueError('n_test must be non-negative: %d' % n_test)
    # slice on an explicit index: with n_test == 0 the negative-index form
    # data[:-0] is empty and data[-0:] is everything, which swaps the parts
    split = max(len(data) - n_test, 0)
    return data[:split], data[split:]

In [28]:
# walk-forward validation for univariate data
def walk_forward_validation(data, n_test, cfg):
    """Evaluate one config by walk-forward validation and return its error.

    The data is split with ``train_test_split``; for each position in the
    test part a forecast is made by ``simple_forecast`` from everything seen
    so far, after which the true observation is appended to what has been
    seen. The forecasts are then compared with the test part by
    ``measure_rmse`` and that value is returned.
    """
    train, test = train_test_split(data, n_test)
    # start from a copy of the training part so it can grow step by step
    seen = list(train)
    forecasts = []
    for step in range(len(test)):
        # forecast first, then reveal the actual value for the next step
        forecasts.append(simple_forecast(seen, cfg))
        seen.append(test[step])
    # estimate prediction error over the whole test part
    return measure_rmse(test, forecasts)

In [29]:
# score a model, return None on failure
def score_model(data, n_test, cfg, debug=False):
    """Score a single config and return ``(key, result)``.

    Parameters
    ----------
    data, n_test, cfg
        Passed unchanged to ``walk_forward_validation``.
    debug : bool, default False
        When true, warnings are shown and any exception raised during
        validation propagates to the caller. When false, warnings are
        suppressed and an exception makes the result ``None``.

    Returns
    -------
    tuple
        ``(key, result)`` where ``key`` is ``str(cfg)`` and ``result`` is the
        value returned by ``walk_forward_validation``, or ``None`` if
        validation failed in non-debug mode.

    Side effects
    ------------
    Prints one line with the key and the score when a result was obtained.
    """
    result = None
    # convert config to a key
    key = str(cfg)
    # show all warnings and fail on exception if debugging
    if debug:
        result = walk_forward_validation(data, n_test, cfg)
    else:
        # one failure during model validation suggests an unstable config
        try:
            # never show warnings when grid searching, too noisy
            with catch_warnings():
                filterwarnings("ignore")
                result = walk_forward_validation(data, n_test, cfg)
        except Exception:
            # catch Exception rather than everything, so KeyboardInterrupt and
            # SystemExit can still stop a long-running search
            result = None
    # check for an interesting result
    if result is not None:
        print(' > Model[%s] %.3f' % (key, result))
    return (key, result)

In [30]:
# grid search configs
def grid_search(data, cfg_list, n_test, parallel=True):
    """Score every config and return the successful ones, best first.

    Parameters
    ----------
    data, n_test
        Passed unchanged to ``score_model`` for every config.
    cfg_list : iterable
        Configs to evaluate, one ``score_model`` call each, in order.
    parallel : bool, default True
        Accepted for interface compatibility; this implementation does not
        read it and always evaluates the configs sequentially.

    Returns
    -------
    list of tuple
        ``(key, score)`` pairs as returned by ``score_model``, with pairs
        whose score is ``None`` removed, sorted by ascending score.
    """
    scores = [score_model(data, n_test, cfg) for cfg in cfg_list]
    # remove empty results; identity test so that only a real None is dropped
    scores = [r for r in scores if r[1] is not None]
    # sort configs by error, asc
    scores.sort(key=lambda tup: tup[1])
    return scores

In [17]:
# create a set of simple configs to try
def simple_configs(max_length, offsets=[1]):
    """Build every ``[length, offset, avg_type]`` combination to evaluate.

    Lengths run from 1 to ``max_length`` inclusive; for each length every
    entry of ``offsets`` is paired with each of ``'persist'``, ``'mean'``
    and ``'median'``, in that order. The result is a list of 3-item lists.
    """
    avg_types = ['persist', 'mean', 'median']
    return [
        [length, offset, avg_type]
        for length in range(1, max_length + 1)
        for offset in offsets
        for avg_type in avg_types
    ]

In [19]:
# define dataset
# the first row of the CSV is used as the header and the first column as the index
series = read_csv('daily-total-female-births.csv', header=0, index_col=0)
# keep only the underlying values for the grid search
# NOTE(review): read_csv returns a DataFrame, so this is presumably a 2-D
# (n_rows, n_columns) array rather than a flat series; confirm
data = series.values

In [20]:
# data split
# number of trailing observations held out as the test set
n_test = 165

# model configs
# the longest look-back tried equals the number of observations left for training
max_length = len(data) - n_test
cfg_list = simple_configs(max_length)

In [33]:
# display the generated configurations (one [length, offset, avg_type] list each)
cfg_list

[[1, 1, 'persist'],
 [1, 1, 'mean'],
 [1, 1, 'median'],
 [2, 1, 'persist'],
 [2, 1, 'mean'],
 [2, 1, 'median'],
 [3, 1, 'persist'],
 [3, 1, 'mean'],
 [3, 1, 'median'],
 [4, 1, 'persist'],
 [4, 1, 'mean'],
 [4, 1, 'median'],
 [5, 1, 'persist'],
 [5, 1, 'mean'],
 [5, 1, 'median'],
 [6, 1, 'persist'],
 [6, 1, 'mean'],
 [6, 1, 'median'],
 [7, 1, 'persist'],
 [7, 1, 'mean'],
 [7, 1, 'median'],
 [8, 1, 'persist'],
 [8, 1, 'mean'],
 [8, 1, 'median'],
 [9, 1, 'persist'],
 [9, 1, 'mean'],
 [9, 1, 'median'],
 [10, 1, 'persist'],
 [10, 1, 'mean'],
 [10, 1, 'median'],
 [11, 1, 'persist'],
 [11, 1, 'mean'],
 [11, 1, 'median'],
 [12, 1, 'persist'],
 [12, 1, 'mean'],
 [12, 1, 'median'],
 [13, 1, 'persist'],
 [13, 1, 'mean'],
 [13, 1, 'median'],
 [14, 1, 'persist'],
 [14, 1, 'mean'],
 [14, 1, 'median'],
 [15, 1, 'persist'],
 [15, 1, 'mean'],
 [15, 1, 'median'],
 [16, 1, 'persist'],
 [16, 1, 'mean'],
 [16, 1, 'median'],
 [17, 1, 'persist'],
 [17, 1, 'mean'],
 [17, 1, 'median'],
 [18, 1, 'persist'],
 [1

In [22]:
# grid search
# score every config; the result holds (key, error) pairs sorted by ascending error
scores = grid_search(data, cfg_list, n_test)
print('done')

 > Model[[1, 1, 'persist']] 8.722
 > Model[[2, 1, 'persist']] 9.284
 > Model[[2, 1, 'mean']] 7.884
 > Model[[2, 1, 'median']] 7.884
 > Model[[3, 1, 'persist']] 9.320
 > Model[[3, 1, 'mean']] 7.518
 > Model[[3, 1, 'median']] 7.324
 > Model[[4, 1, 'persist']] 9.249
 > Model[[4, 1, 'mean']] 7.290
 > Model[[4, 1, 'median']] 7.217
 > Model[[5, 1, 'persist']] 9.415
 > Model[[5, 1, 'mean']] 7.204
 > Model[[5, 1, 'median']] 7.337
 > Model[[6, 1, 'persist']] 9.737
 > Model[[6, 1, 'mean']] 7.222
 > Model[[6, 1, 'median']] 7.230
 > Model[[7, 1, 'persist']] 8.791
 > Model[[7, 1, 'mean']] 7.042
 > Model[[7, 1, 'median']] 7.340
 > Model[[8, 1, 'persist']] 10.172
 > Model[[8, 1, 'mean']] 7.140
 > Model[[8, 1, 'median']] 7.331
 > Model[[9, 1, 'persist']] 10.175
 > Model[[9, 1, 'mean']] 7.200
 > Model[[9, 1, 'median']] 7.302
 > Model[[10, 1, 'persist']] 9.956
 > Model[[10, 1, 'mean']] 7.209
 > Model[[10, 1, 'median']] 7.349
 > Model[[11, 1, 'persist']] 9.873
 > Model[[11, 1, 'mean']] 7.201
 > Model[[11

In [23]:
# list top 3 configs
# scores is sorted by ascending error, so the first three entries are the best;
# cfg here is the string key built from the config, not the config list itself
for cfg, error in scores[:3]:
    print(cfg, error)

[22, 1, 'mean'] 6.930411499775709
[23, 1, 'mean'] 6.932293117115201
[21, 1, 'mean'] 6.951918385845375
