In [1]:
# Bring in every library of the analysis stack first ...
import scipy
import numpy
import matplotlib
import pandas
import sklearn
import statsmodels

# ... then report the installed version of each, one line per library,
# so the environment used for the analysis is recorded in the output.
print('scipy: {}'.format(scipy.__version__))
print('numpy: {}'.format(numpy.__version__))
print('matplotlib: {}'.format(matplotlib.__version__))
print('pandas: {}'.format(pandas.__version__))
print('sklearn: {}'.format(sklearn.__version__))
print('statsmodels: {}'.format(statsmodels.__version__))

scipy: 1.1.0
numpy: 1.14.2
matplotlib: 2.1.2
pandas: 0.20.3
sklearn: 0.20.2
statsmodels: 0.8.0


In [9]:
import sys
import pandas as pd
from pandas import Series

import numpy as np
from numpy import log

from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.ar_model import AR
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.stattools import acf, pacf

from pandas.core import datetools

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 15, 6

from pandas.plotting import lag_plot


from sklearn.metrics import mean_squared_error
from math import sqrt

# Problem Description

The problem is to predict the number of monthly armed robberies in Boston, USA.

The dataset provides the number of monthly armed robberies in Boston from January 1966 to October 1975, or just under 10 years of data.

The values are counts, and there are 118 observations.

The dataset is credited to McCleary & Hay (1980).

In [4]:
# Path of the raw Boston robberies CSV read by the loading cell.
# This is a hard-coded absolute Windows path (raw string, so the
# backslashes are taken literally); adjust it to the local machine.
location = r'E:\MYLEARN\2-ANALYTICS-DataScience\datasets\boston-robberies.csv'

In [8]:
# Load the complete series from `location`; header=0 tells pandas that the
# first CSV row holds column names rather than data.
# NOTE(review): Series.from_csv is deprecated in newer pandas releases --
# confirm a replacement before upgrading from the pandas 0.20.x used here.
series = Series.from_csv(location, header=0)

# Hold back the final 12 observations as a validation set; everything
# before the split point is the dataset used for model development.
split_point = len(series) - 12
dataset, validation = series[0:split_point], series[split_point:]

print('Dataset %d, Validation %d' % (len(dataset), len(validation)))

# Persist both partitions as separate CSV files.
dataset.to_csv(r'E:\MYLEARN\2-ANALYTICS-DataScience\datasets\dataset.csv')
# Fix: the original literal was r'...\datasets\'validation.csv'. Inside a raw
# string, \' does not end the literal, so the file was written with a stray
# quote in its name ('validation.csv) instead of validation.csv.
validation.to_csv(r'E:\MYLEARN\2-ANALYTICS-DataScience\datasets\validation.csv')

Dataset 106, Validation 12


In [26]:
# Display the loaded series (date index -> monthly count); pandas elides the
# middle of long output with "...".
series

1966-01-01     41
1966-02-01     39
1966-03-01     50
1966-04-01     40
1966-05-01     43
1966-06-01     38
1966-07-01     44
1966-08-01     35
1966-09-01     39
1966-10-01     35
1966-11-01     29
1966-12-01     49
1967-01-01     50
1967-02-01     59
1967-03-01     63
1967-04-01     32
1967-05-01     39
1967-06-01     47
1967-07-01     53
1967-08-01     60
1967-09-01     57
1967-10-01     52
1967-11-01     70
1967-12-01     90
1968-01-01     74
1968-02-01     62
1968-03-01     55
1968-04-01     84
1968-05-01     94
1968-06-01     70
             ... 
1972-05-01    212
1972-06-01    246
1972-07-01    353
1972-08-01    339
1972-09-01    308
1972-10-01    247
1972-11-01    257
1972-12-01    322
1973-01-01    298
1973-02-01    273
1973-03-01    312
1973-04-01    249
1973-05-01    286
1973-06-01    279
1973-07-01    309
1973-08-01    401
1973-09-01    309
1973-10-01    328
1973-11-01    353
1973-12-01    354
1974-01-01    327
1974-02-01    324
1974-03-01    285
1974-04-01    243
1974-05-01

The specific contents of these files are:

dataset.csv: Observations from January 1966 to October 1974 (106 observations)

validation.csv: Observations from November 1974 to October 1975 (12 observations)

The validation dataset is about 10% of the original dataset (12 of 118 observations).

**Train/Test split**

First, we split the dataset into train and test sets directly. We always convert the loaded values to float32 in case the loaded data still contains string or integer types.

In [12]:
# Reload the development partition that was written to dataset.csv.
series = Series.from_csv(r'E:\MYLEARN\2-ANALYTICS-DataScience\datasets\dataset.csv')

# Pull out the raw observations and force a uniform float32 dtype.
X = series.values.astype('float32')

# The first half of the observations trains, the second half tests.
train_size = int(len(X) * 0.50)
train = X[:train_size]
test = X[train_size:]

# Show both shapes as the cell result.
train.shape, test.shape

((53,), (53,))

In [25]:
# Inspect the training half of the float32 observations.
train

array([ 41.,  39.,  50.,  40.,  43.,  38.,  44.,  35.,  39.,  35.,  29.,
        49.,  50.,  59.,  63.,  32.,  39.,  47.,  53.,  60.,  57.,  52.,
        70.,  90.,  74.,  62.,  55.,  84.,  94.,  70., 108., 139., 120.,
        97., 126., 149., 158., 124., 140., 109., 114.,  77., 120., 133.,
       110.,  92.,  97.,  78.,  99., 107., 112.,  90.,  98.],
      dtype=float32)

In [27]:
# Inspect the test half of the float32 observations.
test

array([125., 155., 190., 236., 189., 174., 178., 136., 161., 171., 149.,
       184., 155., 276., 224., 213., 279., 268., 287., 238., 213., 257.,
       293., 212., 246., 353., 339., 308., 247., 257., 322., 298., 273.,
       312., 249., 286., 279., 309., 401., 309., 328., 353., 354., 327.,
       324., 285., 243., 241., 287., 355., 460., 364., 487.],
      dtype=float32)

In [24]:
# Walk-forward validation of a persistence (naive) baseline.
# At every step the forecast is the most recently seen value; the true
# observation is then revealed and appended to the history, so the next
# forecast only ever uses data available up to that point.
history = list(train)
predictions = []

for i in range(len(test)):
    # predict: the forecast is simply the last known observation
    yhat = history[-1]
    predictions.append(yhat)

    # observation: reveal the actual value for this step
    obs = test[i]
    history.append(obs)

    # Fix: the original used '%3.f' for Expected (width 3, precision 0),
    # a transposition of '%.3f'. Both values now print with three decimals.
    print('>Predicted=%.3f, Expected=%.3f' % (yhat, obs))

# report performance: root mean squared error over all test steps
mse = mean_squared_error(test, predictions)
rmse = sqrt(mse)

print('RMSE: %.3f' % rmse)

>Predicted=98.000, Expected=125
>Predicted=125.000, Expected=155
>Predicted=155.000, Expected=190
>Predicted=190.000, Expected=236
>Predicted=236.000, Expected=189
>Predicted=189.000, Expected=174
>Predicted=174.000, Expected=178
>Predicted=178.000, Expected=136
>Predicted=136.000, Expected=161
>Predicted=161.000, Expected=171
>Predicted=171.000, Expected=149
>Predicted=149.000, Expected=184
>Predicted=184.000, Expected=155
>Predicted=155.000, Expected=276
>Predicted=276.000, Expected=224
>Predicted=224.000, Expected=213
>Predicted=213.000, Expected=279
>Predicted=279.000, Expected=268
>Predicted=268.000, Expected=287
>Predicted=287.000, Expected=238
>Predicted=238.000, Expected=213
>Predicted=213.000, Expected=257
>Predicted=257.000, Expected=293
>Predicted=293.000, Expected=212
>Predicted=212.000, Expected=246
>Predicted=246.000, Expected=353
>Predicted=353.000, Expected=339
>Predicted=339.000, Expected=308
>Predicted=308.000, Expected=247
>Predicted=247.000, Expected=257
>Predicted=

In [None]:
tr