In [1]:
import pandas as pd
import numpy as np
from numpy import split, mean
from numpy import concatenate
from math import sqrt
from matplotlib import pyplot as plt
import tensorflow as tf
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [2]:
# Load the dataset from the CSV file (path is relative to the notebook's working directory)
dataset = pd.read_csv('Final_data.csv')

The data starts on 01/03/2013 and ends on 28/02/2017. We will use the first three years of data for training predictive models and the final year for evaluating them. Hence, the training period is 01/03/2013 - 29/02/2016, which equals 1096 days, i.e. 26304 hr. The evaluation period is 01/03/2016 - 28/02/2017, which equals 365 days, i.e. 8760 hr; in the code below it is further divided into a validation set (the first 219 days, 5256 hr) and a test set (the last 146 days, 3504 hr).

In [3]:
# Split the dataset into train/validation/test sets

def Split_dataset(data, n_val=5256, n_test=3504):
    """Split a time-ordered dataset into contiguous train/validation/test parts.

    The last ``n_test`` rows form the test set, the ``n_val`` rows directly
    before them form the validation set, and every earlier row goes to the
    training set. Rows are never shuffled, so their original order is kept.

    Parameters
    ----------
    data : object supporting ``len()`` and row slicing (e.g. ``numpy.ndarray``)
        The full dataset, assumed to be ordered from oldest to newest row.
    n_val : int, default 5256
        Number of validation rows (219 days x 24 hr).
    n_test : int, default 3504
        Number of test rows (146 days x 24 hr).

    Returns
    -------
    tuple
        ``(train, val, test)`` slices of ``data``. With the defaults and
        35064 hourly rows, ``train`` holds 26304 rows (1096 days x 24 hr).

    Raises
    ------
    ValueError
        If ``n_val`` or ``n_test`` is negative.
    """
    if n_val < 0 or n_test < 0:
        raise ValueError("n_val and n_test must be non-negative")

    n_rows = len(data)

    # Work with explicit start indices instead of negative slices:
    # data[-0:] would return the whole dataset rather than an empty tail.
    # Clamping at 0 keeps the behaviour for inputs shorter than the
    # requested partitions (the earliest partitions simply come back empty).
    test_start = max(n_rows - n_test, 0)
    val_start = max(test_start - n_val, 0)

    train = data[:val_start]
    val = data[val_start:test_start]
    test = data[test_start:]

    return train, val, test

In [4]:
# Split the raw value array into train, validation and test partitions
# (.values passes the underlying NumPy array rather than the DataFrame)
train, val, test = Split_dataset(dataset.values)

In [5]:
# Show (rows, columns) of each partition, one per line
print(train.shape, val.shape, test.shape, sep='\n')

(26304, 22)
(5256, 22)
(3504, 22)


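We want to convert the data into time-series sequences using a fixed-size (24 hr) moving window: each window of 24 consecutive hours is the input, and the hour that immediately follows it is the value to predict. The window then moves forward one time step at a time.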

Input Output                                                                                                                   
[h01, h02, h03, h04, . . . . , h21, h22, h23, h24], [h25]                                                                       
[h02, h03, h04, h05, . . . . ,h22, h23, h24, h25], [h26]

In [6]:
# Convert series to supervised inputs and outputs

def to_supervised(data, n_input=24, n_out=1, target_col=0):
    """Turn a 2-D series into overlapping (input window, target) samples.

    A window of ``n_input`` consecutive rows (all columns) is paired with the
    ``n_out`` values of column ``target_col`` that immediately follow it. The
    window then advances by one row, so consecutive samples overlap.

    Parameters
    ----------
    data : array-like of shape (n_rows, n_features)
        Rows in time order. Converted with ``np.asarray``.
    n_input : int, default 24
        Number of rows in each input window.
    n_out : int, default 1
        Number of future steps to use as the target.
    target_col : int, default 0
        Index of the column the targets are read from.

    Returns
    -------
    X : numpy.ndarray of shape (n_samples, n_input, n_features)
    y : numpy.ndarray of shape (n_samples, n_out)
        ``n_samples`` is ``n_rows - n_input - n_out + 1``, or 0 when the data
        is too short for a single window. The empty result keeps the window
        and feature dimensions so downstream shape checks still work.
    """
    data = np.asarray(data)

    # Number of start positions for which both window and target fit.
    n_samples = len(data) - n_input - n_out + 1

    if n_samples <= 0:
        X_empty = np.empty((0, n_input) + data.shape[1:], dtype=data.dtype)
        y_empty = np.empty((0, n_out), dtype=data.dtype)
        return X_empty, y_empty

    X = np.array([data[start:start + n_input, :] for start in range(n_samples)])
    y = np.array([
        data[start + n_input:start + n_input + n_out, target_col]
        for start in range(n_samples)
    ])

    return X, y

In [7]:
# Window each partition on its own, so no sample spans two partitions
(train_X, train_y), (val_X, val_y), (test_X, test_y) = (
    to_supervised(part, n_input=24) for part in (train, val, test)
)

In [8]:
# Show the windowed shapes: inputs then targets for each partition, one per line
print(
    train_X.shape, train_y.shape,
    val_X.shape, val_y.shape,
    test_X.shape, test_y.shape,
    sep='\n',
)

(26280, 24, 22)
(26280, 1)
(5232, 24, 22)
(5232, 1)
(3480, 24, 22)
(3480, 1)


Now let’s create baseline metrics; otherwise we may end up thinking our model works great when in fact it is doing worse than a basic model. The simplest approach is to assume that the predicted value is the last value of the input sequence. This is called naive forecasting, and it is sometimes surprisingly difficult to outperform.

In [9]:
# Naive (persistence) forecast: predict the next hour as the last value in each window.
# Slicing the time axis with -1: (rather than -1) keeps that axis, so the result
# is 2-D with one column, like test_y. Index 0 selects the first feature column,
# the same column to_supervised reads the targets from.
y_pred = test_X[ : ,-1:, 0]

y_pred

array([[42.],
       [43.],
       [47.],
       ...,
       [13.],
       [16.],
       [21.]])

In [10]:
# Display the actual next-hour targets for comparison with the naive forecast
test_y

array([[43.],
       [47.],
       [45.],
       ...,
       [16.],
       [21.],
       [19.]])

In [12]:
# Score the naive forecast against the true next-hour values
rmse = sqrt(mean_squared_error(test_y, y_pred))  # square root puts the error back in target units
mae = mean_absolute_error(test_y, y_pred)
R2 = r2_score(test_y, y_pred)

# Report all three baseline figures to three decimal places
print('Baseline Test RMSE: %.3f' % rmse)
print('Baseline Test MAE: %.3f' % mae)
print('Baseline Test R2: %.3f' % R2)

Baseline Test RMSE: 23.620
Baseline Test MAE: 12.443
Baseline Test R2: 0.950


Now we have the baseline metrics that any trained model should beat:

Baseline Test RMSE: 23.620                                                                                                     
Baseline Test MAE: 12.443                                                                                                       
Baseline Test R2: 0.950                                                                                                         
