# Import

In [None]:
import numpy as np
import pandas as pd
from math import sqrt
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from keras.layers import BatchNormalization

%matplotlib inline
#%tensorflow_version 1.x
import tensorflow as tf
print(tf.__version__)


## load dataset

In [None]:
# Read the semicolon-delimited CSV: the first row supplies the column names
# and the first column becomes the DataFrame index.
dataset = pd.read_csv(
    'eMalahleniIMSO2.csv',
    sep=';',
    header=0,
    index_col=0,
)
# Keep the table contents as a NumPy array for plotting and scaling.
values = dataset.values

## Plot pm2.5

In [None]:
# Line plot of the first column of `values`, labelled with the first column
# name of the loaded DataFrame.
# NOTE(review): `values` is reassigned further down the notebook, so this cell
# only shows the raw data when run before those cells — confirm run order.
plt.plot(values[:,0])
plt.ylabel(dataset.columns[0])
plt.show()

## Data preparation

We need a way to prepare the data so that it matches however we choose to formulate the problem.

In this case we formulate it so that the model takes one time step of input (14 variables) and produces one time step of output (1 variable). In other words, we are trying to answer the following question: given the pollution and weather conditions of the previous hour, can we predict the PM2.5 level for the next hour?

The single variable we are outputting is the PM2.5 level. Note that we also use the PM2.5 level in our input.

Credit for this code: https://machinelearningmastery.com/convert-time-series-supervised-learning-problem-python/

In [None]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a time series as a supervised-learning table of shifted columns.

    Args:
        data: Observations ordered in time. Anything ``pd.DataFrame`` accepts:
            a flat list, a list of rows, a 1-D or 2-D NumPy array, a Series
            or a DataFrame.
        n_in: Number of lag steps (t-n_in ... t-1) emitted as input columns.
        n_out: Number of forecast steps (t ... t+n_out-1) emitted as output
            columns.
        dropnan: When true, rows containing NaN are removed. This covers the
            rows made incomplete by shifting as well as any row that already
            held NaN.

    Returns:
        A DataFrame whose columns are named ``var<j>(t-<i>)``, ``var<j>(t)``
        and ``var<j>(t+<i>)``, with ``j`` counting variables from 1.
    """
    df = pd.DataFrame(data)
    # Count variables on the constructed frame rather than on the raw input,
    # so lists of rows, 1-D arrays and Series are all handled consistently.
    n_vars = df.shape[1]
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for lag in range(n_in, 0, -1):
        cols.append(df.shift(lag))
        names.extend('var%d(t-%d)' % (j + 1, lag) for j in range(n_vars))
    # forecast sequence (t, t+1, ... t+n)
    for step in range(n_out):
        cols.append(df.shift(-step))
        if step == 0:
            names.extend('var%d(t)' % (j + 1) for j in range(n_vars))
        else:
            names.extend('var%d(t+%d)' % (j + 1, step) for j in range(n_vars))
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg

## Get column names

In [None]:
# Display the column labels of the loaded DataFrame.
dataset.columns

## Actually perform the data preparation

We scale the values between 0 and 1.

The code that reframes the data will, in this case, produce 14 output variables (one per input variable). We only want to predict PM2.5, which is why we drop the other columns from the dataframe.

Credit for this code: https://machinelearningmastery.com/multivariate-time-series-forecasting-lstms-keras/

In [None]:
# ensure all data is float
# ensure all data is float
# Start from `dataset` rather than the mutable `values` global, so re-running
# this cell does not scale the already reframed array a second time.
values = dataset.values.astype('float32')

# normalize features
# NOTE(review): the scaler is fitted on the full dataset before the
# train/validation/test split, so test-set ranges influence the scaling.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(values)

# frame as supervised learning
reframed = series_to_supervised(scaled, 1, 1)

# drop columns we don't want to predict
# We drop these because we are only interested in predicting for a single variable (pollution).
# If we don't drop, then we will be predicting for all the variables too!
# The first `n_features` columns are the lagged inputs and the next one is the
# first variable at time t (the target); everything after that is dropped.
# With 14 features this removes columns 15..27.
n_features = scaled.shape[1]
reframed.drop(reframed.columns[n_features + 1:], axis=1, inplace=True)
values = reframed.values

## View the data

In [None]:
# Show the first rows of the reframed table (lagged inputs plus the target column).
reframed.head()

## Create X and Y variables

In [None]:
# (rows, columns) of `values`; when the cells are run in order this is the
# array taken from `reframed`.
values.shape

In [None]:
# Inputs: every column except the last one.
X = values[:,:-1]

In [None]:
# Target: the last column only (1-D).
Y = values[:,-1]

## Check the shapes

In [None]:
# Shape of the input array before reshaping.
X.shape

In [None]:
# Shape of the target array.
Y.shape

## Reshaping

The format that Keras expects is [batches, timesteps, features]

In [None]:
# Insert a middle axis of length 1: (rows, columns) -> (rows, 1, columns),
# i.e. one time step per sample.
X = X.reshape(X.shape[0],1,X.shape[1])

In [None]:
# Shape of the input array after reshaping.
X.shape

## Training, validation and testing split

In [None]:
# Chronological split: shuffle=False keeps the rows in their original order,
# so the last 20% of samples form the test set and the last 20% of the
# remainder form the validation set. A shuffled split would interleave
# neighbouring hours across train and test, and the later "first N samples"
# metrics and the time-axis plot of Y_test would no longer follow time.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, shuffle=False)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.20, shuffle=False)

## Check the shape

In [None]:
# Report the shape of each split; `end='\n\n'` leaves one blank line between
# the train, validation and test groups.
print('X_train:', X_train.shape)
print('Y_train:', Y_train.shape, end='\n\n')
print('X_val:', X_val.shape)
print('Y_val:', Y_val.shape, end='\n\n')
print('X_test:', X_test.shape)
print('Y_test:', Y_test.shape)

## Define a model

In [None]:
# LSTM encoder followed by four Dense(56, relu) + BatchNormalization stages
# and a single-unit output.
model = Sequential()
# (timesteps, features) is read from the training data instead of being
# hard-coded as (1, 14), so the model follows the prepared inputs.
model.add(LSTM(56, input_shape=X_train.shape[1:]))
model.add(Dense(56, activation="relu"))
model.add(BatchNormalization())
model.add(Dense(56, activation="relu"))
model.add(BatchNormalization())
model.add(Dense(56, activation="relu"))
model.add(BatchNormalization())
model.add(Dense(56, activation="relu"))
model.add(BatchNormalization())
# Sigmoid keeps the output in (0, 1), matching the MinMax-scaled target.
model.add(Dense(1, activation = 'sigmoid'))
model.compile(loss='mse', optimizer='adam')

## Print summary

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

## Training

In [None]:
# Train for 60 epochs in mini-batches of 32, evaluating on the validation
# split after every epoch; the returned object holds the per-epoch losses.
history = model.fit(
    X_train,
    Y_train,
    epochs=60,
    batch_size=32,
    validation_data=(X_val, Y_val),
    verbose=1,
)

## Plot the performance

In [None]:
# Training vs. validation loss per epoch, as recorded by model.fit.
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.ylabel('LOSS', fontname="Times New Roman", size=20,fontweight="bold")
plt.xlabel('Epochs', fontname="Times New Roman", size=20,fontweight="bold")
plt.title('Loss over epochs', fontname="Times New Roman", size=28,fontweight="bold")
# Pass the bold font properties to the legend; they were previously defined
# but never used here, unlike in the observed-vs-predicted figure.
legend_properties = {'weight':'bold'}
plt.legend(prop=legend_properties)
plt.show()

## Predict

In [None]:
# Run the trained model on the held-out test inputs.
# NOTE(review): with a single-unit output layer the result is presumably 2-D
# (n, 1) while Y_test is 1-D; flatten before element-wise comparisons — confirm.
prediction = model.predict(X_test)

In [None]:
# Mean absolute error over the whole test split. Y_test comes from the
# MinMax-scaled data, so the error is in scaled units.
mean_absolute_error(Y_test, prediction)

In [None]:
# Mean squared error over the whole test split (scaled units).
mean_squared_error(Y_test, prediction)

In [None]:
# Root mean squared error over the whole test split (scaled units).
rmse = sqrt(mean_squared_error(Y_test, prediction))
print(rmse)

In [None]:
# Coefficient of determination (R^2) over the whole test split.
r2_score(Y_test, prediction)

In [None]:
# Count test samples whose prediction equals the observation exactly.
# The prediction is flattened first: comparing an (n, 1) array with the 1-D
# Y_test would broadcast to an (n, n) matrix and count unrelated pairs.
# NOTE(review): exact equality is rarely met for a continuous target, so this
# count is expected to be near zero; the error metrics above are more telling.
print('accuracy testing = {}'.format(np.sum(np.ravel(prediction) == Y_test)))

## Compare prediction and testing data

In [None]:
# Overlay the first 480 observed and predicted test values.
legend_properties = {'weight': 'bold'}
plt.plot(Y_test[:480], label='Observed', color='blue')
plt.plot(prediction[:480], label='Predicted', color='red')
plt.title(
    'eMalahleni BNLSTM',
    fontname="Times New Roman",
    size=28,
    fontweight="bold",
)
plt.xlabel(
    'Time(Hrs)',
    fontname="Times New Roman",
    size=20,
    fontweight="bold",
)
plt.ylabel(
    'PM10',
    fontname="Times New Roman",
    size=20,
    fontweight="bold",
)
plt.legend(prop=legend_properties)
plt.show()

In [None]:
# MAE over test samples 1..5 (the slice 1:6 skips index 0).
# NOTE(review): if a 6-sample window was intended this should start at 0 — confirm.
mean_absolute_error(Y_test[1:6], prediction[1:6])

In [None]:
# MAE over test samples 1..11 (the slice 1:12 skips index 0).
# NOTE(review): if a 12-sample window was intended this should start at 0 — confirm.
mean_absolute_error(Y_test[1:12], prediction[1:12])

In [None]:
# MAE over test samples 1..17 (the slice 1:18 skips index 0).
# NOTE(review): if an 18-sample window was intended this should start at 0 — confirm.
mean_absolute_error(Y_test[1:18], prediction[1:18])

In [None]:
# MAE over test samples 1..23 (the slice 1:24 skips index 0).
# NOTE(review): if a 24-sample window was intended this should start at 0 — confirm.
mean_absolute_error(Y_test[1:24], prediction[1:24])

In [None]:
# MAE over test samples 1..35 (the slice 1:36 skips index 0).
# NOTE(review): if a 36-sample window was intended this should start at 0 — confirm.
mean_absolute_error(Y_test[1:36], prediction[1:36])

In [None]:
# MAE over test samples 1..47 (the slice 1:48 skips index 0).
# NOTE(review): if a 48-sample window was intended this should start at 0 — confirm.
mean_absolute_error(Y_test[1:48], prediction[1:48])

In [None]:
# RMSE over test samples 1..5 (the slice 1:6 skips index 0).
# NOTE(review): if a 6-sample window was intended this should start at 0 — confirm.
rmse = sqrt(mean_squared_error(Y_test[1:6], prediction[1:6]))
print(rmse)

In [None]:
# RMSE over test samples 1..11 (the slice 1:12 skips index 0).
# NOTE(review): if a 12-sample window was intended this should start at 0 — confirm.
rmse = sqrt(mean_squared_error(Y_test[1:12], prediction[1:12]))
print(rmse)

In [None]:
# RMSE over test samples 1..17 (the slice 1:18 skips index 0).
# NOTE(review): if an 18-sample window was intended this should start at 0 — confirm.
rmse = sqrt(mean_squared_error(Y_test[1:18], prediction[1:18]))
print(rmse)

In [None]:
# RMSE over test samples 1..23 (the slice 1:24 skips index 0).
# NOTE(review): if a 24-sample window was intended this should start at 0 — confirm.
rmse = sqrt(mean_squared_error(Y_test[1:24], prediction[1:24]))
print(rmse)

In [None]:
# RMSE over test samples 1..35 (the slice 1:36 skips index 0).
# NOTE(review): if a 36-sample window was intended this should start at 0 — confirm.
rmse = sqrt(mean_squared_error(Y_test[1:36], prediction[1:36]))
print(rmse)

In [None]:
# RMSE over test samples 1..47 (the slice 1:48 skips index 0).
# NOTE(review): if a 48-sample window was intended this should start at 0 — confirm.
rmse = sqrt(mean_squared_error(Y_test[1:48], prediction[1:48]))
print(rmse)