In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mxnet 
from mxnet.gluon import *
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

In [2]:
finalData = pd.read_csv('../data/ReadyData.csv')


In [3]:
finalData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14592 entries, 0 to 14591
Data columns (total 14 columns):
Unnamed: 0          14592 non-null object
PM2.5               14592 non-null float64
USAQI               14592 non-null float64
CO2                 14592 non-null float64
Temperature         14592 non-null float64
RelativeHumidity    14592 non-null float64
weekday             14592 non-null float64
hour                14592 non-null float64
month               14592 non-null float64
year                14592 non-null float64
dayofmonth          14592 non-null float64
drct                14592 non-null float64
sped                14592 non-null float64
vsby                14592 non-null float64
dtypes: float64(13), object(1)
memory usage: 1.6+ MB


In [4]:
finalData.head()

Unnamed: 0.1,Unnamed: 0,PM2.5,USAQI,CO2,Temperature,RelativeHumidity,weekday,hour,month,year,dayofmonth,drct,sped,vsby
0,2017-01-01 00:00:00,210.5,260.5,438.0,24.5,62.5,6.0,0.0,1.0,2017.0,1.0,10.0,4.6,1.395
1,2017-01-01 01:00:00,198.0,248.0,442.0,24.0,58.0,6.0,1.0,1.0,2017.0,1.0,10.0,5.175,1.395
2,2017-01-01 02:00:00,195.5,245.5,436.5,24.0,54.0,6.0,2.0,1.0,2017.0,1.0,5.0,2.5875,1.0075
3,2017-01-01 03:00:00,144.666667,209.666667,425.666667,22.333333,56.0,6.0,3.0,1.0,2017.0,1.0,0.0,0.0,1.12
4,2017-01-01 04:00:00,133.666667,196.0,416.0,22.333333,53.666667,6.0,4.0,1.0,2017.0,1.0,5.0,2.3,1.1775


In [5]:
lst = ["datetime"] + list(finalData.columns)[1:]
lst

['datetime',
 'PM2.5',
 'USAQI',
 'CO2',
 'Temperature',
 'RelativeHumidity',
 'weekday',
 'hour',
 'month',
 'year',
 'dayofmonth',
 'drct',
 'sped',
 'vsby']

In [6]:
finalData.columns = lst

In [7]:
finalData.head()

Unnamed: 0,datetime,PM2.5,USAQI,CO2,Temperature,RelativeHumidity,weekday,hour,month,year,dayofmonth,drct,sped,vsby
0,2017-01-01 00:00:00,210.5,260.5,438.0,24.5,62.5,6.0,0.0,1.0,2017.0,1.0,10.0,4.6,1.395
1,2017-01-01 01:00:00,198.0,248.0,442.0,24.0,58.0,6.0,1.0,1.0,2017.0,1.0,10.0,5.175,1.395
2,2017-01-01 02:00:00,195.5,245.5,436.5,24.0,54.0,6.0,2.0,1.0,2017.0,1.0,5.0,2.5875,1.0075
3,2017-01-01 03:00:00,144.666667,209.666667,425.666667,22.333333,56.0,6.0,3.0,1.0,2017.0,1.0,0.0,0.0,1.12
4,2017-01-01 04:00:00,133.666667,196.0,416.0,22.333333,53.666667,6.0,4.0,1.0,2017.0,1.0,5.0,2.3,1.1775


In [8]:
len(finalData[finalData.year == 2017].month.unique()) #no of months in 2017
len(finalData[finalData.year == 2018].month.unique()) #no of months in 2018

8

In [9]:
finalData[finalData.year == 2017].info() #no of values in 2017
finalData[finalData.year == 2018].info() #no of values in 2018

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8760 entries, 0 to 8759
Data columns (total 14 columns):
datetime            8760 non-null object
PM2.5               8760 non-null float64
USAQI               8760 non-null float64
CO2                 8760 non-null float64
Temperature         8760 non-null float64
RelativeHumidity    8760 non-null float64
weekday             8760 non-null float64
hour                8760 non-null float64
month               8760 non-null float64
year                8760 non-null float64
dayofmonth          8760 non-null float64
drct                8760 non-null float64
sped                8760 non-null float64
vsby                8760 non-null float64
dtypes: float64(13), object(1)
memory usage: 1.0+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5832 entries, 8760 to 14591
Data columns (total 14 columns):
datetime            5832 non-null object
PM2.5               5832 non-null float64
USAQI               5832 non-null float64
CO2              

In [10]:
#we have data for a total of 20 months, let's train (roughly) for 16 months, and test for the remaining 4.

trainData = finalData[:30*16*24]
testData = finalData[30*16*24:]
trainData.info()
testData.info()
finalData.tail()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11520 entries, 0 to 11519
Data columns (total 14 columns):
datetime            11520 non-null object
PM2.5               11520 non-null float64
USAQI               11520 non-null float64
CO2                 11520 non-null float64
Temperature         11520 non-null float64
RelativeHumidity    11520 non-null float64
weekday             11520 non-null float64
hour                11520 non-null float64
month               11520 non-null float64
year                11520 non-null float64
dayofmonth          11520 non-null float64
drct                11520 non-null float64
sped                11520 non-null float64
vsby                11520 non-null float64
dtypes: float64(13), object(1)
memory usage: 1.2+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3072 entries, 11520 to 14591
Data columns (total 14 columns):
datetime            3072 non-null object
PM2.5               3072 non-null float64
USAQI               3072 non-null float64


Unnamed: 0,datetime,PM2.5,USAQI,CO2,Temperature,RelativeHumidity,weekday,hour,month,year,dayofmonth,drct,sped,vsby
14587,2018-08-31 19:00:00,26.571429,79.142857,506.0,26.857143,80.571429,4.0,19.0,8.0,2018.0,31.0,150.0,7.7625,2.8
14588,2018-08-31 20:00:00,26.571429,78.571429,475.428571,26.857143,82.0,4.0,20.0,8.0,2018.0,31.0,201.25,8.76875,2.955
14589,2018-08-31 21:00:00,26.0,76.857143,457.714286,26.857143,82.142857,4.0,21.0,8.0,2018.0,31.0,196.666667,7.283333,3.006667
14590,2018-08-31 22:00:00,25.714286,75.857143,447.0,26.857143,83.285714,4.0,22.0,8.0,2018.0,31.0,174.285714,7.064286,3.021429
14591,2018-08-31 23:00:00,24.142857,72.571429,440.285714,26.857143,83.285714,4.0,23.0,8.0,2018.0,31.0,174.0,8.28,2.986


In [11]:
"""
Train = trainData.shape
tau = 5
features = np.zeros((Train[0]-tau, tau*Train[1]-1))
features = pd.DataFrame(features)
print(features.shape)
for i in range(tau):
    features[:, i*Train[1]:(i+1)*Train[1]] = trainData[i:Train[0]-tau]
print(features)
"""

'\nTrain = trainData.shape\ntau = 5\nfeatures = np.zeros((Train[0]-tau, tau*Train[1]-1))\nfeatures = pd.DataFrame(features)\nprint(features.shape)\nfor i in range(tau):\n    features[:, i*Train[1]:(i+1)*Train[1]] = trainData[i:Train[0]-tau]\nprint(features)\n'

In [12]:
finalData.drop("datetime", axis = 1, inplace = True)

In [13]:
labels = finalData["PM2.5"]
finalData.drop("PM2.5", axis = 1, inplace = True)

In [14]:
mean = finalData.mean()
min = finalData.min()
max = finalData.max()
finalData = finalData.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))

In [15]:
name = list(finalData.columns)
cols, names = list(), list()
for i in range(24, 0, -1): #24 times steps
    cols.append(finalData.shift(i))
    names += [('%s(t-%d)' % (name[j], i)) for j in range(finalData.shape[1])]
cols.append(finalData.shift(-i))
names += [('%s(t)' % (name[j])) for j in range(finalData.shape[1])]
finalDataset = pd.concat(cols, axis=1)
finalDataset.columns = names
finalDataset = finalDataset[24:]
display(finalDataset.head())
#labels = finalDataset["PM2.5(t)"]
#finalDataset.drop("PM2.5(t)", axis = 1, inplace = True)
display(finalDataset)

Unnamed: 0,USAQI(t-24),CO2(t-24),Temperature(t-24),RelativeHumidity(t-24),weekday(t-24),hour(t-24),month(t-24),year(t-24),dayofmonth(t-24),drct(t-24),...,Temperature(t),RelativeHumidity(t),weekday(t),hour(t),month(t),year(t),dayofmonth(t),drct(t),sped(t),vsby(t)
24,0.61849,0.056511,0.348718,0.61236,1.0,0.0,0.0,0.0,0.0,0.027778,...,0.246154,0.445693,0.0,0.043478,0.0,0.0,0.033333,0.5,0.049342,0.302062
25,0.585938,0.061425,0.328205,0.561798,1.0,0.043478,0.0,0.0,0.0,0.027778,...,0.328205,0.516854,1.0,0.086957,0.0,0.0,0.0,0.013889,0.022204,0.093378
26,0.579427,0.054668,0.328205,0.516854,1.0,0.086957,0.0,0.0,0.0,0.013889,...,0.259829,0.539326,1.0,0.130435,0.0,0.0,0.0,0.0,0.0,0.112983
27,0.486111,0.04136,0.259829,0.539326,1.0,0.130435,0.0,0.0,0.0,0.0,...,0.218803,0.453184,0.0,0.173913,0.0,0.0,0.033333,0.0,0.0,0.141737
28,0.450521,0.029484,0.259829,0.513109,1.0,0.173913,0.0,0.0,0.0,0.013889,...,0.218803,0.456929,0.0,0.217391,0.0,0.0,0.033333,0.0,0.0,0.112983


Unnamed: 0,USAQI(t-24),CO2(t-24),Temperature(t-24),RelativeHumidity(t-24),weekday(t-24),hour(t-24),month(t-24),year(t-24),dayofmonth(t-24),drct(t-24),...,Temperature(t),RelativeHumidity(t),weekday(t),hour(t),month(t),year(t),dayofmonth(t),drct(t),sped(t),vsby(t)
24,0.618490,0.056511,0.348718,0.612360,1.000000,0.000000,0.000000,0.0,0.000000,0.027778,...,0.246154,0.445693,0.000000,0.043478,0.000000,0.0,0.033333,0.500000,0.049342,0.302062
25,0.585938,0.061425,0.328205,0.561798,1.000000,0.043478,0.000000,0.0,0.000000,0.027778,...,0.328205,0.516854,1.000000,0.086957,0.000000,0.0,0.000000,0.013889,0.022204,0.093378
26,0.579427,0.054668,0.328205,0.516854,1.000000,0.086957,0.000000,0.0,0.000000,0.013889,...,0.259829,0.539326,1.000000,0.130435,0.000000,0.0,0.000000,0.000000,0.000000,0.112983
27,0.486111,0.041360,0.259829,0.539326,1.000000,0.130435,0.000000,0.0,0.000000,0.000000,...,0.218803,0.453184,0.000000,0.173913,0.000000,0.0,0.033333,0.000000,0.000000,0.141737
28,0.450521,0.029484,0.259829,0.513109,1.000000,0.173913,0.000000,0.0,0.000000,0.013889,...,0.218803,0.456929,0.000000,0.217391,0.000000,0.0,0.033333,0.000000,0.000000,0.112983
29,0.477431,0.026618,0.246154,0.505618,1.000000,0.217391,0.000000,0.0,0.000000,0.064815,...,0.205128,0.460674,0.000000,0.260870,0.000000,0.0,0.033333,0.000000,0.000000,0.080308
30,0.516493,0.025389,0.246154,0.516854,1.000000,0.260870,0.000000,0.0,0.000000,0.022222,...,0.205128,0.460674,0.000000,0.304348,0.000000,0.0,0.033333,0.000000,0.000000,0.112983
31,0.542535,0.025799,0.232479,0.524345,1.000000,0.304348,0.000000,0.0,0.000000,0.092593,...,0.205128,0.486891,0.000000,0.347826,0.000000,0.0,0.033333,0.133333,0.047368,0.115074
32,0.552951,0.033170,0.232479,0.520599,1.000000,0.347826,0.000000,0.0,0.000000,0.038889,...,0.232479,0.490637,0.000000,0.391304,0.000000,0.0,0.033333,0.104167,0.037007,0.166570
33,0.559028,0.036855,0.259829,0.513109,1.000000,0.391304,0.000000,0.0,0.000000,0.379630,...,0.273504,0.483146,0.000000,0.434783,0.000000,0.0,0.033333,0.097222,0.039474,0.251089


In [16]:
finalDataset.reset_index(drop = True)
finalDataset = finalDataset[:-1]
#we have data for a total of 20 months, let's train (roughly) for 16 months, and test for the remaining 4.
trainDataX = finalDataset[:30*16*24]
testDataX = finalDataset[30*16*24:]
trainDataY = labels[:30*16*24]
testDataY = labels[30*16*24:]

#trainData.info()
#testData.info()


In [17]:
finalDataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14567 entries, 24 to 14590
Columns: 300 entries, USAQI(t-24) to vsby(t)
dtypes: float64(300)
memory usage: 33.3 MB


In [18]:
from mxnet import nd
finalDataset = nd.array(finalDataset).reshape(-1, 25, 12)
trainDataX = nd.array(trainDataX).reshape(-1, 25, 12)
testDataX = nd.array(testDataX).reshape(-1, 25, 12)

In [19]:
trainDataY = nd.array(trainDataY)
testDataY = nd.array(testDataY)

In [20]:
model = mxnet.gluon.nn.Sequential()
model.add(mxnet.gluon.rnn.LSTM(120, dropout = 0.2))
model.add(mxnet.gluon.nn.Dense(1, activation='tanh'))

model.initialize(mxnet.init.Xavier())

trainer = mxnet.gluon.Trainer(
    params=model.collect_params(),
    optimizer='adam',
    optimizer_params={'learning_rate': 0.001},
)

In [21]:
loss = mxnet.gluon.loss.L2Loss()
error = mxnet.metric.MSE()

In [None]:
epochs = 15
trainingLoss = []
train_iter = mxnet.io.NDArrayIter(trainDataX, trainDataY, 10, shuffle=True)
for epoch in range(epochs):
    for trn_batch in train_iter:
        x = trn_batch.data[0]
        y = trn_batch.label[0]
        
        with mxnet.autograd.record():
            y_pred = model(x)
            im_loss = loss(y_pred, y)
        
        #backprop
        im_loss.backward()
        
        #Optimize!
        trainer.step(batch_size=10)
        
    train_iter.reset()
    
    # Calculate train metrics
    
    predictions = model(trainDataX)
    error.update(trainDataY, predictions)
    trainingLoss.append(error.get()[1])
    error.reset()
    
    train_iter.reset()
    
    print("epoch: {} | trn_loss: {:.8f}".format(epoch+1,
                                                trainingLoss[-1]))

epoch: 1 | trn_loss: 3176.22094727
epoch: 2 | trn_loss: 3176.22094727
epoch: 3 | trn_loss: 3176.22094727
epoch: 4 | trn_loss: 3176.22094727
epoch: 5 | trn_loss: 3176.22094727
