In [None]:
# Let's import all the packages that we may need:

import sys 
import numpy as np # linear algebra
from scipy.stats import randint
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv), data manipulation as in SQL
import matplotlib.pyplot as plt # this is used for the plot the graph 
import seaborn as sns # used for plot interactive graph. 
from sklearn.model_selection import train_test_split # to split the data into two parts
from sklearn.cross_validation import KFold # use for cross validation
from sklearn.preprocessing import StandardScaler # for normalization
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline # pipeline making
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn import metrics # for the check the error and accuracy of the model
from sklearn.metrics import mean_squared_error,r2_score

## for deep learning:
import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical
from keras.optimizers import SGD 
from keras.callbacks import EarlyStopping
from keras.utils import np_utils
import itertools
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers import Dropout



In [None]:
# Load the data and parse load_time into real timestamps at read time.
# infer_datetime_format only takes effect together with parse_dates (and is
# deprecated in pandas 2.x), so on its own it left load_time as plain strings.
df = pd.read_csv('austin_waste_and_diversion.csv', parse_dates=['load_time'])

In [None]:
# Preview the first 10 rows of the loaded frame.
df.head(10)

In [None]:
# Print a summary of the frame: columns, non-null counts and dtypes.
df.info()

In [None]:
# Show the dtype of each column.
df.dtypes

In [None]:
# (number of rows, number of columns) of the frame.
df.shape

In [None]:
# Summary statistics of the numeric columns. describe is a method and must be
# called; the bare attribute only displays the bound-method repr.
df.describe()

In [119]:
# Show the first 10 rows of df.
df.head(10)

Unnamed: 0,load_time,load_weight
0,2009-05-27 12:55:00,4060.0
2,2012-09-06 13:56:00,5060.0
3,2016-03-14 07:56:00,3720.0
4,2004-12-02 15:29:00,13840.0
5,2007-08-02 15:58:00,2180.0
6,2006-02-15 12:15:00,15720.0
7,2005-07-25 15:00:00,13220.0
8,2008-07-16 00:00:00,3780.0
9,2008-02-29 11:45:00,12140.0
10,2017-03-22 07:40:00,15840.0


In [None]:

## Positional indices of all columns that contain at least one NaN.
# The column count comes from the frame itself instead of a hard-coded 3, so
# the scan neither skips columns nor runs past the last one.
droping_list_all = [
    j for j in range(df.shape[1])
    if df.iloc[:, j].isnull().any()
]
droping_list_all

In [121]:

# Fill NaN in every numeric column with that column's own mean.
# The earlier loop was hard-coded to positional column 1 only, so any other
# column kept its missing values. fillna with a Series keyed by column name
# fills each listed column with its matching value and leaves the rest alone.
df = df.fillna(df.mean(numeric_only=True))
df.head(10)

Unnamed: 0,load_time,load_weight
0,2009-05-27 12:55:00,4060.0
2,2012-09-06 13:56:00,5060.0
3,2016-03-14 07:56:00,3720.0
4,2004-12-02 15:29:00,13840.0
5,2007-08-02 15:58:00,2180.0


In [None]:

# Count the missing values in each column.
df.isnull().sum()

In [None]:

# Daily totals of every column of df.
# NOTE(review): resample() needs a datetime-like index and no visible cell
# sets one on df — confirm df is indexed by load_time before this cell runs.
df.resample('D').sum().plot(title='waste_generation_rate resampled over day for sum')
plt.tight_layout()
plt.show()

# Daily mean of load_weight; the title now says "mean" to match the aggregation
# (it previously repeated "for sum").
df.load_weight.resample('D').mean().plot(title='waste generation rate resampled over day for mean', color='red')
plt.tight_layout()
plt.show()

In [None]:

### Mean of 'waste generation rates' resampled over month
monthly_mean = df['load_weight'].resample('M').mean()
monthly_mean.plot.bar()
plt.title('waste generation rate per month')
plt.ylabel('Waste generation rate')
plt.xticks(rotation=60)
plt.show()

In [None]:
## Bar chart of load_weight averaged over each quarter
quarterly_mean = df['load_weight'].resample('Q').mean()
quarterly_mean.plot.bar()
plt.title('Waste generation per quarter (averaged over quarter)')
plt.ylabel('Waste generation rate')
plt.xticks(rotation=60)
plt.show()


In [None]:

# Monthly mean of load_weight as a red bar chart
monthly_mean = df['load_weight'].resample('M').mean()
monthly_mean.plot.bar(color='red')
plt.title('waste generation rate per month')
plt.ylabel('Waste generation rate')
plt.xticks(rotation=60)
plt.show()

In [None]:

# Monthly mean of load_weight as a brown bar chart
monthly_mean = df['load_weight'].resample('M').mean()
monthly_mean.plot.bar(color='brown')
plt.title('waste generation per month')
plt.ylabel('Waste generation rate')
plt.xticks(rotation=60)
plt.show()

In [None]:
# Compare the daily mean of each feature, one subplot per resampled column.
daily_mean = df.resample('D').mean()
values = daily_mean.values

plt.figure(figsize=(15, 10))
# Iterate over the resampled frame's own columns so the number of panels and
# their titles always match the plotted data (previously hard-coded to
# columns 0-2 and titled from df.columns).
n_panels = len(daily_mean.columns)
for position, column in enumerate(daily_mean.columns):
    plt.subplot(n_panels, 1, position + 1)
    plt.plot(values[:, position])
    plt.title(column, y=0.75, loc='right')
plt.show()

In [None]:

## Weekly mean of load_weight, drawn once in yellow and once in red.
# NOTE(review): both passes plot the identical series, so the red line covers
# the yellow one — confirm whether a second aggregate was intended.
weekly_mean = df.load_weight.resample('W').mean()
for line_color in ('y', 'r'):
    weekly_mean.plot(color=line_color, legend=True)
plt.show()

In [None]:
# Histogram of the monthly mean of load_weight.
df.load_weight.resample('M').mean().plot.hist(color='r', legend=True)

In [None]:
# Percentage change between consecutive rows of df, then a seaborn joint plot
# of two columns of the result.
data_returns = df.pct_change()
# NOTE(review): the frame previewed earlier shows columns load_time and
# load_weight; 'Day' and 'Waste generation rate' are not among them — confirm
# which columns were meant here.
sns.jointplot(x='Day', y='Waste generation rate', data=data_returns)  

plt.show()

In [None]:
# Spearman correlation between the columns of the un-resampled frame
raw_corr = df.corr(method='spearman')
plt.matshow(raw_corr, cmap='PRGn', vmin=-1, vmax=1)
plt.title('without resampling', size=15)
plt.colorbar()
plt.show()

In [None]:

# Spearman correlation of the feature means, resampled per month and per year
monthly_corr = df.resample('M').mean().corr(method='spearman')
plt.matshow(monthly_corr, cmap='PRGn', vmin=-1, vmax=1)
plt.title('resampled over month', size=15)
plt.colorbar()
plt.margins(0.02)

yearly_corr = df.resample('A').mean().corr(method='spearman')
plt.matshow(yearly_corr, cmap='PRGn', vmin=-1, vmax=1)
plt.title('resampled over year', size=15)
plt.colorbar()
plt.show()


In [None]:
## Reframe the waste-generation time series as a supervised learning problem
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Build a table of lagged inputs and forecast outputs from a time series.

    Parameters
    ----------
    data : list, array-like or DataFrame
        Observations in time order. 1-D input is treated as a single
        variable; 2-D input has one column per variable.
    n_in : int
        Number of lag steps (t-n_in ... t-1) emitted as input columns.
    n_out : int
        Number of forecast steps (t ... t+n_out-1) emitted as output columns.
    dropnan : bool
        When True, drop the rows left incomplete by the shifting.

    Returns
    -------
    pandas.DataFrame
        Columns are named ``var<j>(t-<i>)``, ``var<j>(t)`` and ``var<j>(t+<i>)``.
    """
    dff = pd.DataFrame(data)
    # Take the variable count from the frame itself. The earlier
    # ``type(data) is list`` check reported 1 for a list of rows (so the column
    # names did not match the columns) and raised on 1-D arrays.
    n_vars = dff.shape[1]
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(dff.shift(i))
        names += ['var%d(t-%d)' % (j + 1, i) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(n_out):
        cols.append(dff.shift(-i))
        if i == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, i) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # shifting leaves NaN at the start/end of the table; drop those rows on request
    if dropnan:
        agg = agg.dropna()
    return agg
 

In [None]:
## Resample the data into hourly bins and take the mean of each bin
# NOTE(review): resample() requires a datetime-like index and no visible cell
# sets one on df — confirm before relying on this cell.
df_resample = df.resample('h').mean() 
# (rows, columns) of the hourly frame
df_resample.shape

In [None]:
## * Note: every feature is scaled into the range [0, 1] before training.

## Train on the hourly-resampled data. (To train on the full, un-resampled
## data instead, take the values from df rather than df_resample.)
values = df_resample.values

# MinMax scaler mapping each feature onto the closed interval [0, 1];
# normalising before training is important for the network.
scaler = MinMaxScaler(feature_range=(0, 1))

# Learn the per-feature min/max from the raw values, then rescale them.
scaler.fit(values)
scaled = scaler.transform(values)

# Frame as supervised learning: one lagged step as input, one step as label.
reframed = series_to_supervised(scaled, n_in=1, n_out=1)

print(reframed.head())

In [None]:
## Build and train a single-layer LSTM with dropout, then score it on the
## held-out data in the original (un-scaled) units.
# NOTE(review): train_X, train_y, test_X and test_y are not defined in any
# cell visible in this notebook — confirm the train/test split cell exists and
# produces 3-D X arrays (the LSTM input_shape below reads axes 1 and 2).

model = Sequential()
model.add(LSTM(100, input_shape=(train_X.shape[1], train_X.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(1))  # single-unit fully connected layer producing the prediction
# The training loss is mean squared error; RMSE is only reported after training.
model.compile(loss='mean_squared_error', optimizer='adam')

# fit network (shuffle=False keeps the samples in the order they are given)
history = model.fit(train_X, train_y, epochs=20, batch_size=70, validation_data=(test_X, test_y), verbose=2, shuffle=False)

# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper right')
plt.show()

# make a prediction
yhat = model.predict(test_X)
# Flatten the inputs to 2-D without hard-coding the feature count (it was fixed
# at 7). New names are used so test_X / test_y stay untouched and the cell can
# be re-run.
test_X_flat = test_X.reshape((test_X.shape[0], -1))
# invert scaling for forecast: the scaler expects a full-width row, so the
# prediction goes in column 0 and the remaining input columns pad the rest
# (previously hard-coded as the last 6 columns).
inv_yhat = np.concatenate((yhat, test_X_flat[:, 1:]), axis=1)
inv_yhat = scaler.inverse_transform(inv_yhat)
inv_yhat = inv_yhat[:, 0]
# invert scaling for actual
test_y_col = test_y.reshape((len(test_y), 1))
inv_y = np.concatenate((test_y_col, test_X_flat[:, 1:]), axis=1)
inv_y = scaler.inverse_transform(inv_y)
inv_y = inv_y[:, 0]
# RMSE on the un-scaled values, reported as an evaluation metric
rmse = np.sqrt(mean_squared_error(inv_y, inv_yhat))
print('Test RMSE: %.3f' % rmse)

In [None]:

# Note: to improve the model, adjust epochs and batch_size in the training cell.

## Time steps: every step is one hour (it can be converted back to the actual
## time index). For demonstration only the first 200 hours are compared; the
## window is capped at the number of available predictions so that x and y
## always have the same length.
n_show = min(200, len(inv_y))
time_steps = list(range(n_show))
plt.plot(time_steps, inv_y[:n_show], marker='.', label="actual")
plt.plot(time_steps, inv_yhat[:n_show], 'r', label="prediction")
plt.ylabel('Waste generation rate', size=15)
plt.xlabel('Time step', size=15)
plt.legend(fontsize=15)
plt.show()
