In [273]:
from math import sqrt
from numpy import concatenate
import numpy as np
import matplotlib.pyplot as pyplot
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split


data_set = pd.read_csv('/Users/shenghaoisyummy/Capstone/feature_extraction_data_dummis_all.csv', index_col=0)

In [274]:
list(data_set)

['node_seq_order',
 'arrv_late_time',
 'dprt_late_time',
 'last_node_arrv_late_time',
 'last_node_dprt_late_time',
 'last_2_node_arrv_late_time_sum',
 'last_2_node_dprt_late_time_sum',
 'last_2_node_arrv_late_time_mean',
 'last_2_node_dprt_late_time_mean',
 'last_2_node_arrv_late_time_median',
 'last_2_node_dprt_late_time_median',
 'last_2_node_arrv_late_time_std',
 'last_2_node_dprt_late_time_std',
 'last_2_node_arrv_late_time_max',
 'last_2_node_dprt_late_time_max',
 'last_2_node_arrv_late_time_min',
 'last_2_node_dprt_late_time_min',
 'last_7_node_arrv_late_time_sum',
 'last_7_node_dprt_late_time_sum',
 'last_7_node_arrv_late_time_mean',
 'last_7_node_dprt_late_time_mean',
 'last_7_node_arrv_late_time_median',
 'last_7_node_dprt_late_time_median',
 'last_7_node_arrv_late_time_std',
 'last_7_node_dprt_late_time_std',
 'last_7_node_arrv_late_time_max',
 'last_7_node_dprt_late_time_max',
 'last_7_node_arrv_late_time_min',
 'last_7_node_dprt_late_time_min',
 'last_14_node_arrv_late_time

In [275]:
data_Y = data_set[['actual_station_arrv_time_month','actual_station_arrv_time_days','actual_station_arrv_time_hours','actual_station_arrv_time_minutes']]
data_set.drop(['actual_station_arrv_time_month','actual_station_arrv_time_days','actual_station_arrv_time_hours','actual_station_arrv_time_minutes', 'arrv_late_time', 'dprt_late_time', 'actual_station_dprt_time_month', 'actual_station_dprt_time_days', 'actual_station_dprt_time_hours', 'actual_station_dprt_time_minutes'], axis=1, inplace=True)


In [277]:
data_X = data_set
data_X.shape
list(data_set)

['node_seq_order',
 'last_node_arrv_late_time',
 'last_node_dprt_late_time',
 'last_2_node_arrv_late_time_sum',
 'last_2_node_dprt_late_time_sum',
 'last_2_node_arrv_late_time_mean',
 'last_2_node_dprt_late_time_mean',
 'last_2_node_arrv_late_time_median',
 'last_2_node_dprt_late_time_median',
 'last_2_node_arrv_late_time_std',
 'last_2_node_dprt_late_time_std',
 'last_2_node_arrv_late_time_max',
 'last_2_node_dprt_late_time_max',
 'last_2_node_arrv_late_time_min',
 'last_2_node_dprt_late_time_min',
 'last_7_node_arrv_late_time_sum',
 'last_7_node_dprt_late_time_sum',
 'last_7_node_arrv_late_time_mean',
 'last_7_node_dprt_late_time_mean',
 'last_7_node_arrv_late_time_median',
 'last_7_node_dprt_late_time_median',
 'last_7_node_arrv_late_time_std',
 'last_7_node_dprt_late_time_std',
 'last_7_node_arrv_late_time_max',
 'last_7_node_dprt_late_time_max',
 'last_7_node_arrv_late_time_min',
 'last_7_node_dprt_late_time_min',
 'last_14_node_arrv_late_time_sum',
 'last_14_node_dprt_late_time_s

In [278]:
data_Y.shape

(69197, 4)

In [279]:
# make sure value as float
data_X_values = data_X.values
data_X_values = data_X_values.astype('float32')

data_Y_values = data_Y.values
data_Y_values = data_Y_values.astype('float32')
# normalization
scaler = MinMaxScaler(feature_range=(0, 1))
data_X_values = scaler.fit_transform(data_X_values)
data_Y_values = scaler.fit_transform(data_Y_values)
# split train test
train_X, test_X, train_y, test_y = train_test_split(data_X_values, data_Y_values, test_size = 0.2, random_state = 4)

# reshape input to be 3D [samples, timesteps, features]
train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))



In [280]:
model = Sequential()

In [281]:
model.add(LSTM(50,  dropout=0.4, input_shape=(train_X.shape[1], train_X.shape[2])))
model.add(Dense(4, activation='relu'))
model.add(Dense(4, activation='relu'))

model.compile(loss='mae', optimizer='adam', metrics=['accuracy'])
history = model.fit(train_X, train_y, epochs=30, batch_size=1, validation_data=(test_X, test_y), verbose=2, shuffle=False)
pyplot.plot(history.history['loss'], label='train')
pyplot.plot(history.history['val_loss'], label='test')
pyplot.legend()
pyplot.show()

yhat = model.predict(test_X)
test_X = test_X.reshape((test_X.shape[0], test_X.shape[2]))
# invert scaling for forecast
inv_yhat = concatenate((yhat, test_X[:, 1:]), axis=1)
inv_yhat = scaler.inverse_transform(inv_yhat)
inv_yhat = inv_yhat[:,0]
# invert scaling for actual
test_y = test_y.reshape((len(test_y), 1))
inv_y = concatenate((test_y, test_X[:, 1:]), axis=1)
inv_y = scaler.inverse_transform(inv_y)
inv_y = inv_y[:,0]
# calculate RMSE
rmse = sqrt(mean_squared_error(inv_y, inv_yhat))
print('Test RMSE: %.3f' % rmse)

Train on 55357 samples, validate on 13840 samples
Epoch 1/30
 - 464s - loss: 7.4333 - acc: 0.7157 - val_loss: 7.0647 - val_acc: 0.7295
Epoch 2/30
 - 460s - loss: 7.1069 - acc: 0.7274 - val_loss: 6.9998 - val_acc: 0.7295
Epoch 3/30
 - 464s - loss: 7.0811 - acc: 0.7274 - val_loss: 7.0862 - val_acc: 0.7295
Epoch 4/30
 - 467s - loss: 7.0898 - acc: 0.7274 - val_loss: 7.0381 - val_acc: 0.7295
Epoch 5/30
 - 455s - loss: 7.0896 - acc: 0.7274 - val_loss: 6.9835 - val_acc: 0.7295
Epoch 6/30
 - 456s - loss: 7.0676 - acc: 0.7274 - val_loss: 6.9390 - val_acc: 0.7295
Epoch 7/30
 - 463s - loss: 7.0455 - acc: 0.7274 - val_loss: 6.9069 - val_acc: 0.7295
Epoch 8/30
 - 467s - loss: 7.0313 - acc: 0.7274 - val_loss: 6.9286 - val_acc: 0.7295
Epoch 9/30
 - 525s - loss: 7.0341 - acc: 0.7274 - val_loss: 7.0092 - val_acc: 0.7295
Epoch 10/30
 - 827s - loss: 7.0312 - acc: 0.7274 - val_loss: 6.8785 - val_acc: 0.7295
Epoch 11/30


KeyboardInterrupt: 