# Grab-Microsoft Challenge 
## Traffic Management

- geohash6: geohash is a public domain geocoding system which encodes a geographic location into a short string of letters and digits with arbitrary precision. You are free to use any geohash library to encode/decode the geohashes into latitude and longitude or vice versa.(Examples:https://github.com/hkwi/python-geohash)
- day: the value indicates the sequential order and not a particular day of the month
- timestamp: start time of 15-minute intervals in the following format: <hour>:<minute>, where hour ranges from 0 to 23 and minute is either one of (0, 15, 30, 45)
- demand: aggregated demand normalised to be in the range [0,1]

### This is train LSTM model 

#### Import python library

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# plot matplotlib graph
%matplotlib inline

#Import models from scikit learn module:
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Flatten, LSTM, BatchNormalization
from tensorflow.keras import regularizers
from sklearn.model_selection import train_test_split

import h5py
import sklearn.metrics as metrics
from tensorflow.keras.callbacks import ModelCheckpoint,CSVLogger
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import LSTM
from tensorflow.keras.utils import plot_model 

import pickle
import joblib


In [2]:
import os 
os.environ["CUDA_VISIBLE_DEVICES"]="0"

### Define Variables

In [27]:
seed = 7
np.random.seed(seed)
modelname   = 'LSTM2'
batch_size  = 32768
no_of_epoch = 10
optmz       = 'adam'

### Define Function

In [8]:
def createModel(): 
    inputs  = Input(shape=(X_train1.shape[1],X_train1.shape[2]))
    y = LSTM(units=32, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)(inputs)
    y = BatchNormalization()(y)
    y = LSTM(128, return_sequences=True, dropout=0.3, recurrent_dropout=0.2)(y)
    y = BatchNormalization()(y)
    y = LSTM(128, return_sequences=True, dropout=0.3, recurrent_dropout=0.3)(y)
    y = BatchNormalization()(y)
    y = LSTM(64, return_sequences=True, dropout=0.4, recurrent_dropout=0.4)(y)
    y = BatchNormalization()(y)
    y = LSTM(48, return_sequences=True, dropout=0.5,recurrent_dropout=0.5)(y)
    y = BatchNormalization()(y)
    y = LSTM(32, return_sequences=True, dropout=0.5,recurrent_dropout=0.5)(y)
    y = BatchNormalization()(y)
    y = Dense(1, activation='sigmoid')(y)
  
    model = Model(inputs=inputs,outputs=y)
    model.compile(loss='mse',optimizer='adam', metrics=['mse', 'mae'])
    return model

#### Reading from Preprocessed dataset

In [5]:
xls  = pd.ExcelFile('data/Dataset_feature1.xlsx')
data = pd.read_excel(xls, 'TrafficMgmt')
data

Unnamed: 0,day,demand,hour,min,dow,lat,long,geo_labelencoded
0,0.283333,0.020072,0.869565,0.000000,0.666667,0.533333,0.171429,0.161462
1,0.150000,0.024721,0.608696,0.666667,0.500000,0.288889,0.200000,0.071592
2,0.133333,0.102821,0.260870,0.333333,0.333333,0.644444,0.828571,0.795887
3,0.516667,0.088755,0.217391,0.000000,0.666667,0.533333,0.428571,0.530845
4,0.233333,0.074468,0.173913,0.000000,0.166667,0.288889,0.342857,0.348819
...,...,...,...,...,...,...,...,...
1048570,0.083333,0.024022,0.347826,0.666667,1.000000,0.911111,0.771429,0.956588
1048571,0.366667,0.005703,0.652174,0.666667,0.333333,0.111111,0.400000,0.297030
1048572,0.683333,0.067131,0.652174,1.000000,0.000000,0.533333,0.200000,0.178218
1048573,0.016667,0.151323,0.260870,0.666667,0.333333,0.711111,0.600000,0.648134


In [6]:
predictor_var = ['day', 'hour', 'min', 'dow', 'lat', 'long','geo_labelencoded']
outcome_var   = 'demand'

#### Prepare training & test data
- Since this is a time series dataset, we can try using LSTM 
- and create a time series of say 20

In [7]:
data_X = data.drop(outcome_var,axis=1)
y      = data.drop(predictor_var,axis=1)

In [9]:
X = []
for i in range(20):
    X.append(data_X.shift(-1-i).fillna(-1).values)
X = np.array(X)

X = X.reshape(X.shape[1],X.shape[0],X.shape[2])
X.shape

(1048575, 20, 7)

In [10]:
X

array([[[ 0.15      ,  0.60869565,  0.66666667, ...,  0.28888889,
          0.2       ,  0.07159177],
        [ 0.13333333,  0.26086957,  0.33333333, ...,  0.64444444,
          0.82857143,  0.79588728],
        [ 0.51666667,  0.2173913 ,  0.        , ...,  0.53333333,
          0.42857143,  0.53084539],
        ...,
        [ 0.78333333,  0.47826087,  0.66666667, ...,  0.37777778,
          0.05714286,  0.03808073],
        [ 0.26666667,  1.        ,  1.        , ...,  0.77777778,
          0.62857143,  0.65955826],
        [ 0.91666667,  0.43478261,  0.        , ...,  0.28888889,
          0.48571429,  0.36785986]],

       [[ 0.85      ,  0.47826087,  0.        , ...,  1.        ,
          0.82857143,  0.97258187],
        [ 0.93333333,  0.7826087 ,  0.66666667, ...,  0.77777778,
          0.62857143,  0.65955826],
        [ 0.33333333,  0.26086957,  0.        , ...,  0.77777778,
          0.54285714,  0.65346535],
        ...,
        [ 0.8       ,  0.2173913 ,  0.33333333, ...,  

In [11]:
y.shape

(1048575, 1)

In [12]:
X.shape

(1048575, 20, 7)

#### Train-test-val dataset split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,shuffle=True)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42,shuffle=True)

In [14]:
X_train.shape[1]

20

In [15]:
X_train.shape[2]

7

In [16]:
X_train1 = np.asarray(X_train)
y_train1 = np.asarray(y_train)
X_test1  = np.asarray(X_test)
y_test1  = np.asarray(y_test)
X_val1  = np.asarray(X_val)
y_val1  = np.asarray(y_val)

In [17]:
X_train1.shape

(671088, 20, 7)

In [18]:
y_train1.shape

(671088, 1)

In [19]:
X_val1.shape

(167772, 20, 7)

In [20]:
y_val1.shape

(167772, 1)

<h2><center>Create Model and test</center><h2>

<h3>LSTM<h3>

In [21]:
model = createModel()
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 20, 7)]           0         
_________________________________________________________________
lstm (LSTM)                  (None, 20, 32)            5120      
_________________________________________________________________
batch_normalization (BatchNo (None, 20, 32)            128       
_________________________________________________________________
lstm_1 (LSTM)                (None, 20, 128)           82432     
_________________________________________________________________
batch_normalization_1 (Batch (None, 20, 128)           512       
_________________________________________________________________
lstm_2 (LSTM)                (None, 20, 128)           131584    
_________________________________________________________________
batch_normalization_2 (Batch (None, 20, 128)           512   

In [22]:
model.compile(loss='mse',optimizer='adam', metrics=['mse', 'mae'])

In [23]:
# Create checkpoints to save model during training and save training data into csv
# ‘monitor’ can be ‘val_acc’ or ‘val_loss’
# When set to ‘val_acc’, ‘mode’ must be ‘max’; when set to ‘val_loss’, ‘mode’ must be ‘min’

filepath       = modelname + ".hdf5"
checkpoint     = ModelCheckpoint(filepath, monitor='val_loss',verbose=0,save_best_only=True,mode='min') 
csv_logger     = CSVLogger(modelname + '.csv')
callbacks_list = [checkpoint,csv_logger]

In [26]:
# The line for training
history = model.fit(X_train1, 
                    y_train1, 
                    validation_data=(X_val1, y_val1), 
                    epochs=no_of_epoch, 
                    batch_size=batch_size,
                    shuffle=True,
                    callbacks=callbacks_list) 

ValueError: A target array with shape (671088, 1) was passed for an output of shape (None, 20, 1) while using as loss `mean_squared_error`. This loss expects targets to have the same shape as the output.

### validation dataset test result

In [None]:
predicts = model.predict(X_val1)
print(mean_squared_error(y_val1,predicts))

### Test dataset test result

In [None]:
predicted_value = model.predict(X_test1)
print(mean_squared_error(y_test1,predicted_value))

In [None]:
records     = pd.read_csv(modelname +'.csv')
plt.figure(figsize=(15,10))
plt.subplot(211)
plt.plot(records['val_mse'],label="mse_loss")
plt.plot(records['val_loss'],label="val_loss")
plt.plot(records['loss'],label="loss")
plt.title('MSE',fontsize=12)
plt.legend(loc="upper left",fontsize=15)

ax          = plt.gca()
ax.set_xticklabels([])

plt.subplot(212)
plt.plot(records['val_mae'],label="val_mae")
plt.plot(records['val_loss'],label="val_loss")
plt.plot(records['loss'],label="loss")
plt.title('MAE',fontsize=12)
plt.legend(loc="upper left",fontsize=15)

plt.show()


In [None]:
# evaluate the model
train_mse = model.evaluate(X_train1, y_train, verbose=0)
test_mse = model.evaluate(X_test1, y_test, verbose=0)

In [None]:
train_mse

In [None]:
test_mse

In [None]:
# plot loss during training
plt.title('Loss / Mean Squared Error')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show()

### Testing using 1 row of inference data

In [None]:
# Data input sequence format :
test_data1   = [[18,20, 0, 4, -5.353088, 90.653687]]
test_data1   = pd.DataFrame(test_data1, columns = predictor_var)
test_target1 = [0.020072]

test_data2   = [[10,14,30, 3, -5.413513, 90.664673]]
test_data2   = pd.DataFrame(test_data2, columns = predictor_var)
test_target2 = [0.024721] 

test_data3   = [[ 9, 6,15, 2, -5.325623, 90.906372]]
test_data3   = pd.DataFrame(test_data3, columns = predictor_var)
test_target3 = [0.102821]

test_data4   = [[32, 5, 0, 4, -5.353088, 90.752563]]
test_data4   = pd.DataFrame(test_data4, columns = predictor_var)
test_target4 = [0.088755]

test_data5   = [[15, 4, 0, 1, -5.413513, 90.719604]]
test_data5   = pd.DataFrame(test_data5, columns = predictor_var)
test_target5 = [0.074468]

In [None]:
test_data1.shape

In [None]:
scaler_filename = "grab_msft_scaler.save"
scaler          = joblib.load(scaler_filename) 
Data_scaled = scaler.transform(test_data)

In [None]:
Data_scaled

In [None]:
Data_scaled = Data_scaled.reshape(1,1,7)

In [None]:
Data_scaled[0]

In [None]:
X_shift = np.repeat(Data_scaled[:, :, np.newaxis], 20, axis=2).reshape(1,20,7)
X_shift.shape

In [None]:
predicted_value = model.predict(X_shift)
print('Predicted value : {}\nActual Value    : {}' .format(predicted_value[0][0],test_target[0]))

In [None]:
print(mean_squared_error(test_target,predicted_value))