# Grab-Microsoft Challenge 
## Traffic Management

- geohash6: geohash is a public domain geocoding system which encodes a geographic location into a short string of letters and digits with arbitrary precision. You are free to use any geohash library to encode/decode the geohashes into latitude and longitude or vice versa.(Examples:https://github.com/hkwi/python-geohash)
- day: the value indicates the sequential order and not a particular day of the month
- timestamp: start time of 15-minute intervals in the following format: <hour>:<minute>, where hour ranges from 0 to 23 and minute is either one of (0, 15, 30, 45)
- demand: aggregated demand normalised to be in the range [0,1]

### This is train LSTM model 

#### Import python library

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# plot matplotlib graph
%matplotlib inline

#Import models from scikit learn module:
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Flatten, LSTM, BatchNormalization
from tensorflow.keras import regularizers
from sklearn.model_selection import train_test_split

import pickle
from sklearn.externals import joblib


In [21]:
import os 
os.environ["CUDA_VISIBLE_DEVICES"]="0"

#### Reading from Preprocessed dataset

In [3]:
xls  = pd.ExcelFile('data/Dataset_feature1.xlsx')
data = pd.read_excel(xls, 'TrafficMgmt')
data

Unnamed: 0,day,demand,hour,min,dow,lat,long,geo_labelencoded
0,0.283333,0.020072,0.869565,0.000000,0.666667,0.533333,0.171429,0.161462
1,0.150000,0.024721,0.608696,0.666667,0.500000,0.288889,0.200000,0.071592
2,0.133333,0.102821,0.260870,0.333333,0.333333,0.644444,0.828571,0.795887
3,0.516667,0.088755,0.217391,0.000000,0.666667,0.533333,0.428571,0.530845
4,0.233333,0.074468,0.173913,0.000000,0.166667,0.288889,0.342857,0.348819
...,...,...,...,...,...,...,...,...
1048570,0.083333,0.024022,0.347826,0.666667,1.000000,0.911111,0.771429,0.956588
1048571,0.366667,0.005703,0.652174,0.666667,0.333333,0.111111,0.400000,0.297030
1048572,0.683333,0.067131,0.652174,1.000000,0.000000,0.533333,0.200000,0.178218
1048573,0.016667,0.151323,0.260870,0.666667,0.333333,0.711111,0.600000,0.648134


In [4]:
predictor_var = ['day', 'hour', 'min', 'dow', 'lat', 'long','geo_labelencoded']
outcome_var   = 'demand'

#### Prepare training & test data
- Since this is a time series dataset, we can try using LSTM 
- and create a time series of say 20

In [11]:
data_X = data.drop(outcome_var,axis=1)
y      = data.drop(predictor_var,axis=1)

In [12]:
X = []
for i in range(20):
    X.append(data_X.shift(-1-i).fillna(-1).values)
X = np.array(X)

X = X.reshape(X.shape[1],X.shape[0],X.shape[2])
X.shape

(1048575, 20, 7)

In [13]:
X

array([[[ 0.15      ,  0.60869565,  0.66666667, ...,  0.28888889,
          0.2       ,  0.07159177],
        [ 0.13333333,  0.26086957,  0.33333333, ...,  0.64444444,
          0.82857143,  0.79588728],
        [ 0.51666667,  0.2173913 ,  0.        , ...,  0.53333333,
          0.42857143,  0.53084539],
        ...,
        [ 0.78333333,  0.47826087,  0.66666667, ...,  0.37777778,
          0.05714286,  0.03808073],
        [ 0.26666667,  1.        ,  1.        , ...,  0.77777778,
          0.62857143,  0.65955826],
        [ 0.91666667,  0.43478261,  0.        , ...,  0.28888889,
          0.48571429,  0.36785986]],

       [[ 0.85      ,  0.47826087,  0.        , ...,  1.        ,
          0.82857143,  0.97258187],
        [ 0.93333333,  0.7826087 ,  0.66666667, ...,  0.77777778,
          0.62857143,  0.65955826],
        [ 0.33333333,  0.26086957,  0.        , ...,  0.77777778,
          0.54285714,  0.65346535],
        ...,
        [ 0.8       ,  0.2173913 ,  0.33333333, ...,  

In [14]:
y.shape

(1048575, 1)

In [15]:
X.shape

(1048575, 20, 7)

In [16]:
y.count()

demand    1048575
dtype: int64

#### Train-test-val dataset split

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,shuffle=True)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42,shuffle=True)

In [24]:
X_train.shape[1]

20

In [25]:
X_train.shape[2]

7

In [32]:
X_train1 = np.asarray(X_train)
y_train1 = np.asarray(y_train)
X_test1  = np.asarray(X_test)
y_test1  = np.asarray(y_test)

<h2><center>Create Model and test</center><h2>

<h3>LSTM<h3>

In [22]:
model = Sequential()
model.add(LSTM(units=32, return_sequences=True, input_shape=(X_train1.shape[1],X_train1.shape[2])))
model.add(BatchNormalization())
model.add(LSTM(units=32, return_sequences=True))
model.add(BatchNormalization())
model.add(LSTM(units=32))
model.add(BatchNormalization())
model.add(Dense(units=1))

print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 20, 32)            5120      
_________________________________________________________________
batch_normalization (BatchNo (None, 20, 32)            128       
_________________________________________________________________
lstm_1 (LSTM)                (None, 20, 32)            8320      
_________________________________________________________________
batch_normalization_1 (Batch (None, 20, 32)            128       
_________________________________________________________________
lstm_2 (LSTM)                (None, 32)                8320      
_________________________________________________________________
batch_normalization_2 (Batch (None, 32)                128       
_________________________________________________________________
dense (Dense)                (None, 1)                 3

In [33]:
model.compile(optimizer='adam', loss='mse', metrics=['mse', 'mae'])

model.fit(X_train1, y_train1, epochs=5, batch_size=10, verbose=2)

model.save('lstm1_model.h5')

predicted_value = model.predict(X_test1)

print(mean_squared_error(y_test1,predicted_value))

Train on 671088 samples
Epoch 1/5
671088/671088 - 668s - loss: 0.0260 - mse: 0.0260 - mae: 0.0987
Epoch 2/5
671088/671088 - 655s - loss: 0.0251 - mse: 0.0251 - mae: 0.0969
Epoch 3/5
671088/671088 - 654s - loss: 0.0251 - mse: 0.0251 - mae: 0.0969
Epoch 4/5
671088/671088 - 654s - loss: 0.0251 - mse: 0.0251 - mae: 0.0969
Epoch 5/5
671088/671088 - 653s - loss: 0.0251 - mse: 0.0251 - mae: 0.0968
0.025327611485106736


In [34]:
print(mean_squared_error(y_test1,predicted_value))

0.025327611485106736
