# Grab-Microsoft Challenge 
## Traffic Management

- geohash6: geohash is a public domain geocoding system which encodes a geographic location into a short string of letters and digits with arbitrary precision. You are free to use any geohash library to encode/decode the geohashes into latitude and longitude or vice versa. (Example: https://github.com/hkwi/python-geohash)
- day: the value indicates the sequential order and not a particular day of the month
- timestamp: start time of 15-minute intervals in the following format: `<hour>:<minute>`, where hour ranges from 0 to 23 and minute is one of (0, 15, 30, 45)
- demand: aggregated demand normalised to be in the range [0,1]

### Train the LSTM model

#### Import python library

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# plot matplotlib graph
%matplotlib inline

# Import metrics and data-splitting helpers from scikit-learn, and model-building classes from Keras:
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Flatten, LSTM, BatchNormalization
from tensorflow.keras import regularizers
from sklearn.model_selection import train_test_split

import h5py
import sklearn.metrics as metrics
from tensorflow.keras.callbacks import ModelCheckpoint,CSVLogger
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import LSTM
from tensorflow.keras.utils import plot_model 

import pickle
import joblib


In [None]:
import os

# Restrict CUDA-based libraries to GPU device index 0.
os.environ.update(CUDA_VISIBLE_DEVICES="0")

### Define Variables

In [None]:
# Run configuration: file-name prefix for saved artifacts and the training
# hyper-parameters passed to model.fit().
modelname = 'LSTM4'
batch_size = 48
no_of_epoch = 5

# Seed NumPy's global RNG so NumPy-based randomness is repeatable.
# NOTE(review): only NumPy is seeded here; TensorFlow's own RNG is not.
seed = 7
np.random.seed(seed)


### Define Function

In [None]:
def createModel(input_shape=None):
    """Build and compile the stacked-LSTM regressor.

    Args:
        input_shape: ``(timesteps, features)`` of a single input sample.
            When omitted, the shape is read from the notebook-level
            ``X_train1`` array (``X_train1.shape[1]``, ``X_train1.shape[2]``),
            so existing ``createModel()`` calls keep working.

    Returns:
        A compiled Keras ``Model`` made of six LSTM layers, each followed by
        batch normalisation, and a single sigmoid output unit. It is compiled
        with MSE loss, the Adam optimizer and MSE/MAE metrics.
    """
    if input_shape is None:
        # Backward-compatible default: fall back to the global training array.
        input_shape = (X_train1.shape[1], X_train1.shape[2])

    # (units, recurrent_dropout) for each LSTM layer, in order.
    # Input dropout is 0.5 on every layer.
    lstm_stack = [(32, 0.2), (64, 0.2), (64, 0.3), (48, 0.4), (32, 0.5), (16, 0.5)]
    last_index = len(lstm_stack) - 1

    inputs = Input(shape=tuple(input_shape))
    y = inputs
    for index, (units, recurrent_dropout) in enumerate(lstm_stack):
        # Every layer but the last returns the full sequence so the next
        # LSTM receives 3-D input; the last one returns only its final state.
        y = LSTM(
            units,
            return_sequences=index < last_index,
            dropout=0.5,
            recurrent_dropout=recurrent_dropout,
        )(y)
        y = BatchNormalization()(y)

    # Sigmoid keeps the prediction in (0, 1).
    y = Dense(1, activation='sigmoid')(y)

    model = Model(inputs=inputs, outputs=y)
    model.compile(loss='mse', optimizer='adam', metrics=['mse', 'mae'])
    return model

#### Reading from Preprocessed dataset

In [None]:
# Load the preprocessed table from the 'TrafficMgmt' sheet.
# The workbook is opened in a `with` block so the file handle is closed as
# soon as the sheet has been parsed.
with pd.ExcelFile('data/Dataset_feature1.xlsx') as xls:
    data = pd.read_excel(xls, sheet_name='TrafficMgmt')
data

In [None]:
# Name of the target column.
outcome_var = 'demand'

# Columns treated as predictors; they are removed from the table when the
# target frame is built.
predictor_var = [
    'day',
    'hour',
    'min',
    'dow',
    'lat',
    'long',
    'geo_labelencoded',
]

#### Prepare training & test data
- Since this is a time-series dataset, an LSTM is a natural model to try.
- Each sample is a window of 20 consecutive rows of the feature table.

In [None]:
# Model inputs: every column of the table except the target.
data_X = data.drop(columns=outcome_var)

# Target frame: whatever is left after removing the predictor columns.
# NOTE(review): any column not listed in predictor_var also stays in y --
# confirm the sheet contains no extra columns.
y = data.drop(columns=predictor_var)

In [None]:
# For every row n, collect the following `window_len` rows (n+1 .. n+window_len)
# of the feature table. Rows shifted past the end of the table are filled
# with -1.
window_len = 20
shifted = [
    data_X.shift(-(step + 1)).fillna(-1).values
    for step in range(window_len)
]

# Stack along a new axis 1 to get (samples, timesteps, features).
# A plain reshape() of the (timesteps, samples, features) array must not be
# used here: it only reinterprets the flat buffer and mixes values from
# unrelated rows into each sample, whereas stacking on axis 1 keeps every
# sample's window intact.
X = np.stack(shifted, axis=1)
X.shape

In [None]:
# Display the input array for a visual sanity check.
X

In [None]:
# Shape of the target frame (rows, columns).
y.shape

In [None]:
# Shape of the input array; its first dimension should match y's row count.
X.shape

In [None]:
# Number of non-null values in each column of the target frame.
y.count()

#### Train-test-val dataset split

In [None]:
# Two-stage shuffled split with identical settings: first hold out 20% of
# the samples for testing, then hold out 20% of what remains for validation.
split_options = dict(test_size=0.2, random_state=42, shuffle=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, **split_options)

In [None]:
# Length of axis 1 of the training inputs (used as the first Input dimension).
X_train.shape[1]

In [None]:
# Length of axis 2 of the training inputs (used as the second Input dimension).
X_train.shape[2]

In [None]:
# Convert every split to a NumPy array (inputs and targets paired per split).
X_train1, y_train1 = np.asarray(X_train), np.asarray(y_train)
X_val1, y_val1 = np.asarray(X_val), np.asarray(y_val)
X_test1, y_test1 = np.asarray(X_test), np.asarray(y_test)

<h2><center>Create Model and test</center></h2>

<h3>LSTM</h3>

In [None]:
# Build the compiled network and print its layer-by-layer summary.
model = createModel()
model.summary()

In [None]:
# NOTE(review): createModel() already compiles the model with these same
# loss/optimizer/metrics settings, so this second compile looks redundant.
model.compile(loss='mse',optimizer='adam', metrics=['mse', 'mae'])

In [None]:
# Training callbacks:
#   * ModelCheckpoint saves the model to <modelname>.hdf5, keeping only the
#     best epoch as judged by the monitored quantity.
#   * CSVLogger appends the per-epoch metrics to <modelname>.csv.
# 'monitor' and 'mode' must agree: an accuracy-style metric such as 'val_acc'
# needs mode='max', while a loss such as 'val_loss' needs mode='min'.
# NOTE(review): newer Keras releases may reject the '.hdf5' suffix -- confirm
# against the installed version.
filepath = f"{modelname}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=0,
)
csv_logger = CSVLogger(f"{modelname}.csv")
callbacks_list = [checkpoint, csv_logger]

In [None]:
# Train the network. Each epoch is scored on the validation split, and the
# callbacks checkpoint the model and log the metrics to CSV.
model.fit(
    x=X_train1,
    y=y_train1,
    batch_size=batch_size,
    epochs=no_of_epoch,
    shuffle=True,
    validation_data=(X_val1, y_val1),
    callbacks=callbacks_list,
)

### Validation dataset result

In [None]:
# Run the trained model on the validation inputs.
predicts = model.predict(X_val1)

In [None]:
# Mean squared error between validation targets and predictions.
print(mean_squared_error(y_val1,predicts))

### Test dataset result

In [None]:
# Score the held-out test split: predict, then report the mean squared error.
predicted_value = model.predict(X_test1)
print(mean_squared_error(y_true=y_test1, y_pred=predicted_value))

In [None]:
# Plot the per-epoch validation curves recorded by CSVLogger.
records = pd.read_csv(f"{modelname}.csv")
tick_positions = [0.00, 0.01, 0.05, 0.1]

plt.figure()

# Top panel: validation loss together with validation MSE.
# NOTE(review): the loss is 'mse', so these two curves presumably coincide.
plt.subplot(2, 1, 1)
for column in ('val_loss', 'val_mse'):
    plt.plot(records[column])
plt.yticks(tick_positions)
plt.title('MSE', fontsize=12)

# Hide the x tick labels on the top panel.
ax = plt.gca()
ax.set_xticklabels([])

# Bottom panel: validation loss together with validation MAE.
plt.subplot(2, 1, 2)
for column in ('val_loss', 'val_mae'):
    plt.plot(records[column])
plt.yticks(tick_positions)
plt.title('MAE', fontsize=12)

plt.show()


In [None]:
# Load your own trained model.
# MODEL_PATH points at the checkpoint file written by ModelCheckpoint during
# training (<modelname>.hdf5); change it to load a different saved model.
# compile=False loads the model without its training configuration, which is
# enough for running predictions.
MODEL_PATH = modelname + ".hdf5"
model = load_model(MODEL_PATH, compile=False)