# Grab-Microsoft Challenge 
## Traffic Management

- geohash6: geohash is a public domain geocoding system which encodes a geographic location into a short string of letters and digits with arbitrary precision. You are free to use any geohash library to encode/decode the geohashes into latitude and longitude or vice versa. (Example: https://github.com/hkwi/python-geohash)
- day: the value indicates the sequential order and not a particular day of the month
- timestamp: start time of 15-minute intervals in the following format: `<hour>:<minute>`, where hour ranges from 0 to 23 and minute is one of (0, 15, 30, 45)
- demand: aggregated demand normalised to be in the range [0,1]
    
## Problem Statements:
- Which areas have high / low traffic demand?
- How does regional traffic demand change according to day / time?
- Forecast the travel demand for the next 15 min / 1 hour and predict areas with high travel demand

### This is for model training and testing

#### Import python library

In [1]:
import numpy as np
from numpy import array
import pandas as pd
import matplotlib.pyplot as plt

# plot matplotlib graph
%matplotlib inline

#Import models from scikit learn module:
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split

import h5py
import sklearn.metrics as metrics
from tensorflow.keras.callbacks import ModelCheckpoint,CSVLogger

import pickle
import joblib
import os

from ModelDefinitions import createModel

In [2]:
# Set the CUDA device-selection variable so that only device "0" is visible to this process.
# NOTE(review): this runs after the tensorflow import above; presumably no GPU
# has been initialised at that point, so the setting still takes effect — confirm.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [3]:
    #***********************************
    # Index 00-10 is for LSTM Model
    # Index 11-20 is for Bi-LSTM Model
    # Index 21-30 is for CNN Model
    # Index 31-40 is for CNN-LSTM Model
    #***********************************

# Sliding-window sizes: rows fed to the model per sample, and
# number of target values produced per sample
n_steps_in = 3
predict_next_no_of_output = 2

# ------ CHANGE THESE ------
index = 32          # model selector passed to createModel (see the index ranges above)
seed = 7
np.random.seed(seed)
basemodelname = f'CNN-LSTM2-{n_steps_in}in{predict_next_no_of_output}out'
batch_size = 4096
no_of_epoch = 30
no_of_train = 0.8   # leading fraction of the samples kept at each split
# --------------------------

# Shared prefix of the files written during training (checkpoint and CSV log)
modelname = f'model/{basemodelname}_{index}'
filepath = f'{modelname}.hdf5'

## Define Functions

In [4]:
# split a multivariate sequence into samples
def split_sequences(sequences, n_steps_in, n_steps_out):
    """Cut a 2-D table into overlapping (input window, target window) samples.

    Every column except the last is treated as an input feature and the last
    column as the target. For a window starting at row ``s``:

    * the input is rows ``s .. s + n_steps_in - 1`` of the feature columns;
    * the target is ``n_steps_out`` values of the last column, beginning at
      the final input row (row ``s + n_steps_in - 1``).

    Args:
        sequences: 2-D array supporting ``[rows, cols]`` slicing.
        n_steps_in: number of rows in each input window.
        n_steps_out: number of target values per sample.

    Returns:
        Tuple ``(X, y)`` of numpy arrays built from the collected windows.
        Both are empty arrays when ``sequences`` is too short for one window.
    """
    total_rows = len(sequences)
    # Rows spanned by one sample; the target starts on the last input row,
    # hence the "- 1" overlap.
    window_span = n_steps_in + n_steps_out - 1
    # A start row is usable while the whole span still fits in the table.
    usable_starts = range(min(total_rows, max(0, total_rows - window_span + 1)))

    inputs = [sequences[s:s + n_steps_in, :-1] for s in usable_starts]
    targets = [
        sequences[s + n_steps_in - 1:s + window_span, -1]
        for s in usable_starts
    ]
    return array(inputs), array(targets)


## Read in Dataset

In [None]:
# Open the workbook and read its 'TrafficMgmt' sheet into a DataFrame
xls = pd.ExcelFile('data/Dataset_Sorted_feature.xlsx')
data = pd.read_excel(xls, sheet_name='TrafficMgmt')
# Bare expression: the notebook shows the DataFrame as the cell output
data

In [None]:
# Names of the feature columns and of the target column
predictor_var = [
    'day',
    'hour',
    'min',
    'dow',
    'lat',
    'long',
]
outcome_var = 'demand'

# Feature count follows from the list above
no_of_features = len(predictor_var)

#### Prepare training & test data

In [None]:
# Turn the table into sliding-window samples (inputs X, targets y)
dataset = data.to_numpy()
X, y = split_sequences(dataset, n_steps_in, predict_next_no_of_output)

# First cut: the leading `no_of_train` share of the samples is kept for
# training/validation; the remainder is the test set (order is preserved)
test_start = int(no_of_train * len(X))
X_train1, X_test = X[:test_start], X[test_start:]
y_train1, y_test = y[:test_start], y[test_start:]

# Second cut: the same ratio divides the kept part into train and validation
val_start = int(no_of_train * len(X_train1))
X_train, X_val = X_train1[:val_start], X_train1[val_start:]
y_train, y_val = y_train1[:val_start], y_train1[val_start:]


In [None]:
# Show the array shapes in the order: train, test, validation (X then y for each)
print(*(part.shape for part in (X_train, y_train, X_test, y_test, X_val, y_val)))

### Main Program

In [None]:
def main():
    """Build the model selected by ``index`` and train it.

    Uses the notebook-level settings and arrays (``X_train``, ``y_train``,
    ``X_val``, ``y_val``, ``index``, ``predict_next_no_of_output``,
    ``filepath``, ``modelname``, ``no_of_epoch``, ``batch_size``).

    During training the model is saved to ``filepath`` whenever the monitored
    ``val_loss`` reaches a new minimum, and the per-epoch results are logged
    to ``modelname + '.csv'``. Nothing is returned.
    """
    #***********************************
    # Index 00-10 is for LSTM Model
    # Index 11-20 is for Bi-LSTM Model
    # Index 21-30 is for CNN Model
    # Index 31-40 is for CNN-LSTM Model
    #***********************************

    # Build the network and print its layer summary
    model = createModel(X_train, predict_next_no_of_output, index)
    model.summary()

    callbacks_list = [
        # Save only when val_loss improves (lower is better)
        ModelCheckpoint(
            filepath,
            monitor='val_loss',
            verbose=0,
            save_best_only=True,
            mode='min',
        ),
        # Log each epoch's results to a CSV file next to the checkpoint
        CSVLogger(modelname + '.csv'),
    ]

    # shuffle=False keeps the samples in their stored order while training
    model.fit(
        X_train,
        y_train,
        validation_data=(X_val, y_val),
        epochs=no_of_epoch,
        batch_size=batch_size,
        shuffle=False,
        callbacks=callbacks_list,
    )

# Run the training routine defined in main()
main()

### Load saved trained model and scaler

In [None]:
# Restore the checkpoint written to `filepath`. compile=False loads the model
# without compiling it; the cells below only call predict() on it.
model = load_model(filepath, compile=False)

# Restore the scaler object saved with joblib
# (presumably fitted during preprocessing — verify against that step)
scaler_filename = "grab_msft_scaler.save"
scaler = joblib.load(scaler_filename)

### Train / validation / test dataset results

In [None]:
# RMSE of the model's predictions on the training split
predicted_value = model.predict(X_train)
rmse = np.sqrt(mean_squared_error(y_true=y_train, y_pred=predicted_value))
print(rmse)

In [None]:
# RMSE of the model's predictions on the validation split
predicted_value = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=predicted_value))
print(rmse)

In [None]:
# RMSE of the model's predictions on the test split
predicted_value = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=predicted_value))
print(rmse)

In [None]:
# Plot the per-epoch history that training logged to CSV
records = pd.read_csv(modelname + '.csv')
plt.figure(figsize=(15, 10))

# One x tick per logged epoch
plt.xticks(records['epoch'])

# Draw the 'mse' and 'val_mse' columns, each labelled with its column name
for column in ('mse', 'val_mse'):
    plt.plot(records[column], label=column)

# NOTE(review): the title mentions MAE but only MSE columns are plotted — confirm intent
plt.title('MSE-MAE', fontsize=12)
plt.legend(loc="upper left", fontsize=15)

plt.show()

### Sample Testing 

In [None]:
# Read the 'TrafficMgmt' sheet of the sample test workbook
xls = pd.ExcelFile('data/test_sorted_sample8000.xlsx')
test_sample = pd.read_excel(xls, sheet_name='TrafficMgmt')

# Scale the predictor columns with the scaler loaded from disk.
# transform() is used instead of fit_transform(): re-fitting here would replace
# the loaded scaler's statistics with ones computed from this sample, so the
# features would be scaled differently from what the saved scaler encodes.
# NOTE(review): assumes the saved scaler was fitted on exactly the
# predictor_var columns — confirm against the preprocessing step.
test_sample[predictor_var] = scaler.transform(test_sample[predictor_var])

In [None]:
# Convert the sample table into input/output windows, using the same
# window sizes as for the training data
test_sample_array = test_sample.to_numpy()
X_sample, y_sample = split_sequences(
    test_sample_array,
    n_steps_in,
    predict_next_no_of_output,
)
print(X_sample.shape, y_sample.shape)

In [None]:
# Predict on the sample windows and print the predictions next to the actual targets
predicted_value = model.predict(X_sample)
report = 'Predicted value : {} \n Actual Value    : {}'.format(predicted_value, y_sample)
print(report)

In [None]:
# RMSE of the predictions on the sample windows
sample_rmse = np.sqrt(mean_squared_error(y_true=y_sample, y_pred=predicted_value))
print(sample_rmse)