# This Notebook is mainly for exploring the data and analysing Model performance

In [None]:
from matplotlib.pyplot import *
import numpy as np
import pandas as pd
import json
from keras.models import model_from_json

%matplotlib inline

In [None]:
# We only do some simple data exploration here: the data appears to come from Grab's data warehouse,
# so little cleaning is needed; it only has to be transformed into the input matrix for the LSTM model.

In [None]:
# Load the raw training data from the CSV next to this notebook
df = pd.read_csv("./training.csv")
# Preview the first row to see which columns are available
df.head(1)

In [None]:
# Print a summary of the frame: column names, dtypes and non-null counts
df.info()

In [None]:
# Checking how many data entries there are in total; shape is (rows, columns)
df.shape

In [None]:
# Checking for null values in the data:
# evaluates to True if at least one cell anywhere in the frame is null
df.isnull().values.any()

In [None]:
# Checking the day range: after an ascending sort on 'day', the first and
# last rows carry the smallest and largest day values
df.sort_values(by='day',ascending=True)

In [None]:
# Checking the data points for each of the geohashes:
# number of rows per distinct geohash6 value
df.geohash6.value_counts()

In [None]:
# Checking the demand range: after an ascending sort on 'demand', the first
# and last rows carry the smallest and largest demand values
df.sort_values(by='demand',ascending=True)

In [None]:
## We can see that all the target demand values are already normalised between 0 and 1
## There are no null values in the dataset
## The given data is from day 1 until day 61
## Data transformation into model input is done in the other Jupyter Notebook
## The following section is used to analyse the model during training and troubleshooting

# Checking the training and validation error on the top 20 models
# to ensure that all the models are training properly

In [None]:
# Loop through the sorted models and plot training vs. validation loss for
# each, so that a model that did not train properly stands out visually.
# Every JSON file is read inside a context manager so its handle is closed
# right after parsing (the original left one handle open per model).
with open("./All_Trained_Models/Result_Store_Sorted.json", 'r') as file:
    sorted_model = json.load(file)

fig = figure()
fig.set_figheight(15)
fig.set_figwidth(15)

# Checking only the top 20 models. The sorted store is keyed by the rank as a
# string starting at "1", and the rank doubles as the 1-based subplot slot.
for k in range(1, 21):
    s = sorted_model[str(k)]['Model']
    with open("./All_Trained_Models/{}_model_history.json".format(s), 'r') as history_file:
        history_dict = json.load(history_file)

    # One panel per model on a 5x4 grid
    subplot(5, 4, k)
    title('({})'.format(s))
    plot(history_dict['loss'])
    plot(history_dict['val_loss'])

# Checking the model output in 2D

In [None]:
def get_rmse(T5_actual, T5_pred):
    """Return the root-mean-square error between actual and predicted values.

    Args:
        T5_actual: Ground-truth values; any array-like (ndarray, list, tuple).
        T5_pred: Predicted values; array-like broadcastable against
            ``T5_actual``.

    Returns:
        The square root of the mean of the squared element-wise differences,
        as a NumPy float.
    """
    # Coerce to float arrays so plain Python sequences are accepted and so
    # unsigned-integer inputs cannot wrap around during the subtraction.
    actual = np.asarray(T5_actual, dtype=float)
    predicted = np.asarray(T5_pred, dtype=float)
    return np.sqrt(np.mean((actual - predicted) ** 2))

In [None]:
# Load the best-ranked model, roll it forward auto-regressively for a few
# timesteps past the input window, plot prediction vs. actual side by side
# and report the RMSE over the predicted steps.
input_matrix = np.load('./test_matrix.npy')

T_shift = 0         # Amount to shift the time-series input data
Day_Feed = 13       # Predict T+5 from x days
day_feed = Day_Feed*24*4   # Input window length in timesteps (24*4 per day)

PRED_STEPS = 5             # Number of timesteps predicted past the input window
GRID_SHAPE = (36, 46)      # 2D grid each timestep is reshaped to for plotting

# Get the sorted model info; the context manager closes the file after
# parsing (the original never closed this handle).
with open("./All_Trained_Models/Result_Store_Sorted.json", 'r') as file:
    sorted_model = json.load(file)

# Checking the best model (rank "1" in the sorted store)
s = sorted_model[str(1)]['Model']

# load json and create model
with open("./All_Trained_Models/{}_model.json".format(s), 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("./All_Trained_Models/{}_model.h5".format(s))

# The target window is the input window extended by the steps to predict
input_set = input_matrix[:, T_shift:T_shift + day_feed, :, :, :]
target_set = input_matrix[:, T_shift:T_shift + day_feed + PRED_STEPS, :, :, :]
pred_out = input_set[0]

# Make predictions using the model: each pass feeds the whole sequence so far
# (with a leading batch axis), keeps only the last predicted timestep and
# appends it to the sequence, so later steps build on earlier predictions.
for _ in range(PRED_STEPS):
    new_pred = loaded_model.predict(pred_out[np.newaxis, ...])
    new = new_pred[:, -1, ...]
    pred_out = np.concatenate((pred_out, new), axis=0)

T5_pred = pred_out[-PRED_STEPS:]
T5_actual = target_set[0][-PRED_STEPS:]

# Plot the prediction output and show the target output:
# one row per predicted step, prediction on the left, actual on the right
fig, axs = subplots(nrows=PRED_STEPS, ncols=2, figsize=(15, 30),
    subplot_kw={'xlabel': 'latitude', 'ylabel': 'longitude'})

for index, (pred_ax, actual_ax) in enumerate(axs):
    pred_ax.imshow(np.reshape(T5_pred[index], GRID_SHAPE))
    pred_ax.set_title('Prediction (T+{})'.format(index + 1))

    actual_ax.imshow(np.reshape(T5_actual[index], GRID_SHAPE))
    actual_ax.set_title('Actual (T+{})'.format(index + 1))

print('Showing Predictions vs Actual for Model {}'.format(s))

rmse = get_rmse(T5_actual, T5_pred)

print('RMSE for {} model is {}'.format(s, rmse))

# Checking the numerical value of the model output

In [None]:
# Build a table with one row per unique test geohash and one column per
# inspected timestep, holding the value of pred_out at that geohash's grid cell.
#
# The last 5 values of pred_out are always prediction values from the model.
# NOTE(review): the loop below reads pred_out[0] .. pred_out[T_plus_check - 1],
# i.e. the START of pred_out, whereas "check end of how many timesteps"
# suggests the tail (which contains the predicted steps) was intended. If so,
# index with pred_out[i - T_plus_check] instead -- confirm the intent.
T_plus_check = 15 # check end of how many timesteps
df_test = pd.read_csv("./test.csv")
coords_df = pd.read_csv("./coords_mapping.csv")

# Removing time series and keep only the unique geohashes
df_out = pd.DataFrame({'geohash6': df_test['geohash6'].unique()})
# Left-merge the test geohashes with coords_df: a geohash without a match is
# kept, with NaN in the coordinate/index columns
df_out = pd.merge(df_out, coords_df, how='left', on=['geohash6'])

# Resolve every row's grid cell once, outside the timestep loop. Rows without
# a coordinate match are masked out and keep NaN in the output columns.
has_cell = (df_out['long_index'].notna() & df_out['lat_index'].notna()).to_numpy()
long_idx = df_out.loc[has_cell, 'long_index'].astype(int).to_numpy()
lat_idx = df_out.loc[has_cell, 'lat_index'].astype(int).to_numpy()

#Prediction for T+0 to T_plus_check
for i in range(T_plus_check):

    Matrix = np.reshape(pred_out[i, :, :, :], (36, 46))
    print('Mapping Prediction Output into DataFrame: {} of {}.'.format(i,T_plus_check), end='\r')
    # A single integer-array lookup replaces the nested groupby +
    # DataFrame.set_value calls (set_value was removed in pandas 1.0).
    # Matrix is indexed as [long_index, lat_index].
    column = np.full(len(df_out), np.nan)
    column[has_cell] = Matrix[long_idx, lat_idx]
    df_out['T+{}'.format(i)] = column
# Keep only the geohash and the per-timestep value columns
df_out = df_out.drop(columns=['lat', 'long', 'lat_err', 'long_err', 'lat_index', 'long_index'])
df_out.head(40)