In [None]:
import numpy as np
import pandas as pd
import json
from keras.models import model_from_json
from keras.models import Sequential
from keras.layers.convolutional import Conv3D
from keras.layers.convolutional_recurrent import ConvLSTM2D
from keras.layers.normalization import BatchNormalization

# Running this code requires the following files and folders:
#     1. "./test.csv"
#     2. "./coords_mapping.csv"
#     3. "./All_Trained_Models/Result_Store_Sorted.json"
#     4. "./Top_20_Models/"     *with the top 20 models inside this folder

In [None]:
# Read the two CSV inputs in order: first the test observations, then the
# table that maps each geohash onto the model's input grid.
df_test, coords_df = (
    pd.read_csv(csv_path)
    for csv_path in ("./test.csv", "./coords_mapping.csv")
)

In [None]:
# Pre-process the data into time-series form first.
# A day is one full cycle of 24 hrs x 4 (15-minute buckets) = 96 time slots.
df = df_test

# Reformat timestamp: turn every distinct "H:M" string into its slot number
# within the day. The split and the arithmetic are vectorised instead of
# building one pd.Series per row through a Python lambda.
ts_df = pd.DataFrame({'timestamp': df['timestamp'].unique()})
hm_parts = ts_df['timestamp'].str.split(":", expand=True)
ts_df['h'] = hm_parts[0].astype(int) * 4    # hours   -> number of 15-minute slots
ts_df['m'] = hm_parts[1].astype(int) / 15   # minutes -> slot offset inside the hour
ts_df['new_ts'] = ts_df['h'] + ts_df['m']
df = df.merge(ts_df.drop(['h', 'm'], axis=1), on='timestamp', how='inner')

# We apply coords_df from csv to ensure the geohash to input matrix mapping is correct.
# NOTE(review): this is a left merge, so a geohash missing from coords_df keeps
# its row with NaN lat_index/long_index -- confirm every test geohash is mapped.
df = df.merge(
    coords_df.drop(columns=['lat', 'long', 'lat_err', 'long_err']),
    on='geohash6',
    how='left',
)
df = df.drop(columns=['geohash6', 'timestamp'])

# Merge 'day' with the intra-day slot to generate a continuous time-series,
# shift it so the earliest observation is frame 0, and store it as integers.
df['time-series'] = (df['day'] - 1) * 24 * 4 + df['new_ts']
df['time-series'] = df['time-series'] - df['time-series'].min()
df['time-series'] = df['time-series'].astype(int)

# Drop unnecessary columns and sort by time and longitude
df = df.drop(columns=['day', 'new_ts'])
df = df.sort_values(by=['time-series', 'long_index'], ascending=True)

In [None]:
# Convert the data into a matrix of shape
# [n_samples, time-series, longitude, latitude, depth] to feed into the ConvLSTM.
n_samples = 1
n_frames = int(df['time-series'].max()) + 1
long = int(df['long_index'].max()) + 1
lat = int(df['lat_index'].max()) + 1

# The builtin float/int replace the np.float/np.int aliases removed in NumPy 1.24.
input_matrix = np.zeros((n_samples, n_frames, long, lat, 1), dtype=float)

# One zero-demand row per latitude index; concatenated to every longitude slice
# so that latitudes without an observation still appear (with demand 0).
# Sized from `lat` so it always matches the last-but-one axis of input_matrix.
a = np.zeros((lat, 4), dtype=int)
a[:, 1] = np.arange(lat)
df_zeros = pd.DataFrame({'demand':a[:,0],'lat_index':a[:,1],'long_index':a[:,2],'time-series':a[:,3]})

# time_val is the value from 'time-series', which is also the frame index
time_grouped = df.groupby('time-series')
for time_val, time_dfs in time_grouped:
    long_group = time_dfs.groupby('long_index')
    print('Mapping Data into 2D: {} of {}.'.format(time_val+1,n_frames), end='\r')
    # long_val is the value from 'long_index', which is also the row index
    for long_val, long_dfs in long_group:
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        # Taking the max per latitude keeps the observed demand where there is
        # one and falls back to the zero filler otherwise.
        demand_by_lat = (
            pd.concat([long_dfs, df_zeros])
            .groupby('lat_index')['demand']
            .max()
        )
        # Group keys can come back as floats (e.g. when the merge introduced
        # NaNs into the index columns), so cast them before indexing.
        input_matrix[n_samples-1, int(time_val), int(long_val), :, 0] = demand_by_lat.values

# Prepare the input and output matrix set for training:
# the target is the input shifted forward by one frame.
input_set = input_matrix[:,:-1,:,:,:]
target_set = input_matrix[:,1:,:,:,:]

# Evaluate the RMSE of all the Top 20 models on Test Data rigorously

In [None]:
def get_rmse(T5_actual, T5_pred):
    """Return the root-mean-square error between two equally shaped arrays.

    Args:
        T5_actual: observed values.
        T5_pred: predicted values, broadcastable against ``T5_actual``.

    Returns:
        The square root of the mean squared element-wise difference.
    """
    residual = T5_actual - T5_pred
    mean_squared_error = np.mean(np.square(residual))
    return np.sqrt(mean_squared_error)

In [None]:
# Load the ranking of all trained models; the context manager closes the file
# as soon as it has been parsed.
with open("./All_Trained_Models/Result_Store_Sorted.json", 'r') as file:
    sorted_model = json.load(file)

T_shift = [0,48,96,144,187]   # 0 - 187, shifts the test data set
Day_Feed = [1,3,7,10,12] #Predict T+5 from x days
Overall_min = 1  # not read in this cell; kept in case another cell relies on it

Result_Store = {}

for i in range(20):
    i = i + 1
    model_name = sorted_model[str(i)]['Model']

    # Rebuild the architecture from its JSON description, then restore the weights
    with open("./Top_20_Models/{}_model.json".format(model_name), 'r') as json_file:
        loaded_model = model_from_json(json_file.read())
    loaded_model.load_weights("./Top_20_Models/{}_model.h5".format(model_name))

    model_rmses = []      # every RMSE of this model, over all day/shift combinations
    baseline_rmses = []   # matching RMSEs of the "repeat the last frame" baseline

    Day_all_store = {}
    for day in Day_Feed:
        day_feed = day*24*4   # number of 15-minute frames fed to the model
        shift_rmses = []

        for shift in T_shift:
            # NOTE(review): assumes input_matrix holds at least
            # shift + day_feed + 5 frames; with fewer, both slices are silently
            # truncated and T5_actual no longer lies after the fed window.
            input_set = input_matrix[:, shift : shift + day_feed,:,:,:]
            target_set = input_matrix[:, shift : shift + day_feed + 5,:,:,:]
            train_14_day = input_set[0]

            # Roll the model forward 5 steps, feeding each prediction back in
            for k in range(5):
                new_pred = loaded_model.predict(train_14_day[np.newaxis])
                new = new_pred[:, -1]
                train_14_day = np.concatenate((train_14_day, new), axis=0)

            T5_pred = train_14_day[-5:]
            T5_actual = target_set[0][-5:]
            rmse = get_rmse(T5_actual, T5_pred)

            # Baseline: the last observed frame (index -6 once the 5
            # predictions are appended) repeated for all 5 future steps.
            T5_baseline = np.repeat(train_14_day[-6:-5], 5, axis=0)
            baseline_rmse = get_rmse(T5_actual, T5_baseline)

            shift_rmses.append(float(rmse))
            baseline_rmses.append(float(baseline_rmse))

        # Min/Max are taken from the collected values rather than from trackers
        # initialised to 1/0, so a minimum RMSE above 1 is reported correctly.
        Day_all_store[day] = {
            'Avg': sum(shift_rmses)/len(T_shift),
            'Max': max(shift_rmses),
            'Min': min(shift_rmses),
        }
        model_rmses.extend(shift_rmses)

    RMSE_Day_total_avg = sum(model_rmses)/(len(T_shift)*len(Day_Feed))
    RMSE_Day_total_baseline_avg = sum(baseline_rmses)/(len(T_shift)*len(Day_Feed))

    # Per-day statistics plus the overall statistics of this model
    Model_store = Day_all_store
    Model_store['Avg'] = RMSE_Day_total_avg
    Model_store['Max'] = max(model_rmses)
    Model_store['Min'] = min(model_rmses)

    Result_Store['{}'.format(model_name)] = Model_store

    print(Result_Store)
    print("Baseline RMSE is: {}".format(RMSE_Day_total_baseline_avg))

# save Result_Store to JSON (the context manager guarantees flush + close)
with open("./Top_20_Models/Result_Score.json", 'w') as result_file:
    json.dump(Result_Store, result_file)

In [None]:
# load Result_Store (file handles are closed deterministically via `with`)
with open("./Top_20_Models/Result_Score.json", 'r') as result_file:
    Result_Store = json.load(result_file)

# Rank the models by ascending average RMSE; rank "1" is the best model.
ranked_models = sorted(Result_Store.items(), key=lambda tup: tup[1]['Avg'])

Result_Store_Sorted = {}
for i, (key, value) in enumerate(ranked_models, start=1):
    Result_Store_Sorted['{}'.format(i)] = {
        'Model': key,
        'Avg': value['Avg'],
        'Max': value['Max'],
        'Min': value['Min'],
    }

with open("./Top_20_Models/Result_Store_Sorted.json", 'w') as sorted_file:
    json.dump(Result_Store_Sorted, sorted_file)

print(Result_Store_Sorted)

# Ensemble prediction from Top Model

In [None]:
# Load the ranking produced from the test-set evaluation
with open("./Top_20_Models/Result_Store_Sorted.json", 'r') as file:
    sorted_model = json.load(file)

# Accumulator for frames T+0..T+5; the grid dimensions are taken from
# input_matrix instead of being hard-coded, so they always match the data.
T5_pred_ensemble = np.zeros((6,) + input_matrix.shape[2:])

# Take Top 1 models only
# Can modify this value to take average of top x Models
# In this case we only use the top model to give the prediction output
Top_Model = 1
for i in range(Top_Model):
    i = i + 1
    model_name = sorted_model[str(i)]['Model']

    # Rebuild the architecture from its JSON description, then restore the weights
    with open("./Top_20_Models/{}_model.json".format(model_name), 'r') as json_file:
        loaded_model = model_from_json(json_file.read())
    loaded_model.load_weights("./Top_20_Models/{}_model.h5".format(model_name))

    # Feed the whole test matrix (single sample) as the history
    input_set = input_matrix[:,:,:,:,:]
    train_14_day = input_set[0]

    # Predicting T+1 to T+5, feeding each prediction back in as the next input
    for k in range(5):
        new_pred = loaded_model.predict(train_14_day[np.newaxis])
        new = new_pred[:, -1]
        train_14_day = np.concatenate((train_14_day, new), axis=0)

    # Last observed frame (T+0) followed by the 5 predicted frames
    T5_pred = train_14_day[-6:]
    T5_pred_ensemble += T5_pred

# Average over the models that contributed to the ensemble
T5_pred_out = np.divide(T5_pred_ensemble,Top_Model)
    

# Reformat Prediction Output into Dataframe and output to CSV

In [None]:
# Keep only the unique geohashes of the test dataset
df_out = pd.DataFrame({'geohash6' : df_test['geohash6'].unique()})

# Left-merge with coords_df: a geohash without a match in the mapping is kept,
# but it has no grid indices and therefore ends up with NaN predictions.
df_out = pd.merge(df_out, coords_df, how = 'left', on = ['geohash6'])

# Grid position of every geohash that could be mapped
has_cell = df_out['long_index'].notna() & df_out['lat_index'].notna()
long_idx = df_out.loc[has_cell, 'long_index'].astype(int).values
lat_idx = df_out.loc[has_cell, 'lat_index'].astype(int).values

# Prediction for T+0 to T+5
n_steps = T5_pred_out.shape[0]
for i in range(n_steps):
    # Drop the trailing depth axis to get the 2D (long, lat) frame
    Matrix = T5_pred_out[i, :, :, 0]
    print('Mapping Data into Matrix: {} of {}.'.format(i, n_steps - 1), end='\r')
    column = 'T+{}'.format(i)
    df_out[column] = np.nan
    # One vectorised lookup replaces the nested groupby + DataFrame.set_value
    # (set_value was removed in pandas 1.0).
    df_out.loc[has_cell, column] = Matrix[long_idx, lat_idx]

df_out = df_out.drop(columns = ['lat','long','lat_err','long_err','lat_index','long_index'])
df_out.to_csv("./prediction_output.csv",index=False)
df_out.head(1)