# This notebook implements a Random Forest emulator for the data from StationSim
The data must be located in the data folder.


In [1]:
#load the required packages

import os
import numpy as np
import pandas as pd

np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})
import matplotlib.pyplot as plt
#from gp_emulator import GaussianProcess
from matplotlib.lines import Line2D

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler

from sys import path
path.append('..')



In [2]:
# evaluate the trained model
def evaluate(dats, model, scaler, predictsteps, lookbacksteps):
    """Predict on the held-out inputs and pair the result with the targets.

    ``dats`` is indexed as ``(train_X, train_y, test_X, test_y)``; only the
    two test entries (positions 2 and 3) are read. ``scaler``,
    ``predictsteps`` and ``lookbacksteps`` are accepted so existing callers
    keep working, but they are not used: no inverse scaling is applied and
    the values are returned exactly as the model produced them.

    Returns a ``(predictions, references)`` tuple.
    """
    held_out_inputs, held_out_targets = dats[2], dats[3]
    return (model.predict(held_out_inputs), held_out_targets)


In [3]:
# Prepare the training data for the prediction model
def _build_windows(data, times, sensor, pred, lb):
    """Collect (look-back window, future target) pairs from ``data``.

    For every start row ``j`` the window is rows ``j .. j + lb - 1`` of
    columns ``2:`` and the target is column ``sensor + 2`` of row
    ``j + pred + lb - 1``.
    """
    offset = int(pred + lb - 1)
    windows = []
    targets = []
    for j in range(len(data) - offset):
        # Skip rule kept unchanged: drop the pair when the time value at the
        # window start is <= lb or the one at the target row is < lb.
        # NOTE(review): presumably this removes windows at the start of a
        # simulation run -- confirm the intended threshold.
        if (times[j] <= lb) or (times[j + offset] < lb):
            continue
        windows.append(data[j:j + lb, 2:])
        targets.append(data[j + offset, sensor + 2])
    return windows, targets


def _stack_windows(windows, split):
    """Stack windows into a 2-D ``(n_windows, lb * n_columns)`` array."""
    if not windows:
        raise ValueError("prepare_data: no usable windows in the %s data" % split)
    # reshape (rather than squeeze) keeps the sample axis even when there is
    # a single window, and flattens the look-back axis when lb > 1, so the
    # scaler always receives a 2-D array.
    return np.array(windows).reshape(len(windows), -1)


# Prepare the training data for the prediction model
def prepare_data(datatrain, datatest, time_train, time_test, sensor=1, pred=1, lb=1):
    """Build scaled feature matrices and raw targets for one output column.

    Parameters
    ----------
    datatrain, datatest : array-like, 2-D
        Row-per-time-step data. Columns ``2:`` are used as features and
        column ``sensor + 2`` as the target. NaNs are replaced by zero.
    time_train, time_test : sequence
        One time value per row of the matching data array; read by position.
    sensor : int
        Target selector; the target column is ``sensor + 2`` (so ``-2``
        selects column 0).
    pred : int
        Prediction horizon in rows.
    lb : int
        Number of look-back rows per feature window; also the threshold used
        by the window skip rule.

    Returns
    -------
    tuple
        ``(train_X, train_y, test_X, test_y, scaler)`` where the ``X`` arrays
        are min-max scaled with a ``MinMaxScaler`` fitted on ``train_X`` only
        and the ``y`` arrays are left unscaled.

    Raises
    ------
    ValueError
        If no window survives the skip rule for either split.
    """
    # Only the training array is cast to float32, as in the original code.
    train_dat = np.nan_to_num(np.array(datatrain)).astype('float32')
    test_dat = np.nan_to_num(np.array(datatest))

    # Positional access: a pandas Series would otherwise be indexed by label.
    train_times = np.asarray(time_train)
    test_times = np.asarray(time_test)

    train_windows, train_targets = _build_windows(train_dat, train_times, sensor, pred, lb)
    test_windows, test_targets = _build_windows(test_dat, test_times, sensor, pred, lb)

    train_X = _stack_windows(train_windows, "training")
    train_y = np.array(train_targets)
    test_X = _stack_windows(test_windows, "test")
    test_y = np.array(test_targets)

    # Fit the scaler on the training features only, then apply it to the test
    # features. (The earlier scaling of the raw arrays was discarded
    # immediately in the original and has been removed.)
    scaler = MinMaxScaler(feature_range=(0, 1))
    train_X = scaler.fit_transform(train_X)
    test_X = scaler.transform(test_X)

    return (train_X, train_y, test_X, test_y, scaler)

In [4]:
#now this is the function to train the Random Forest
def train_rf(train_X, train_y, test_X, test_y, scaler, pred, lb):
    """Fit a random forest on the training split and score the test split.

    A ``RandomForestRegressor`` with 10 trees and ``random_state=123`` is
    fitted on ``(train_X, train_y)``. The fitted model, the four arrays,
    ``scaler``, ``pred`` and ``lb`` are then handed to ``evaluate``.

    Returns the ``(predictions, references)`` pair produced by ``evaluate``.
    """
    forest = RandomForestRegressor(random_state=123, n_estimators=10)
    forest.fit(train_X, train_y)

    splits = (train_X, train_y, test_X, test_y)
    predictions, references = evaluate(splits, forest, scaler, pred, lb)

    return (predictions, references)


## First, let's try the case where we know the demand (number of pedestrians) in advance

### Scenario 1: If we know the number of pedestrians in advance, the uncertainty only comes from the fact that StationSim is stochastic


In [7]:
#we make a loop to predict each value in the testing data
# Scenario 1: for each population, train on that population's data only and
# predict every target column at several horizons, then save predictions and
# reference values as CSV files.

output_dir = '../outputs/RF'
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

lookback = 1  # look-back window (rows) used for every model in this cell

# Targets in output-column order: sensors 0-9, then -2, which prepare_data
# maps to data column 0 (the delay, per the original comment).
targets = list(range(10)) + [-2]

# NOTE(review): only population 2 is processed here, while the metrics cell
# reads results for populations 1-5; use range(1, 6) to regenerate them all.
for k in [2]:
    print("Test data: ", k)
    df_train = pd.read_csv('../data/raw/df_pop_'+str(k)+'00_v6.csv')
    time_train = df_train['# Time']
    df_train = df_train.drop(['# Time'], axis=1)
    df_test = pd.read_csv('../data/validate/df_pop_'+str(k)+'00_test_v6.csv')
    time_test = df_test['# Time']
    df_test = df_test.drop(['# Time'], axis=1)
    datatrain = df_train.values
    datatest = df_test.values
    #predict each step ahead
    for m in [5,10,15]:
        print("Step ahead: ", m)
        test_values = []
        pred_values = []
        for sensor in targets:
            # One independent model per target column.
            train_X, train_y, test_X, test_y, scaler = prepare_data(
                datatrain, datatest, time_train, time_test, sensor, m, lookback)
            test_rescpred, test_rescref = train_rf(
                train_X, train_y, test_X, test_y, scaler, m, lookback)
            test_values.append(test_rescref)
            pred_values.append(test_rescpred)

        # One row per target so far; transpose so each target is a column
        # (columns 0-10) before saving.
        test_values = pd.DataFrame(test_values).T
        pred_values = pd.DataFrame(pred_values).T

        pred_values.to_csv(output_dir+"/Test_pop"+str(k)+"00_predict_SH"+str(m)+"_RF.csv")
        test_values.to_csv(output_dir+"/Test_pop"+str(k)+"00_real_SH"+str(m)+"_RF.csv")
        

Test data:  2
Step ahead:  5
Step ahead:  10
Step ahead:  15


In [8]:
#Calculate MAE and RMSE
#Each population training and testing RF
from math import sqrt

# One output column per prediction horizon (5, 10, 15 steps ahead);
# one output row per population (1-5).
columns = [0, 1, 2]
mae_rows = []
rmse_rows = []
for k in range(1,6):
    mae = []
    rmse = []
    for m in [5,10,15]:
        predicts = pd.read_csv("../outputs/RF/Test_pop"+str(k)+"00_predict_SH"+str(m)+"_RF.csv")
        reals = pd.read_csv("../outputs/RF/Test_pop"+str(k)+"00_real_SH"+str(m)+"_RF.csv")
        # Column '10' is the last series written by the prediction cell
        # (the delay target, appended after the ten sensors).
        y_true = reals['10'].values
        y_pred = predicts['10'].values
        mae.append(mean_absolute_error(y_true, y_pred))
        rmse.append(sqrt(mean_squared_error(y_true, y_pred)))
    mae_rows.append(mae)
    rmse_rows.append(rmse)

# Build each table once instead of concatenating onto an empty DataFrame in
# the loop (deprecated dtype handling in recent pandas, and quadratic).
maes = pd.DataFrame(mae_rows, columns=columns)
rmses = pd.DataFrame(rmse_rows, columns=columns)
print(maes)
print(rmses)

           0          1          2
0  18.210421  20.815306  21.021167
1  22.093419  20.716777  21.530060
2  23.053329  23.588711  23.833653
3  21.374608  23.029229  25.363711
4  24.055510  26.323399  29.512223
           0          1          2
0  24.989606  28.546197  29.112638
1  29.396776  27.784768  28.813802
2  30.361993  29.893897  30.447684
3  29.718542  31.551706  36.165958
4  33.605209  37.691516  46.217295


### Scenario 2: We don't know the number of pedestrians in advance, so we train the model on all the data that we have and then, in the testing phase, check whether it can generalise. The uncertainty now also comes from the fact that the number of pedestrians is unknown to the model.

In [9]:
##### train and test on all dataset
# Scenario 2: train on the concatenated data of all five populations, then
# test on each population's validation file separately.

output_dir = '../outputs/RF'
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

lookback = 1  # look-back window (rows) used for every model in this cell

# Targets in output-column order: sensors 0-9, then -2, which prepare_data
# maps to data column 0 (the delay, per the original comment).
targets = list(range(10)) + [-2]

train_files = ['../data/raw/df_pop_'+str(i)+'00_v5.csv' for i in range(1,6)]
df_train = pd.concat([pd.read_csv(f) for f in train_files], ignore_index = True)

time_train = df_train['# Time']
df_train = df_train.drop(['# Time'], axis=1)
datatrain = df_train.values

for k in range(1,6):
    print("Test data: ", k)
    df_test = pd.read_csv('../data/validate/df_pop_'+str(k)+'00_test_v5.csv')
    time_test = df_test['# Time']
    df_test = df_test.drop(['# Time'], axis=1)
    datatest = df_test.values

    for m in [5,10,15]:
        print("Step ahead: ", m)
        test_values = []
        pred_values = []
        for sensor in targets:
            # One independent model per target column.
            train_X, train_y, test_X, test_y, scaler = prepare_data(
                datatrain, datatest, time_train, time_test, sensor, m, lookback)
            test_rescpred, test_rescref = train_rf(
                train_X, train_y, test_X, test_y, scaler, m, lookback)
            test_values.append(test_rescref)
            pred_values.append(test_rescpred)

        # One row per target so far; transpose so each target is a column
        # (columns 0-10) before saving.
        test_values = pd.DataFrame(test_values).T
        pred_values = pd.DataFrame(pred_values).T

        pred_values.to_csv(output_dir+"/All_pop"+str(k)+"00_predict_SH"+str(m)+"_RF.csv")
        test_values.to_csv(output_dir+"/All_pop"+str(k)+"00_real_SH"+str(m)+"_RF.csv")
     

Test data:  1
Step ahead:  5
Step ahead:  10
Step ahead:  15
Test data:  2
Step ahead:  5
Step ahead:  10
Step ahead:  15
Test data:  3
Step ahead:  5
Step ahead:  10
Step ahead:  15
Test data:  4
Step ahead:  5
Step ahead:  10
Step ahead:  15
Test data:  5
Step ahead:  5
Step ahead:  10
Step ahead:  15


In [10]:
#Calculate MAE and RMSE
#Model trained on all populations, tested on each population separately
from math import sqrt

# One output column per prediction horizon (5, 10, 15 steps ahead);
# one output row per test population (1-5).
columns = [0, 1, 2]
mae_rows = []
rmse_rows = []
for k in range(1,6):
    mae = []
    rmse = []
    for m in [5,10,15]:
        predicts = pd.read_csv("../outputs/RF/All_pop"+str(k)+"00_predict_SH"+str(m)+"_RF.csv")
        reals = pd.read_csv("../outputs/RF/All_pop"+str(k)+"00_real_SH"+str(m)+"_RF.csv")
        # Column '10' is the last series written by the prediction cell
        # (the delay target, appended after the ten sensors).
        y_true = reals['10'].values
        y_pred = predicts['10'].values
        mae.append(mean_absolute_error(y_true, y_pred))
        rmse.append(sqrt(mean_squared_error(y_true, y_pred)))
    mae_rows.append(mae)
    rmse_rows.append(rmse)

# Build each table once instead of concatenating onto an empty DataFrame in
# the loop (deprecated dtype handling in recent pandas, and quadratic).
maes = pd.DataFrame(mae_rows, columns=columns)
rmses = pd.DataFrame(rmse_rows, columns=columns)
print(maes)
print(rmses)

           0          1          2
0  23.575534  23.497798  25.136978
1  29.127502  29.586836  30.426127
2  26.679158  25.999978  26.695012
3  21.249163  23.607179  24.948411
4  32.303084  35.378768  32.846348
           0          1          2
0  31.524104  31.554889  34.404714
1  38.183606  38.712708  39.376664
2  35.259028  33.784200  35.356813
3  29.019660  32.052200  34.243569
4  42.671798  45.580251  43.547064
