In [22]:
# import libraries
library(rminer)
library(rjson)
library(writexl)

# import functions
source("functions.R")

In [2]:
# 0 to enable warnings
# -1 to suppress
options(warn = -1) 

In [3]:
df <- read.csv("../data/sanitizedData.csv", header = TRUE, sep=",")
df <- df[3:length(df)]
head(df)

Unnamed: 0_level_0,all,female,male,young,adult,weather,maxtemp,RH,maxwind,day
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,2332,1115,1108,1122,1210,0,13,87,0,2
2,2801,1217,1459,1239,1562,0,14,94,45,3
3,2375,1168,1099,1059,1316,0,14,82,55,4
4,3447,1617,1651,1606,1841,1,13,78,0,5
5,4823,2469,2117,2318,2505,0,16,81,37,6
6,4978,2564,2223,2289,2689,0,16,73,0,7


## Modeling Function

In [4]:
# get results of a single case scenario
getResults <- function(df, models, del_cols, target_form, time_lags=FALSE, window="none", NPRED=7) {
    df <- df[,del_cols]
    lags = 0
    if (time_lags) {
        lags = c(1:3, 5:7)
        D=CasesSeries(df[,1],lags)
        df[8:nrow(df),c("lag7","lag6", "lag5", "lag3","lag2","lag1")] <- D
        df <- df[-c(0:7),]
    }

    #add <- list()
    # save all results in vectors to build a df
    model_name <- c()
    model_MAE <- c()
    model_NMAE <- c()
    model_RMSE <- c()
    model_RRSE <- c()
    model_time <- c()
    
    for(i in 1:length(models)){
        
        if (models[[i]]$rminer_best_search$mparheuristic) {
            s=list(search=mparheuristic(models[[i]]$rminer,models[[i]]$rminer_best_search$algorithm),method=c("holdoutorder",2/3))
        } else {
            s=models[[i]]$rminer_best_search$algorithm
        }
        
        # training time
        time=0
        if (window == "none") {
            HD=holdout(df[,1],ratio=NPRED,mode="order")
            start_time = Sys.time()
            M=fit(target_form, df[HD$tr,], model=models[[i]]$rminer, search=s)
            end_time = Sys.time()
            time <- as.numeric(difftime(end_time, start_time))
            if (time_lags) {
                #------------------------------------
                predArr <- c()
                count <- 7
                for(m in 1:length(HD$ts)){
                    xArr <- df[HD$ts - count, ][1:count, 1]
                    lag <- append(xArr, predArr)
                    lag <- append(lag, NaN)
                    D2 <- CasesSeries(lag,lags)
                    y <- cbind(df[HD$ts[[m]],(1:(length(df)-length(lags)))], D2[1:length(lags)])
                    PRED <- predict(M,y)
                    predArr <- append(predArr, PRED)
                    count <- count - 1
                }
                #----------------------------------
            } else {
                predArr=predict(M,df[HD$ts,])
            }
            Y=df[HD$ts,1]
            MAE = mmetric(Y,predArr,metric="MAE")
            NMAE = mmetric(Y,predArr,metric="NMAE")
            RMSE = mmetric(Y,predArr,metric="RMSE")
            RRSE = mmetric(Y,predArr,metric="RRSE")
            
        } else {
            L=nrow(df) # size of the time series, 257 - 7
            K=7 # assumption for the seasonal period: test also acf(d1S)
            Test=K # H, the number of multi-ahead steps, adjust if needed
            S=7 # step jump: set in this case to 7 days
            Runs=7 # number of growing/incremental window iterations, adjust if needed
            
            # forecast:
            W=(L-Test)-(Runs-1)*S # initial training window size for the ts space (forecast methods)
            # rminer:
            W2=W-max(lags) # initial training window size for the D space (CasesSeries, rminer methods)
            YR=diff(range(df)) # global Y range, use the same range for the metrics (ex. MAE) calculation in all iterations
            
            # vectors for saving the results
            arr_MAE=vector(length=Runs)
            arr_NMAE=vector(length=Runs)
            arr_RMSE=vector(length=Runs)
            arr_RRSE=vector(length=Runs)
            
            # rolling/incremental window:
            for(b in 1:Runs)  # cycle of the growing/incremental window training
            {
                # code for rminer package methods
                HD=holdout(df[,1],ratio=Test,mode=window,iter=b,window=W2,increment=S)
                start_time = Sys.time()
                M <- fit(target_form,df[HD$tr,],model=models[[i]]$rminer, search=s) # create forecasting model
                end_time = Sys.time()
                time <- time + as.numeric(difftime(end_time, start_time))
                if (time_lags) {
                    #------------------------------------
                    predArr <- c()
                    count <- 7
                    for(m in 1:length(HD$ts)){
                        xArr <- df[HD$ts - count, ][1:count, 1]
                        lag <- append(xArr, predArr)
                        lag <- append(lag, NaN)
                        D2 <- CasesSeries(lag,lags)
                        y <- cbind(df[HD$ts[[m]],(1:(length(df)-length(lags)))], D2[1:length(lags)])
                        PRED <- predict(M,y)
                        predArr <- append(predArr, PRED)
                        count <- count - 1
                    }
                    #----------------------------------
                } else {
                    predArr=predict(M,df[HD$ts,])
                }
                Y=df[HD$ts,1]
                # save results
                arr_MAE[b]=mmetric(y=Y,x=predArr,metric="MAE",val=YR)
                arr_NMAE[b]=mmetric(y=Y,x=predArr,metric="NMAE",val=YR)
                arr_RMSE[b]=mmetric(y=Y,x=predArr,metric="RMSE",val=YR)
                arr_RRSE[b]=mmetric(y=Y,x=predArr,metric="RRSE",val=YR)

            } # end of cycle

            # results
            MAE = mean(arr_MAE)
            NMAE = mean(arr_NMAE)
            RMSE = mean(arr_RMSE)
            RRSE = mean(arr_RRSE)
        }
        
        model_name <- append(model_name, models[[i]]$name)
        model_MAE <- append(model_MAE, MAE)
        model_NMAE <- append(model_NMAE, NMAE)
        model_RMSE <- append(model_RMSE, RMSE)
        model_RRSE <- append(model_RRSE, RRSE)
        model_time <- append(model_time, round(time,5))

        #mgraph(Y,PRF,graph="REG",Grid=10,lty=1,col=c("black","blue"),main=paste(models[[i]]$name,"predictions") ,leg=list(pos="topright",leg=c("target","predictions")))
        #add <- append(add, list(list(Y, PRF)))

    }
    # create data frame with all the results
    results <- data.frame(model=model_name,
                 MAE=model_MAE,
                 NMAE=model_NMAE,
                 RMSE=model_RMSE,
                 RRSE=model_RRSE,
                 training_time=model_time,
                 stringsAsFactors=FALSE)
    results
}

In [5]:
# get results of all scenarios
runCases <- function (target="all", gridSearch=FALSE) {
    arr_cases <- c("1", "2", "3", "4")
    arr_windows <- c("none", "rolling", "incremental")
    arr_time_lags <- c(FALSE, TRUE)
    results <- data.frame()
    for(i in 1:length(arr_cases)){
        # get case
        case <- arr_cases[i]
        case_df <- getCaseDf(df, case)
        v <- getVariables(target, case_df)

        for(l in 1:length(arr_time_lags)){
            time_lags <- arr_time_lags[l]
            path <- paste0("models/best_params/multivariate/multivariate_target=",target,"_case=",case,"_lags=",time_lags,".json")
            
            if (gridSearch) {
                models <- fromJSON(file="models/rminer_models.json")
                # find the best params based on case_df and target
                updated_models <- gridSearch(df=case_df, models=models, del_cols=v$del_cols, target_form=v$target_form, time_lags=time_lags)
                # save the best params
                exportJSON <- toJSON(updated_models, indent=4)
                write(exportJSON, path)  
            }

            for(n in 1:length(arr_windows)){
                window <- arr_windows[n]
                models <- fromJSON(file=path)
                r <- getResults(df=case_df, models=models, del_cols=v$del_cols, target_form=v$target_form, time_lags=time_lags, window=window)
                r[["case"]] <- case
                r[["window"]] <- window
                r[["time_lags"]] <- time_lags
                results <- rbind(results, r)
            }
        }
    }
    col_order <- c("case", "window", "time_lags",
               "model", "MAE", "NMAE", "RMSE", "RRSE", "training_time")
    results <- results[, col_order]
    results <- results[with(results, order(case, window, time_lags, model)), ]
    rownames(results) <- 1:nrow(results)
    write_xlsx(results,paste0("models/results/multivariate/multivariate_target=",target,".xlsx"))
    results
}

### Single case scenario

In [6]:
# select params
target <- "all"
case <- "3"
time_lags <- TRUE
window <- "none"

In [7]:
case_df <- getCaseDf(df, case)
head(case_df)

Unnamed: 0_level_0,all,female,male,young,adult,weather,maxtemp,RH,maxwind,day
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<int>
1,2332,1115,1108,1122,1210,0,0.24,0.84,0.0,2
2,2801,1217,1459,1239,1562,0,0.28,0.9333333,0.6081081,3
3,2375,1168,1099,1059,1316,0,0.28,0.7733333,0.7432432,4
4,3447,1617,1651,1606,1841,1,0.24,0.72,0.0,5
5,4823,2469,2117,2318,2505,0,0.36,0.76,0.5,6
6,4978,2564,2223,2289,2689,0,0.36,0.6533333,0.0,7


In [8]:
v <- getVariables(target, case_df)
# path to import models
# IMPORTANT: if it doesn't exist, you need to run the next cell
path <-  paste0("models/best_params/multivariate/multivariate_target=",target,"_case=",case,"_lags=",time_lags,".json")

In [9]:
models <- fromJSON(file="models/rminer_models.json")
# find the best params based on case_df and target
updated_models <- gridSearch(df=case_df, models=models, del_cols=v$del_cols, target_form=v$target_form, time_lags=time_lags)
# save the best params
exportJSON <- toJSON(updated_models, indent=4)
write(exportJSON, path)



In [10]:
models <- fromJSON(file=path)
results <- getResults(df=case_df, models=models, del_cols=v$del_cols, target_form=v$target_form, time_lags=time_lags, window=window)



In [11]:
results

model,MAE,NMAE,RMSE,RRSE,training_time
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Random Forest,585.3249,19.49783,664.1197,68.83648,2.10779
Support Vector Machine (KSVM),598.1762,19.92592,863.7736,89.53075,0.12068
Multi Linear Regression (LM),898.9958,29.94656,1195.0489,123.86767,0.002
Multilayer Percepton Ensemble (MLPE),680.2044,22.65837,808.4114,83.79242,1.37692
Naive,994.4603,33.12659,1222.6067,126.72406,0.001
Generalized Linear Model (GLM),800.2282,26.6565,1060.8323,109.95603,0.04488
Decision Three,966.5315,32.19625,1250.5996,129.62553,0.00599
K-Nearest Neighbor (KNN),613.3217,20.43044,754.9955,78.25582,0.0369
Multilayer Percepton (MLP),676.92,22.54897,806.7743,83.62273,1.44746
eXtreme Gradient Boosting (XGB),343.5959,11.44557,393.5308,40.78975,0.55885


### Run all scenarios

### all

In [12]:
target <- "all"
results <- runCases(target=target, gridSearch=FALSE)



In [13]:
results[with(results, order(MAE)), ][1:20,]

Unnamed: 0_level_0,case,window,time_lags,model,MAE,NMAE,RMSE,RRSE,training_time
Unnamed: 0_level_1,<chr>,<chr>,<lgl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
152,3,none,True,eXtreme Gradient Boosting (XGB),343.5959,11.445566,393.5308,40.78975,0.69778
99,2,none,True,Random Forest,484.7081,16.146172,569.1117,58.98884,0.29819
39,1,none,True,Random Forest,534.075,17.790639,607.3017,62.94726,0.24489
96,2,none,True,Multilayer Percepton (MLP),553.9542,18.452839,605.7349,62.78485,1.56206
36,1,none,True,Multilayer Percepton (MLP),553.9609,18.45306,605.7446,62.78585,1.3243
37,1,none,True,Multilayer Percepton Ensemble (MLPE),561.1155,18.691388,610.8033,63.3102,1.20031
234,4,rolling,True,K-Nearest Neighbor (KNN),584.5929,9.439575,751.5263,76.23287,0.00598
194,4,incremental,True,K-Nearest Neighbor (KNN),586.4088,9.468897,751.0616,75.88099,0.004
160,3,none,True,Support Vector Machine (KSVM),598.1762,19.925923,863.7736,89.53075,0.18451
159,3,none,True,Random Forest,598.5766,19.93926,664.4838,68.87422,2.66212


### female

In [14]:
target <- "female"
results <- runCases(target=target, gridSearch=FALSE)



In [15]:
results[with(results, order(MAE)), ][1:20,]

Unnamed: 0_level_0,case,window,time_lags,model,MAE,NMAE,RMSE,RRSE,training_time
Unnamed: 0_level_1,<chr>,<chr>,<lgl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
151,3,none,True,Decision Three,147.7735,10.07318,162.1452,34.37967,0.05984
39,1,none,True,Random Forest,201.3318,13.72405,245.8973,52.13762,0.23794
99,2,none,True,Random Forest,203.9841,13.90484,257.5533,54.60906,0.21451
31,1,none,True,Decision Three,265.0005,18.06411,412.1814,87.39486,0.08058
91,2,none,True,Decision Three,265.0005,18.06411,412.1814,87.39486,0.05767
159,3,none,True,Random Forest,268.3054,18.28939,308.9515,65.50702,0.84323
211,4,none,True,Decision Three,276.5032,18.84821,402.8783,85.42233,0.04189
214,4,none,True,K-Nearest Neighbor (KNN),296.0542,20.18093,352.6519,74.77282,0.0389
154,3,none,True,K-Nearest Neighbor (KNN),308.8776,21.05505,410.7291,87.08692,0.0249
146,3,none,False,Multilayer Percepton (MLP),310.2046,21.14551,334.4949,70.92299,0.20601


### male

In [16]:
target <- "male"
results <- runCases(target=target, gridSearch=FALSE)



In [17]:
results[with(results, order(MAE)), ][1:20,]

Unnamed: 0_level_0,case,window,time_lags,model,MAE,NMAE,RMSE,RRSE,training_time
Unnamed: 0_level_1,<chr>,<chr>,<lgl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
39,1,none,True,Random Forest,141.6709,10.409328,178.3071,40.25819,3.1749
152,3,none,True,eXtreme Gradient Boosting (XGB),164.9338,12.118576,251.1679,56.70871,0.52049
150,3,none,False,Support Vector Machine (KSVM),168.7971,12.40243,210.8686,47.60993,0.25225
159,3,none,True,Random Forest,230.1528,16.910569,264.9293,59.81576,0.21061
151,3,none,True,Decision Three,232.7194,17.099147,267.5917,60.41688,0.05984
40,1,none,True,Support Vector Machine (KSVM),241.918,17.77502,340.4295,76.86221,0.31672
100,2,none,True,Support Vector Machine (KSVM),241.918,17.77502,340.4295,76.86221,0.27508
211,4,none,True,Decision Three,253.2048,32.296535,285.839,100.0021,0.04588
160,3,none,True,Support Vector Machine (KSVM),256.3832,18.837856,389.3839,87.91512,0.10929
210,4,none,False,Support Vector Machine (KSVM),262.8186,33.522781,300.0855,104.98631,0.15459


### young

In [18]:
target <- "young"
results <- runCases(target=target, gridSearch=FALSE)



In [19]:
results[with(results, order(MAE)), ][1:20,]

Unnamed: 0_level_0,case,window,time_lags,model,MAE,NMAE,RMSE,RRSE,training_time
Unnamed: 0_level_1,<chr>,<chr>,<lgl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
39,1,none,True,Random Forest,170.8407,11.465819,208.0403,42.63219,0.21739
99,2,none,True,Random Forest,171.6106,11.517487,204.8068,41.96957,0.21144
233,4,rolling,True,Generalized Linear Model (GLM),267.7605,9.965034,338.5191,79.62315,0.71716
193,4,incremental,True,Generalized Linear Model (GLM),267.934,9.971493,349.7545,82.70986,0.86583
96,2,none,True,Multilayer Percepton (MLP),269.7014,18.100768,307.7905,63.07328,1.07504
36,1,none,True,Multilayer Percepton (MLP),269.7087,18.101257,307.7991,63.07502,0.45171
234,4,rolling,True,K-Nearest Neighbor (KNN),269.9395,10.046128,331.5536,79.13214,0.28881
199,4,incremental,True,Random Forest,271.456,10.102567,347.7349,82.3945,1.3061
235,4,rolling,True,Multi Linear Regression (LM),271.9036,10.119226,339.5755,81.09216,0.01201
159,3,none,True,Random Forest,272.3834,18.280768,318.1517,65.19652,0.20717


### adult

In [20]:
target <- "adult"
results <- runCases(target=target, gridSearch=FALSE)



In [21]:
results[with(results, order(MAE)), ][1:20,]

Unnamed: 0_level_0,case,window,time_lags,model,MAE,NMAE,RMSE,RRSE,training_time
Unnamed: 0_level_1,<chr>,<chr>,<lgl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
220,4,none,True,Support Vector Machine (KSVM),201.8552,12.8162,278.7617,57.5708,0.0638
214,4,none,True,K-Nearest Neighbor (KNN),210.1663,13.34389,277.6029,57.33147,0.0379
206,4,none,False,Multilayer Percepton (MLP),242.8368,15.41821,342.7326,70.78227,0.43297
146,3,none,False,Multilayer Percepton (MLP),248.7318,15.79249,358.0821,73.95232,0.02394
152,3,none,True,eXtreme Gradient Boosting (XGB),249.3574,15.83221,325.1589,67.15291,1.0111
159,3,none,True,Random Forest,255.2682,16.2075,296.5395,61.24234,0.84266
150,3,none,False,Support Vector Machine (KSVM),258.9359,16.44037,348.0249,71.87527,1.29325
216,4,none,True,Multilayer Percepton (MLP),267.2809,16.97021,373.725,77.18294,1.14274
39,1,none,True,Random Forest,282.3106,17.92448,306.4342,63.28582,0.22869
210,4,none,False,Support Vector Machine (KSVM),286.8701,18.21398,378.4294,78.15451,1.32091
