-
Notifications
You must be signed in to change notification settings - Fork 4
/
Elasticnet_predictions.R
122 lines (96 loc) · 4.7 KB
/
Elasticnet_predictions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#Loading required R packages
library(tidyverse)
library(caret)
library(rpart)
library(party)
library(MLmetrics)
# Input folder with the per-country CSV files and output folder for the
# elastic-net results (both relative to the working directory).
path1 <- '../../all_variables_and_GPI_monthly_all_countries/'
path2 <- '../../elnet_results/'
# list.files() expects a regular expression, not a shell glob: "*.csv" would
# match any name that merely contains "csv" after some character. The loop
# below only ever reads files named "all_variables_<country>.csv", so list
# exactly those.
country_files <- list.files(path1, pattern = "^all_variables_.*\\.csv$")
# Rolling elastic-net forecasts of GPI, one country file at a time.
#
# For each "all_variables_<country>.csv" file in `path1`:
#   1. read the data, index it by MonthYear and drop sparse columns;
#   2. split it chronologically into a training and a test half;
#   3. for every test row, fit a glmnet model (tuned by caret on expanding
#      time slices), record variable importance and the selected tuning
#      parameters, predict up to `max_horizon` test rows ahead, then roll the
#      training window forward by one row;
#   4. write importance, tuning parameters, predictions and per-horizon
#      accuracy metrics (Pearson, RMSE, MAPE in %) to `path2`.

max_horizon <- 12L       # number of test rows predicted at each rolling step
cv_initial_window <- 12  # initialWindow of caret's time-slice resampling
cv_horizon <- 6          # horizon of caret's time-slice resampling

# The resampling setup does not change between fits, so build it once.
fit_control <- trainControl(
  method = "timeslice", initialWindow = cv_initial_window,
  horizon = cv_horizon, fixedWindow = FALSE, allowParallel = TRUE,
  number = 10
)

# Fail early (before any model is trained) if results cannot be written.
if (!dir.exists(path2)) {
  dir.create(path2, recursive = TRUE)
}

for (fname in country_files) {
  # Only files following the "all_variables_<country>.csv" convention can be
  # processed; anything else in the folder is ignored.
  if (!grepl("^all_variables_.+\\.csv$", fname)) {
    next
  }
  # Strip prefix and extension instead of splitting on "_", so that country
  # names which themselves contain underscores are kept whole.
  country <- sub("\\.csv$", "", sub("^all_variables_", "", fname))
  print(country)

  # Load the data
  file_df <- file.path(path1, fname)
  if (!file.exists(file_df)) {
    next
  }
  df_country_initial <- read.csv(file_df, stringsAsFactors = FALSE)
  rownames(df_country_initial) <- df_country_initial$MonthYear
  df_country <- df_country_initial[
    , names(df_country_initial) != "MonthYear", drop = FALSE
  ]

  # Keep only the columns whose number of non-zero entries exceeds 60% of the
  # rows (i.e. drop columns with 40% or more zeros). A column containing NA
  # gets an NA count and is dropped by which() as well.
  nonzero_counts <- as.numeric(colSums(df_country != 0))
  keep_cols <- which(nonzero_counts > nrow(df_country) * 0.6)
  df_country <- df_country[, keep_cols, drop = FALSE]
  if (!("GPI" %in% names(df_country))) {
    warning("Skipping ", country, ": no usable GPI column after filtering",
            call. = FALSE)
    next
  }

  # Chronological split: first `train_set` share of rows for training, the
  # remainder for testing.
  train_set <- 0.5
  n_total <- nrow(df_country)
  train.data <- head(df_country, round(n_total * train_set))
  n_test <- n_total - nrow(train.data)
  test.data <- tail(df_country, n_test)

  # caret's time slices need at least initialWindow + horizon training rows,
  # and there must be something to predict.
  if (n_test < 1 || nrow(train.data) < cv_initial_window + cv_horizon) {
    warning("Skipping ", country, ": not enough rows for rolling training",
            call. = FALSE)
    next
  }

  # The dataframe with the most important variables per rolling step
  df_important_var <- setNames(
    data.frame(matrix(ncol = 2, nrow = 0)), c("var_name", "MonthYear")
  )
  # Tuning parameters selected by caret per rolling step
  df_cp <- setNames(
    data.frame(matrix(ncol = 2, nrow = 0)), c("MonthYear", "cp")
  )
  # One row per rolling step: MonthYear label + Predictions1..Predictions12
  df_predictions <- setNames(
    data.frame(matrix(ncol = max_horizon + 1L, nrow = 0)),
    c("MonthYear", paste0("Predictions", seq_len(max_horizon)))
  )

  for (step in seq_len(n_test)) {
    print(step)
    model <- train(
      GPI ~ ., data = train.data, method = "glmnet",
      trControl = fit_control
    )
    # Every record of this step is labelled with the last training month.
    last_month <- tail(row.names(train.data), n = 1)

    # Variable importance, with variable names moved from the row names into
    # their own column. rbind() is kept incremental on purpose so the row
    # names written to the CSV files stay exactly as before.
    imp_var <- varImp(model)$importance
    imp_var <- rownames_to_column(imp_var, var = "var_name")
    imp_var$MonthYear <- rep(last_month, times = nrow(imp_var))
    df_important_var <- rbind(df_important_var, imp_var)

    # Tuning parameters chosen for this fit
    cp <- model$bestTune
    cp$MonthYear <- rep(last_month, times = nrow(cp))
    df_cp <- rbind(df_cp, cp)

    # Predict the next `max_horizon` test rows, truncated at the end of the
    # test set. Clamping the index (instead of counting steps near the end)
    # also works when the test set has fewer than `max_horizon` rows.
    idx <- step:min(step + max_horizon - 1L, n_test)
    predictions <- predict(model, newdata = test.data[idx, , drop = FALSE])
    predictions <- as.numeric(unlist(predictions))
    # Horizons that fall beyond the test set are written as "-".
    n_missing <- max_horizon - length(idx)
    predictions <- c(last_month, predictions, rep("-", n_missing))
    df_predictions[(nrow(df_predictions) + 1), ] <- predictions

    # Roll the window: drop the oldest training row, append the test row
    # that has just become "known".
    train.data <- rbind(train.data[2:nrow(train.data), ], test.data[step, ])
  }

  out_file <- function(suffix) {
    file.path(path2, paste0(country, '_elnet_', train_set, suffix))
  }
  # Save the important variables per rolling step
  write.csv(df_important_var, out_file('_impvar.csv'))
  # Save the tuning parameters
  write.csv(df_cp, out_file('_cp.csv'))
  # Save the predictions
  write.csv(df_predictions, out_file('_predictions.csv'))

  # Accuracy per forecast horizon. Predictions<h> in the row of step s is the
  # forecast for test row s + h - 1, so horizon h is compared against the
  # last n_test - (h - 1) test rows.
  df_results_analytics <- setNames(
    data.frame(matrix(ncol = 3, nrow = 0)), c("Pearson", "RMSE", "MAPE")
  )
  for (h in seq_len(max_horizon)) {
    raw_preds <- df_predictions[[paste0("Predictions", h)]]
    # Turn the "-" placeholders into NA explicitly to avoid coercion warnings.
    raw_preds[raw_preds == "-"] <- NA
    actualpreds <- as.numeric(raw_preds)
    actualpreds <- actualpreds[!is.na(actualpreds)]

    n_pairs <- n_test - (h - 1L)
    if (n_pairs < 1) {
      # Test set shorter than this horizon: nothing to evaluate.
      results_analytics <- data.frame(
        Pearson = NA_real_, RMSE = NA_real_, MAPE = NA_real_
      )
    } else {
      actual_gpi <- tail(test.data$GPI, n_pairs)
      # Model performance metrics
      results_analytics <- data.frame(
        Pearson = cor(actual_gpi, actualpreds, method = "pearson"),
        RMSE = MLmetrics::RMSE(actualpreds, actual_gpi),
        MAPE = MLmetrics::MAPE(actualpreds, actual_gpi) * 100
      )
    }
    df_results_analytics[(nrow(df_results_analytics) + 1), ] <-
      results_analytics
  }
  write.csv(df_results_analytics, out_file('_results.csv'), row.names = TRUE)
}