#### Make prediction using DSHW

In [15]:
library(tidyr)
library(dplyr)
library(lubridate)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Attaching package: ‘lubridate’

The following object is masked from ‘package:base’:

    date



In [16]:
# Locations of the input data and of the prediction output.
HvalerTrainingFile = "Hvaler/training_set.csv"
HvalerCompleteFile = "Hvaler/imputed_complete.csv"
OutputDir = "Hvaler/Predictions/"

# Column layout used when reading both CSV files: one POSIXct column followed
# by 21 numeric columns.
HvalerClasses = c("POSIXct", rep("numeric", 21))

# Names of the 20 zone columns, the temperature column(s) and the 24 horizons.
Zones = paste0("subs.", 1:20)
Temperatures = c("T01")
Horizons = 1:24

# Both files are read with identical settings, so share one reader.
readHvalerCsv = function(path){
    read.csv(path, stringsAsFactors=FALSE, colClasses=HvalerClasses)
}
trainingDf = readHvalerCsv(HvalerTrainingFile)
completeDf = readHvalerCsv(HvalerCompleteFile)
#This will take several hours!


In [None]:
require('KernSmooth')
require('bbemkr')
require('mgcv')
require('timeDate')
library("MASS")
library('forecast')

#Identify where the prediction periods start and end by comparing each row's
#NA flag with the flags of its neighbours.
idxNaCases = !complete.cases(trainingDf)

#NA flags of the preceding / following rows, without the padding element.
naOfPreviousRows = head(idxNaCases, -1)
naOfNextRows = tail(idxNaCases, -1)

#A start is an incomplete row whose previous row is complete and whose next
#row is incomplete; the row after the last one is treated as incomplete.
isStart = idxNaCases & !c(FALSE, naOfPreviousRows) & c(naOfNextRows, TRUE)
startPoints = which(isStart)

#An end is an incomplete row whose previous row is incomplete and whose next
#row is complete; the row before the first one is treated as incomplete.
isEnd = idxNaCases & c(TRUE, naOfPreviousRows) & !c(naOfNextRows, FALSE)
endPoints = which(isEnd)

startDates = trainingDf$DateTime[startPoints]
endDates = trainingDf$DateTime[endPoints]
nTestingPeriods = length(startDates)

#Create Time Features
#The timeDate holiday helpers take calendar years, so extract the year from the
#first and last timestamp instead of passing the timestamps themselves.
startYear = year(completeDf$DateTime[1])
endYear = year(tail(completeDf$DateTime, 1))
years = seq(startYear, endYear)
#GoodFriday(years)-86400 is the day before Good Friday (one day in seconds).
NorwayHolidays = c(EasterMonday(years), Ascension(years), PentecostMonday(years), LaborDay(years), GoodFriday(years), BoxingDay(years), GoodFriday(years)-86400);

#Holidays are whole days while DateTime carries a time of day, so compare
#calendar dates; matching the raw timestamps could flag at most one record of
#each holiday. format() is used so the date is taken in the timestamps' own
#time zone.
recordDay = as.Date(format(completeDf$DateTime, "%Y-%m-%d"))
holidayDays = as.Date(NorwayHolidays)
christmasDays = as.Date(ChristmasDay(years))
christmasEves = as.Date(ChristmasEve(years))
newYearsDays = as.Date(NewYearsDay(years))

completeDf = completeDf %>% mutate(Holiday = recordDay %in% holidayDays) %>%
                            mutate(ChristmasDay = recordDay %in% christmasDays) %>%
                            mutate(ChristmasEve = recordDay %in% christmasEves) %>%
                            mutate(NewYearsDay = recordDay %in% newYearsDays) %>%
                            mutate(DoW = factor(wday(DateTime))) %>%
                            #ToY: day of year plus the fraction of the day given by the hour.
                            mutate(ToY = as.numeric(strftime(DateTime, format = "%j"))+as.numeric(strftime(DateTime, format="%H"))/24)
            

#Assume the zone and temperature here
zones = c("subs.1") #TODO remove
temperatures = c("T01") #TODO remove
horizons = c(1) #TODO remove

#predictions[[h]] is written below with a row mask over completeDf and a zone
#column, so it has to be a list of data frames row-aligned with completeDf.
#Create it here unless an earlier cell already did.
if (!exists("predictions", inherits = FALSE)){
    emptyPrediction = data.frame(DateTime = completeDf$DateTime)
    emptyPrediction[zones] = NA_real_
    predictions = rep(list(emptyPrediction), max(horizons))
}

#Model formula; the response is the `y` column of the per-zone feature table.
spec = y ~ LT + s(T)+s(SmoT)+s(ToY,bs="cc", k = 100)+DoW+ChristmasDay+ChristmasEve+NewYearsDay+Holiday

#Hour of day (0-23) of every record, shared by all zones and periods.
hourOfDay = hour(completeDf$DateTime)

#NOTE: longTermTrend() is defined in a later cell and must be run first; it
#relies on xts, mgcv and bbemkr being attached.
for (zone in zones){
    #Find the temperature most strongly correlated with the current zone.
    #The absolute value is used so a strong negative relation (load rising as
    #temperature falls) is not ranked below a weak positive one.
    strength = vapply(temperatures, function(temp){
        abs(cor(completeDf[[zone]], completeDf[[temp]]))
    }, numeric(1))
    bestTemp = temperatures[[which.max(strength)]]
    completeDf$T = completeDf[[bestTemp]]

    #dplyr::select is spelled out because MASS is attached after dplyr and
    #masks select().
    featureDf = completeDf %>% dplyr::select(c("DateTime", "Holiday", "ChristmasDay", "ChristmasEve", "NewYearsDay", "DoW", "ToY", "T" ))
    #The zone column is not part of featureDf, so take the load from completeDf.
    featureDf$y = completeDf[[zone]]

    #Exponentially smoothed temperature, computed on a plain vector:
    #SmoT[1] = T[1], SmoT[i] = 0.15*T[i] + 0.85*SmoT[i-1].
    temperature = featureDf$T
    smoothed = temperature
    for (i in seq_along(temperature)[-1]){
        smoothed[i] = 0.15*temperature[i] + 0.85*smoothed[i-1]
    }
    featureDf$SmoT = smoothed

    featureDf$LT = rep(0, nrow(featureDf))
    for (period in seq_len(nTestingPeriods)){
        #Prepare for training model: everything before the period is training data.
        startDate = startDates[period]
        endDate = endDates[period]
        isTraining = featureDf$DateTime < startDate
        #DateTime is kept: it is needed for the xts index and the hour split.
        trainData = featureDf[isTraining, ]
        trainHour = hourOfDay[isTraining]
        E.xts = xts(trainData$y, trainData$DateTime)
        T.xts = xts(trainData$T, trainData$DateTime)
        trainData$LT = as.numeric(longTermTrend(E.xts, T.xts))
        #The trend is not known inside the test period; hold its last value.
        lastTrend = tail(trainData$LT, 1)
        inPeriod = (featureDf$DateTime >= startDate) & (featureDf$DateTime <= endDate)

        for (h in horizons){
            #hour() returns 0-23, so horizon 24 is mapped to hour 0.
            targetHour = h %% 24
            #One model per hour of day, fitted on that hour's records only.
            trainDataAtH = trainData[trainHour == targetHour, ]
            model = gam(spec, data=trainDataAtH)
            #gam.check(model)

            #Testing
            testingIdx = (hourOfDay == targetHour) & inPeriod
            featureDf$LT[testingIdx] = lastTrend
            testData = featureDf[testingIdx, ]
            #predict() on a gam returns the predicted values directly.
            prediction = as.numeric(predict(model, newdata=testData))
            predictions[[h]][testingIdx, zone] = prediction
        }
    }
}


In [None]:
# Estimate a slowly varying long-term trend of a load series.
#
# Monthly mean load is regressed on calendar month and monthly mean
# temperature with a GAM; the residuals of that fit are smoothed over the
# month sequence with a Nadaraya-Watson kernel, and the smoothed monthly values
# are linearly interpolated back onto the timestamps of E.xts.
#
# Arguments:
#   E.xts     xts series of the load.
#   T.xts     xts series of the temperature; must contain the timestamps of E.xts.
#   BANDWIDTH bandwidth (in months) passed to NadarayaWatsonkernel.
# Returns:
#   xts series of the trend on the index of E.xts.
#
# Requires xts, mgcv and bbemkr to be attached by the caller.
longTermTrend <- function(E.xts, T.xts, BANDWIDTH=12){
    # Monthly means of load, and of temperature on the load's timestamps.
    E.month.xts = apply.monthly(E.xts, FUN="mean", na.rm = TRUE)
    T.month.xts = apply.monthly(T.xts[index(E.xts)], FUN="mean", na.rm = TRUE)

    # Hand gam() plain vectors in a data frame instead of xts objects (the
    # original call on xts objects was marked as erroring), and avoid a
    # variable called `T`, which shadows the TRUE alias.
    # NOTE(review): with few months of data s(Temp) may need a smaller k - confirm.
    monthly = data.frame(
        E = as.numeric(E.month.xts),
        I = factor(as.numeric(format(index(E.month.xts), "%m"))),
        Temp = as.numeric(T.month.xts)
    )
    E.model = gam(E ~ I + s(Temp), data = monthly)
    E.residuals = E.model$residuals

    # Smooth the residuals across the sequence of months.
    monthSeq = seq_along(E.residuals)
    a = NadarayaWatsonkernel(monthSeq, E.residuals, BANDWIDTH, monthSeq)

    # 1-based index of the month each observation falls in, counted from the
    # first observation. Four-digit years keep this correct across a century.
    stamps = index(E.xts)
    month = as.numeric(format(stamps, "%m"))
    year = as.numeric(format(stamps, "%Y"))
    idx = (year-year[1])*12+(month-month[1])+1

    # Number of observations in each observation's month (the interpolation
    # denominator). Presumably hourly data, so this is hours per month - TODO confirm.
    days = ave(rep(1, length(idx)), idx, FUN = sum)

    # Hours elapsed since the start of the month, so the interpolation weight
    # starts at 0 on day 1, hour 0 rather than one day in.
    fraction = (as.numeric(format(stamps, "%d")) - 1)*24 + as.numeric(format(stamps, "%H"))

    # Prepend a linearly extrapolated value for the month before the first one,
    # then interpolate from the previous month's value to the current month's.
    trend.month = c(a$mh[1]-(a$mh[2]-a$mh[1]), a$mh)
    trend = (trend.month[idx+1]-trend.month[idx])*fraction/days + trend.month[idx]
    trend.xts = xts(trend, order.by = stamps)
    trend.xts
}