### Tidy the GEFCom2012 dataset
From three files (load_raw.csv, n034_ensemble.csv and temp.csv), create the complete.csv file. Values from the ensemble prediction are used to impute the missing values in load_raw, and the temperature values from temp.csv are added to the final complete data frame.

#### Read data from csv file

In [22]:
library(tidyr)
library(dplyr)
library(lubridate)

# Locations of the three GEFCom2012 input files.
loadRawFile <- "GEFCom2012/load_raw.csv"
tempRawFile <- "GEFCom2012/temp.csv"
ensemblePredictionFile <- "GEFCom2012/n034_ensemble.csv"

# Register a "num.with.commas" class so that read.csv can properly turn
# strings containing commas into numbers: the coercion strips every comma
# and then calls as.numeric().
setClass("num.with.commas")
setAs(
  "character", "num.with.commas",
  function(from) {
    as.numeric(gsub(",", "", from))
  }
)

# Column layout of the raw load file: one factor column, three plain
# numeric columns, then 24 numeric columns that may contain commas.
loadRawClasses <- c(
  'factor',
  rep("numeric", 3),
  rep("num.with.commas", 24)
)

df <- read.csv(
  loadRawFile,
  stringsAsFactors = FALSE,
  colClasses = loadRawClasses
)

#### Tidy loadraw GEFCom2012 data

In [23]:
# Reshape the raw load table from one column per hour (h1..h24) into one
# row per hour, and replace year/month/day/Hour with a single GMT DateTime.
tidyLoadDf <- df %>%
  gather(Hour, Consumption, h1:h24) %>%
  # Drop the leading letter of the column name; hN becomes hour N - 1.
  mutate(Hour = as.numeric(sub("^.", "", Hour)) - 1) %>%
  unite(Date, year, month, day, sep = '-') %>%
  unite(DateTime, Date, Hour, sep = ' ') %>%
  mutate(
    DateTime = as.POSIXct(DateTime, format = "%Y-%m-%d %H", tz = "GMT")
  ) %>%
  arrange(DateTime, zone_id)


In [24]:
# Spread the consumption of each zone into its own column (zone.1, zone.2,
# ...) so that a single zone can easily be selected later.
tidyLoadDf <- tidyLoadDf %>%
  mutate(zone_id = paste0('zone.', as.character(zone_id))) %>%
  spread(zone_id, Consumption, fill = NA, convert = FALSE)

# Remove the data from June 2008 onwards, because it is not complete.
# The cut-off is parsed in GMT, like DateTime itself; without an explicit
# time zone it would be interpreted in the session's local time zone and
# the boundary would shift with the machine the notebook runs on.
loadCutoff <- as.POSIXct("2008-06-01 00:00:00", tz = "GMT")
tidyLoadDf <- tidyLoadDf %>% filter(DateTime < loadCutoff)

# Add the total consumption column as the sum of the 20 zones.
# The zone columns are addressed by name: spread() orders them
# alphabetically (zone.1, zone.10, ..., zone.19, zone.2, zone.20, zone.3,
# ...), so the positional range zone.1:zone.20 would leave out zone.3 to
# zone.9. Missing values are skipped (na.rm = TRUE), so a row with no
# recorded load gets a total of 0.
zoneColumns <- paste0("zone.", 1:20)
tidyLoadDf$total <- rowSums(tidyLoadDf[zoneColumns], na.rm = TRUE)

#### Tidy ensemble prediction GEFCom2012 data

In [25]:
# Read the ensemble prediction file: the first column is skipped ('NULL'),
# the second is read as a factor and the remaining 27 as numbers.
ensembleClasses <- c('NULL', 'factor', rep("numeric", 27))
df <- read.csv(ensemblePredictionFile, stringsAsFactors = FALSE,
               colClasses = ensembleClasses)

# Same reshaping as for the raw load: one row per hour with a GMT DateTime
# and one column per zone.
tidyEnsembleDf <- df %>%
  gather(Hour, Consumption, h1:h24) %>%
  # Column hN becomes hour N - 1.
  mutate(Hour = as.numeric(substr(Hour, 2, nchar(Hour))) - 1) %>%
  unite(Date, year, month, day, sep = '-') %>%
  unite(DateTime, Date, Hour, sep = ' ') %>%
  mutate(DateTime = as.POSIXct(strptime(DateTime, "%Y-%m-%d %H", tz = "GMT"))) %>%
  arrange(DateTime) %>%
  mutate(zone_id = paste0('zone.', as.character(zone_id))) %>%
  spread(zone_id, Consumption, fill = NA, convert = FALSE) %>%
  # Remove the forecast, only the backcast is needed. The cut-off is parsed
  # in GMT so that it is compared against the GMT DateTime column on equal
  # terms, independently of the session's local time zone.
  filter(DateTime < as.POSIXct(strptime("2008-06-01 00:00:00",
                                        "%Y-%m-%d %H:%M:%S", tz = "GMT")))

#### Merge ensemble prediction to raw load data, and add temperature data

In [26]:
# Impute the hours without recorded load (zone.1 is NA) from the ensemble
# prediction. Rows are aligned on DateTime and columns on their names; a
# plain positional assignment of tidyEnsembleDf would depend on both frames
# having exactly the same row order and column layout, which does not hold
# once fullDf carries extra columns such as total.
fullDf <- tidyLoadDf
missingRows <- which(is.na(fullDf$zone.1))
# Compare the timestamps as numbers (seconds since the epoch) so that the
# lookup does not depend on how match() treats date-time objects.
ensembleRows <- match(as.numeric(fullDf$DateTime[missingRows]),
                      as.numeric(tidyEnsembleDf$DateTime))
hasPrediction <- !is.na(ensembleRows)
targetRows <- missingRows[hasPrediction]
sourceRows <- ensembleRows[hasPrediction]

# Only columns present in both frames are filled; hours without a matching
# prediction are left as NA.
sharedZones <- setdiff(intersect(names(fullDf), names(tidyEnsembleDf)),
                       "DateTime")
for (zone in sharedZones) {
  fullDf[[zone]][targetRows] <- tidyEnsembleDf[[zone]][sourceRows]
}

# Recompute the total after the imputation so that it reflects the filled-in
# values and is the sum of the 20 zone columns on every row.
if ("total" %in% names(fullDf)) {
  zoneColumns <- intersect(paste0("zone.", 1:20), names(fullDf))
  fullDf$total <- rowSums(fullDf[zoneColumns], na.rm = TRUE)
}

In [27]:
# Read the temperature file: a timestamp column followed by 11 numeric
# columns. The timestamp is read as text and parsed explicitly as GMT;
# colClasses = 'POSIXct' would parse it in the session's local time zone,
# while the load DateTime used as join key is GMT, so the two would only
# line up on a machine running in UTC.
tempClasses <- c('character', rep("numeric", 11))
tempDf <- read.csv(tempRawFile, stringsAsFactors = FALSE,
                   colClasses = tempClasses)
# Shift the timestamps back by 30 minutes before joining on DateTime.
# NOTE(review): presumably temp.csv stamps its readings on the half hour;
# confirm against the file.
tempDf <- tempDf %>%
  mutate(DateTime = as.POSIXct(DateTime, tz = "GMT") - minutes(30))

# Keep every load row and attach the temperatures of the same DateTime.
completeDf <- left_join(fullDf, tempDf, by = "DateTime")

# Check if there is any incomplete case (rows with at least one NA).
completeDf %>% filter(!complete.cases(.))

In [28]:
# Save the merged load and temperature table, without row names.
write.csv(
  completeDf,
  file = "GEFCom2012/complete.csv",
  row.names = FALSE
)

In [29]:
# Show the first six rows of the final table.
head(completeDf, n = 6L)

Unnamed: 0,DateTime,zone.1,zone.10,zone.11,zone.12,zone.13,zone.14,zone.15,zone.16,zone.17,⋯,T02,T03,T04,T05,T06,T07,T08,T09,T10,T11
1,2004-01-01 00:00:00,16853,23339,90700,118378,20673,21791,65970,28752,30645,⋯,38,44,45,42,44,45,43,41,42,36
2,2004-01-01 01:00:00,16450,22100,86699,112480,19666,21400,64600,27851,30461,⋯,36,42,43,42,43,44,44,39,43,32
3,2004-01-01 02:00:00,16517,21376,84243,108435,19020,20998,63843,27631,30197,⋯,35,40,41,40,42,41,42,36,43,31
4,2004-01-01 03:00:00,16873,21335,84285,107224,18841,21214,64023,27986,30264,⋯,30,36,37,39,38,40,34,35,39,30
5,2004-01-01 04:00:00,17064,21564,86087,108870,19310,21830,65679,29160,30907,⋯,30,34,33,40,38,35,30,33,35,34
6,2004-01-01 05:00:00,17727,22241,90210,112395,19415,21794,63305,29226,31617,⋯,29,32,32,41,37,35,35,36,35,35
