In [1]:
"""
This file implements the vanilla LSTM.
The test set consists of the 31 days of December of a single time series, taken from the 261 time series with measurements covering the full year 2017.
Missing days are estimated with the closest-day forecast. If the temperature is not available, the previous day's forecast is used.
The naive base models used are the mean forecast and the MAPE minimization.
Evaluation metrics used: MSE, RMSE, NRMSE, MAE, MAPE.

"""


'\nThis file is implementing the vanilla LSTM.\nThe test set used are the 31 days of December of a single time-serie of the 261 time-series with measurements of the full year 2017.\nMissing days are estimated with the closest day forecast. If the temperature is not available, the previous day forecast is used.\nNaive basemodels that are used are the mean forecast and the MAPE-minimization. \nEvaluation metrics used: MSE, RMSE, NRMSE, MAE, MAPE\n\n'

In [3]:
import datetime as dt
import pandas as pd # pandas
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import casadi as ca
from Test_basemodel_functions import *

# Notebook-wide matplotlib defaults: thicker axes, larger fonts, bigger figures.
plt.rcParams.update({
    'axes.linewidth': 2,
    'axes.labelsize': 16,
    'axes.titlesize': 18,
    'legend.fontsize': 14,
    'xtick.labelsize': 16,
    'ytick.labelsize': 16,
    'figure.figsize': (10, 8),
})


In [4]:
# Load the data; the "date" column becomes the index and is parsed as dates.
# The path is a raw string so the Windows backslashes stay literal: in a normal
# string literal the sequences \O, \L, \F and \d are invalid escapes that only
# work by accident (and emit a warning on recent Python versions).
# NOTE(review): this is a machine-specific absolute path; consider a configurable data directory.
fullYeardata = pd.read_csv(
    r"D:\Onedrive\Leuven\Final project\data\Forecasting_writtendata\FullYear.csv",
    index_col="date",
    parse_dates=True,
)


In [14]:
# Study the first column of the data set and split it by calendar month:
# every month except December is used for training, December for testing.
name = fullYeardata.columns[0]
TS = fullYeardata[name]
december_mask = TS.index.month == 12
training = TS[~december_mask]
test = TS[december_mask]

In [6]:
# Drop the NaN entries from the test set so that only measured values are evaluated.
# The name is re-bound instead of calling dropna(inplace=True): `test` was derived
# from TS by boolean indexing, and mutating such an object in place can trigger
# pandas' SettingWithCopyWarning. Re-binding also keeps the cell safe to re-run.
test = test.dropna()

In [10]:
# The closest-day forecasting model may now also look at future days when predicting a missing day

def find_most_similar_day_MV(test_dates: pd.DatetimeIndex,serie: pd.Series,temperature: pd.Series,time_steps:int):
    """Forecast each day spanned by ``test_dates`` with the values of its closest day in ``serie``.

    For every calendar day between ``test_dates[0]`` and ``test_dates[-1]`` a set of
    candidate days is built from the date range covered by ``serie``:

    * holiday -> the Sundays (non-holiday) plus all holidays,
    * otherwise -> the non-holiday days that fall on the same weekday.

    ``get_closest_day`` (defined outside this block) picks one candidate from the
    normalised temperature and the temperature of the day to forecast; the values
    of ``serie`` on that day are copied into the forecast.

    Days are matched through their day-of-year, so the inputs are expected to cover
    at most one year (NOTE(review): confirm for multi-year data).

    Parameters
    ----------
    test_dates : pd.DatetimeIndex
        Time stamps to forecast; assumed to form one continuous sequence of days.
    serie : pd.Series
        Datetime-indexed series the forecast values are copied from.
    temperature : pd.Series
        Datetime-indexed temperature; it is passed through ``norm`` before use.
    time_steps : int
        Number of values per day, used to build the all-NaN forecast of a day
        whose temperature is unknown.

    Returns
    -------
    pd.Series
        Series named ``'closest_day_forecast'`` indexed by ``test_dates``. Days
        without temperature information are NaN.

    Notes
    -----
    The number of entries of a day in ``test_dates`` must match the number of
    values copied from the selected day (or ``time_steps`` for a NaN day);
    otherwise the pandas assignment raises.
    """
    # Explicit dtype: the default dtype of a data-less Series depends on the pandas version.
    base_forecast = pd.Series(index=test_dates, name='closest_day_forecast', dtype=float)
    if len(test_dates) == 0:
        # nothing to forecast (the original crashed on test_dates[0] here)
        return base_forecast

    # The assumption is made that the test_dates are one continuous sequence of days
    daily_test_dates = pd.date_range(start=test_dates[0], end=test_dates[-1], freq='D')
    temperature = norm(temperature)

    # The candidate pools do not depend on the day being forecast, so build them once.
    all_days = pd.date_range(start=serie.index[0], end=serie.index[-1], freq='D')
    holidays = EnglandAndWalesHolidayCalendar().holidays(start=serie.index[0], end=serie.index[-1])
    # `difference` removes the holidays; `symmetric_difference` would additionally
    # *add* any holiday that is not part of `all_days`.
    regular_days = all_days.difference(holidays)
    # look for holiday and Sunday (to get more days) --> similarity is highest with Sunday
    holiday_like_days = regular_days[regular_days.weekday == 6].union(holidays)

    for day in daily_test_dates:
        day_mask = base_forecast.index.dayofyear == day.dayofyear
        day_temps = temperature[temperature.index.dayofyear == day.dayofyear]
        # .iloc makes the positional access explicit ([0] on a datetime-indexed
        # Series is deprecated); a day without any temperature row counts as unknown.
        day_temp = day_temps.iloc[0] if len(day_temps) else np.nan

        # no information about the temperature of the day --> can't forecast the day
        if np.isnan(day_temp):
            base_forecast[day_mask] = np.full(time_steps, np.nan)
            continue

        if day in holidays:
            candidate_days = holiday_like_days
        else:
            candidate_days = regular_days[regular_days.weekday == day.weekday()]

        cl_day = get_closest_day(candidate_days, temperature, day_temp)
        base_forecast[day_mask] = serie[serie.index.dayofyear == cl_day.dayofyear].values

    return base_forecast

def mean_forecast_MV(test_dates: pd.DatetimeIndex,serie: pd.Series):
    """Forecast each day spanned by ``test_dates`` with the mean profile of comparable days.

    For every calendar day between ``test_dates[0]`` and ``test_dates[-1]``:

    * holiday -> slot-by-slot mean over all holidays in the range of ``serie``,
    * otherwise -> slot-by-slot mean over the non-holiday days in the range of
      ``serie`` that fall on the same weekday.

    Days are matched through their day-of-year, so the inputs are expected to cover
    at most one year (NOTE(review): confirm for multi-year data).

    Parameters
    ----------
    test_dates : pd.DatetimeIndex
        Time stamps to forecast; assumed to form one continuous sequence of days.
    serie : pd.Series
        Datetime-indexed series the mean profiles are computed from. Every
        averaged day must contribute the same number of values, otherwise the
        DataFrame construction raises ``ValueError``.

    Returns
    -------
    pd.Series
        Series named ``"mean_forecast"`` indexed by ``test_dates``.
    """
    # Explicit dtype: the default dtype of a data-less Series depends on the pandas version.
    base_forecast = pd.Series(index=test_dates, name="mean_forecast", dtype=float)
    if len(test_dates) == 0:
        # nothing to forecast (the original crashed on test_dates[0] here)
        return base_forecast

    # The assumption is made that the test_dates are one continuous sequence of days
    daily_test_dates = pd.date_range(start=test_dates[0], end=test_dates[-1], freq='D')

    # Calendar information is independent of the day being forecast: build it once.
    all_days = pd.date_range(start=serie.index[0], end=serie.index[-1], freq='D')
    holidays = EnglandAndWalesHolidayCalendar().holidays(start=serie.index[0], end=serie.index[-1])
    # `difference` removes the holidays; `symmetric_difference` would additionally
    # *add* any holiday that is not part of `all_days`.
    regular_days = all_days.difference(holidays)

    def _mean_profile(days):
        """Return the slot-by-slot mean of ``serie`` over ``days`` (NaN values are skipped)."""
        # One column per day, built in a single constructor call rather than
        # by inserting columns one at a time.
        profiles = pd.DataFrame({
            str(d): serie[serie.index.dayofyear == d.dayofyear].values for d in days
        })
        return profiles.mean(axis=1).values.squeeze()

    # The mean profile only depends on the day type ('holiday' or weekday number),
    # so it is computed once per type instead of once per test day.
    profile_by_type = {}
    for day in daily_test_dates:
        day_type = 'holiday' if day in holidays else day.weekday()
        if day_type not in profile_by_type:
            if day_type == 'holiday':
                similar_days = holidays
            else:
                similar_days = regular_days[regular_days.weekday == day_type]
            profile_by_type[day_type] = _mean_profile(similar_days)
        base_forecast[base_forecast.index.dayofyear == day.dayofyear] = profile_by_type[day_type]

    return base_forecast

In [32]:
# Substitute the missing days
# A day counts as missing when it holds no valid measurement at all. Counting the
# non-NaN values (instead of comparing the daily sum with 0) avoids flagging days
# whose real measurements happen to sum to zero.
# NOTE(review): `temperature` is not defined in the cells shown here; it must be a
# datetime-indexed temperature pd.Series that exists before this cell runs.
# NOTE(review): `serie=TS` still covers the missing day itself; confirm that
# `get_closest_day` cannot select a day that has no data.
daily_counts = TS.resample('D').count()
nan_days = daily_counts[daily_counts == 0].index
TS_filled = TS.copy()  # keep TS untouched so the cell can be re-run safely
for day in nan_days:
    day_slots = TS.index[TS.index.normalize() == day]
    if len(day_slots) == 0:
        # the day has no rows in the index at all, so there is nothing to fill in
        continue
    day_forecast = find_most_similar_day_MV(
        test_dates=day_slots,
        serie=TS,
        temperature=temperature,
        time_steps=len(day_slots),  # one value per slot of the day (48 for a full day)
    )
    TS_filled[day_slots] = day_forecast.values