In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy

In [None]:
# full De Bilt weather record
df_bilt = pd.read_csv('Data/de_bilt_weather.csv')

# independent per-year copies (rows whose 'year' column matches)
df_bilt2000, df_bilt2001, df_bilt2002, df_bilt2009 = (
    df_bilt[df_bilt['year'] == selected_year].copy()
    for selected_year in (2000, 2001, 2002, 2009)
)

In [None]:
# columns that are modelled; the position in this list is the
# variable index used by every [var, ...] array further down
variables = [
    'cloud_cover',
    'wind_speed',
    'wind_gust',
    'humidity',
    'pressure',
    'global_radiation',
    'precipitation',
    'sunshine',
    'temp_mean',
    'temp_min',
    'temp_max',
]

n_var = len(variables)

In [None]:
# training years: 2000 up to and including 2008
# 2009 is held back as test data, 2010 has no data
year_array = np.arange(2000, 2008 + 1)

In [None]:
# discrete fourier transform
def DFT(data):
    """Normalised discrete Fourier transform of a 1D series.

    Computes X_k = (1/N) * sum_t data[t] * exp(-2*pi*i*k*t/N) for
    k = 0 .. N-1, where N = len(data) and the samples are taken to sit
    at t = 0 .. N-1.

    Returns
    -------
    (k_array, coefs) : the frequency indices 0 .. N-1 and the matching
        complex coefficients, already divided by N.
    """
    n_samples = len(data)

    # the same integer grid serves as frequency index k and sample index t
    grid = np.arange(n_samples)

    # element (k, t) of the transform matrix is exp(-2*pi*i*k*t/N)
    # (element-wise exponential, not a matrix exponential)
    kt_products = grid[:, None] * grid[None, :]
    transform = np.exp(-2j * np.pi * kt_products / n_samples)

    # row k of the matrix times the data column gives the coefficient for k
    return grid, (transform @ data) / n_samples

# reconstructs the data as a sum of cosines
# with A = |X_k| and phaseshift = arg(X_k)
def reconstruct(t_array, k_array, coefs):
    """Evaluate the cosine series of the selected frequencies on t_array.

    Every k in k_array contributes |coefs[k]| * cos(arg(coefs[k]) + 2*pi*k*t/N),
    with N = len(coefs); coefs is therefore indexed by k itself.

    Returns a 1D float array with one value per entry of t_array
    (all zeros when k_array is empty).
    """
    n_coefs = len(coefs)
    signal = np.zeros(len(t_array))

    # accumulate one full cosine wave (over all t at once) per frequency
    for k in k_array:
        coefficient = coefs[k]
        amplitude = np.abs(coefficient)
        shift = np.angle(coefficient)
        signal += amplitude * np.cos(shift + 2 * np.pi * k * t_array / n_coefs)

    return signal

In [None]:
# every year is cut to its first 365 rows, so leap years
# give series of the same length as ordinary years
n_data = 365

# one [var, k] coefficient matrix per training year
coefs_list = []
for year in year_array:
    df_selection = df_bilt.loc[df_bilt['year'] == year].iloc[:n_data]

    # transform each variable separately; rows follow the order of `variables`
    coefs_matrix = []
    for variable_name in variables:
        k, coefs = DFT(df_selection[variable_name].to_numpy())
        coefs_matrix.append(coefs)

    # stored as [var, k], the column index being k
    coefs_list.append(np.array(coefs_matrix))

In [None]:
# average the fourier coefs from 2000-2008 -> [var, k]
coefs_matrix_mean = np.mean(coefs_list, axis=0)

# amplitude spectrum per variable
# the horizontal axis is the frequency index k (not time) and the
# vertical axis the magnitude |X_k|, so the labels say exactly that
for i, variable_name in enumerate(variables):
    fig, ax = plt.subplots()
    ax.plot(range(n_data), np.abs(coefs_matrix_mean[i, :]))
    ax.set_yscale('log')
    ax.set_title(variable_name)
    ax.set_xlabel('k')
    ax.set_ylabel(r'$|X_k|$')

In [None]:
# set threshold
# freqs responsible for less than 0.5% of the summed amplitude are discarded
threshold = 0.005

# frequency indices belonging to the columns of coefs_matrix_mean
# (built here instead of relying on a loop variable left over from another cell)
k_all = np.arange(coefs_matrix_mean.shape[1])

k_array = []
for i in range(n_var):
    # relative weight of every coefficient within this variable's spectrum
    coefs_abs = np.abs(coefs_matrix_mean[i, :])
    coefs_relative = coefs_abs / np.sum(coefs_abs)

    # keep only the frequencies that matter enough
    k_array.append(k_all[coefs_relative > threshold])

# the number of kept frequencies differs per variable,
# so the result is not matrix-like and dtype must be set
k_array = np.array(k_array, dtype=object)

In [None]:
# time axis of a 'full' year: day 0 .. 364
t_array = np.arange(n_data)

# observed data per year as [year, var, t]
# training years first, the 2009 test year as last entry
data_list = []
for year in [*year_array, 2009]:

    # first n_data rows of this year
    df_selection = df_bilt.loc[df_bilt['year'] == year].iloc[:n_data]

    # one row per variable -> [var, t]
    data_matrix = np.array(
        [np.array(df_selection[variable_name]) for variable_name in variables]
    )
    data_list.append(data_matrix)

# Fourier prediction as [var, t]
# reconstruct expects the full coefficient row (indexed by k)
# together with the list of frequencies that were kept for that variable
data_fourier_matrix = np.array([
    reconstruct(t_array, kept_frequencies, mean_coefs)
    for kept_frequencies, mean_coefs in zip(k_array, coefs_matrix_mean)
])

In [None]:
# Fourier prediction against the held-back 2009 observations, one figure per variable
for i, variable_name in enumerate(variables):
    prediction = reconstruct(t_array, k_array[i], coefs_matrix_mean[i, :])

    fig, ax = plt.subplots()
    ax.errorbar(t_array, prediction, label='prediction')
    ax.errorbar(t_array, df_bilt2009[variable_name][:n_data], fmt='o', label='observed 2009')
    ax.set_xlabel('t')
    ax.set_ylabel(variable_name)
    # without this call the labels given above are never displayed
    ax.legend()

In [None]:
# residuals of the training years with respect to the Fourier prediction
# data_list[:-1] leaves out the last entry (the 2009 test year);
# broadcasting against [var, t] gives [year, var, t]
fourier_residuals = np.asarray(data_list[:-1]) - data_fourier_matrix

# put the years behind each other along the time axis -> [var, all t]
big_fourier_residuals = np.concatenate(list(fourier_residuals), axis=1)

In [None]:
def var_initial_norm(params, data, n_var):
    """Cost of a first-order VAR matrix on a [var, t] series.

    params is the flattened (n_var * n_var) transition matrix. Every column
    of data except the last is mapped one step ahead and compared with the
    column that actually follows it; the norm (np.linalg.norm default) of
    all mismatches is returned.
    """
    transition = params.reshape((n_var, n_var))
    current, following = data[:, :-1], data[:, 1:]
    return np.linalg.norm(following - transition @ current)

def fit_var(data):
    """Fit the transition matrix M of a first-order VAR model x[t+1] = M x[t].

    Parameters
    ----------
    data : array-like of shape [var, t]
        Series to fit; column t is the state at time t.

    Returns
    -------
    M_fit : ndarray of shape [var, var]
        The matrix minimising the norm of data[:, 1:] - M @ data[:, :-1].

    The objective is an ordinary linear least-squares problem, so it is solved
    exactly with np.linalg.lstsq instead of a derivative-free search over
    var * var parameters. The size of M follows from the data itself rather
    than from a global variable.
    """
    data = np.asarray(data)
    current, following = data[:, :-1], data[:, 1:]

    # M @ current = following  <=>  current.T @ M.T = following.T,
    # which is the A @ x = b form lstsq expects (solved column by column)
    M_transposed, *_ = np.linalg.lstsq(current.T, following.T, rcond=None)

    return M_transposed.T

# VAR matrix fitted on the Fourier residuals of the training years
M_fit = fit_var(data=big_fourier_residuals)

In [None]:
# return the fourier prediction vector for a single t
# quite slow
def F(t, k_array, coefs_matrix):
    """Fourier prediction of every variable at one time t.

    k_array holds, per variable, the frequencies to include; coefs_matrix
    holds, per variable, the coefficient row indexed by k. Returns a 1D
    float array with one value per variable.
    """
    n_coefs = coefs_matrix.shape[1]
    prediction = np.zeros(len(k_array))

    for var_index, (k_list, coefs) in enumerate(zip(k_array, coefs_matrix)):
        # sum the cosine of every kept frequency for this variable
        total = prediction[var_index]
        for k in k_list:
            coefficient = coefs[k]
            amplitude = np.abs(coefficient)
            shift = np.angle(coefficient)
            total += amplitude * np.cos(shift + 2 * np.pi * k * t / n_coefs)
        prediction[var_index] = total

    return prediction

In [None]:
# Fourier prediction for an entire 'year' (365 days), stored as [var, T]
# each call to F yields the column belonging to one t
F_array = np.empty((len(k_array), len(t_array)))
for column, t in enumerate(t_array):
    F_array[:, column] = F(t, k_array, coefs_matrix_mean)

In [None]:
# residuals of the combined Fourier + VAR one-step prediction
# only the training years (the first len(year_array) entries of data_list)
fourier_current, fourier_next = F_array[:, :-1], F_array[:, 1:]

residuals_matrix = []
for observed in data_list[:len(year_array)]:
    # prediction for t+1: Fourier value plus the VAR-propagated deviation at t
    one_step_ahead = fourier_next + M_fit @ (observed[:, :-1] - fourier_current)
    residuals_matrix.append(observed[:, 1:] - one_step_ahead)

# all years behind each other -> [var, all t]
residuals_matrix = np.hstack(residuals_matrix)

# spread of the residuals per variable
std = np.std(residuals_matrix, axis=1)

# cell output: residuals of the first time step
residuals_matrix[:, 0]

In [None]:
# residual histogram per variable, compared with a zero-mean
# normal density that has the same standard deviation
for i, variable_name in enumerate(variables):
    sigma = std[i]
    grid = np.linspace(-4 * sigma, 4 * sigma, 1000)

    fig, ax = plt.subplots()
    ax.plot(grid, scipy.stats.norm.pdf(grid, 0, sigma), label='normal distribution')
    ax.hist(residuals_matrix[i, :], density=True, label='residuals')
    ax.set_xlabel(variable_name)
    ax.set_ylabel('density')
    ax.legend()

In [None]:
# fourier prediction + correction by VAR
# did not work (maybe because correlation disappears when taking residuals)
def weather_hybrid(x, M, t, F_array, std, rng=None):
    """One stochastic step of the Fourier + VAR model.

    Parameters
    ----------
    x : state vector at time t-1 (one value per variable)
    M : VAR transition matrix applied to the deviation from the Fourier value
    t : column of F_array to predict; column t-1 is used as the reference for x
        (for t = 0 this wraps around to the last column)
    F_array : Fourier prediction as [var, t]
    std : standard deviation of the Gaussian noise, per variable
    rng : optional np.random.Generator; when omitted the global numpy
        random state is used, exactly as before

    Returns
    -------
    The predicted state vector at time t, including one noise draw.
    """
    # a dedicated generator makes a simulation reproducible
    # without touching the global random state
    noise = np.random.normal(0, std) if rng is None else rng.normal(0, std)
    return F_array[:, t] + M @ (x - F_array[:, t-1]) + noise

In [None]:
# fixed seed (arbitrary value) so the simulated ensemble, the confidence
# interval and every figure built on them are identical on each run
np.random.seed(0)

# create prediction list [prediction, var, t]
prediction_list = []
n_predictions = 1000
for j in range(n_predictions):

    # first t is last 'known' datapoint: day 0 of the test year
    prediction = [data_list[-1][:, 0]]

    # every further day is simulated from the previously simulated day
    for t in t_array[1:]:
        prediction.append(weather_hybrid(prediction[-1], M_fit, t, F_array, std))

    # transpose to get [var, t]
    prediction = np.array(prediction).T
    prediction_list.append(prediction)

# [prediction, var, t]
prediction_matrix = np.array(prediction_list)

# calculate 95% CI with [var, t] (2.5 and 97.5 percentile over the ensemble)
lower = np.percentile(prediction_matrix, 2.5, axis=0)
mean = np.mean(prediction_matrix, axis=0)
upper = np.percentile(prediction_matrix, 97.5, axis=0)

In [None]:
# ensemble, Fourier prediction, 95% CI and observations for every variable
for i, variable_name in enumerate(variables):
    fig, ax = plt.subplots()

    # one faint line per simulated trajectory
    for trajectory in prediction_matrix[:, i, :]:
        ax.errorbar(t_array, trajectory, color='tab:blue', alpha=0.1)

    # single marker-less point carrying the label
    # (presumably only there to give the ensemble an opaque legend entry)
    ax.errorbar(t_array[-1], prediction_matrix[0, i, 0], color='tab:blue', label=f'{n_predictions} predictions')

    ax.errorbar(t_array, F_array[i, :], label='Fourier', color='tab:orange')
    ax.errorbar(t_array, lower[i, :], color='black', fmt='--', label='95% CI')
    ax.errorbar(t_array, upper[i, :], color='black', fmt='--')
    ax.errorbar(t_array, mean[i, :], color='black', fmt='-', label='prediction mean')
    ax.errorbar(t_array, data_list[-1][i, :], label='observed', color='tab:red', fmt='o')

    ax.set_xlabel('days')
    ax.set_ylabel(variable_name)
    ax.set_xlim(0, 20)
    ax.legend(loc=1)

In [None]:
# mean temperature of 2009: observations against the Fourier prediction (full year)
# an explicit figure is created so nothing is drawn onto a figure that
# happens to be still current when this runs outside the inline backend
fig, ax = plt.subplots()

# row of F_array that belongs to 'temp_mean' (rows follow the order of `variables`)
temp_mean_index = variables.index('temp_mean')

# observations are cut to n_data rows so they always match t_array
ax.errorbar(t_array, df_bilt2009['temp_mean'][:n_data], label='observed', fmt='o', color='tab:blue', markersize=4)
ax.errorbar(t_array, F_array[temp_mean_index, :], label='Fourier prediction', fmt='-', color='black')
ax.set_xlim(-10, 400)
ax.set_ylim(-5, 30)
ax.set_xlabel('t')
ax.set_ylabel(r'$y(t)$')
ax.set_title('Mean temperature 2009')
ax.legend()
fig.savefig('Figures/mean_temperature_2009_fourier_prediction_t365.png', dpi=600)

In [None]:
# mean temperature of 2009: observations against the Fourier prediction (first 50 days)
# an explicit figure is created so nothing is drawn onto a figure that
# happens to be still current when this runs outside the inline backend
fig, ax = plt.subplots()

# row of F_array that belongs to 'temp_mean' (rows follow the order of `variables`)
temp_mean_index = variables.index('temp_mean')

# observations are cut to n_data rows so they always match t_array
ax.errorbar(t_array, df_bilt2009['temp_mean'][:n_data], label='observed', fmt='o', color='tab:blue', markersize=4)
ax.errorbar(t_array, F_array[temp_mean_index, :], label='Fourier prediction', fmt='-', color='black')
ax.set_xlim(0, 50)
ax.set_ylim(-4, 8)
ax.set_xlabel('t')
ax.set_ylabel(r'$y(t)$')
ax.set_title('Mean temperature 2009')
ax.legend()
fig.savefig('Figures/mean_temperature_2009_fourier_prediction_t50.png', dpi=600)

In [None]:
# mean temperature (index 8 in `variables`), first 50 days of 2009
i = 8
variable_name = variables[i]
fig, ax = plt.subplots()

# one faint line per simulated trajectory
for trajectory in prediction_matrix[:, i, :]:
    ax.errorbar(t_array, trajectory, color='tab:blue', alpha=0.1)

# single marker-less point carrying the ensemble label
# (presumably a legend proxy with full opacity)
ax.errorbar(t_array[-1], prediction_matrix[0, i, 0], color='tab:blue', label=f'{n_predictions} predictions')

ax.errorbar(t_array, F_array[i, :], label='Fourier', color='tab:orange')
ax.errorbar(t_array, lower[i, :], color='black', fmt='--', label='95% CI')
ax.errorbar(t_array, upper[i, :], color='black', fmt='--')
ax.errorbar(t_array, mean[i, :], color='black', fmt='-', label='prediction mean')
ax.errorbar(t_array, data_list[-1][i, :], label='observed', color='tab:red', fmt='o')

ax.set_xlabel('t (days)')
ax.set_ylabel('Mean temperature (C)')
ax.set_xlim(0, 50)
ax.set_ylim(-10, 40)
ax.set_title('')
ax.legend(loc=1)
fig.savefig('Figures/mean_temperature_2009_fourier_var_prediction_t50.png', dpi=600)

In [None]:
# mean temperature (index 8 in `variables`), whole year 2009
i = 8
variable_name = variables[i]
fig, ax = plt.subplots()

# one faint line per simulated trajectory
for trajectory in prediction_matrix[:, i, :]:
    ax.errorbar(t_array, trajectory, color='tab:blue', alpha=0.1)

# single marker-less point carrying the ensemble label
# (presumably a legend proxy with full opacity)
ax.errorbar(t_array[-1], prediction_matrix[0, i, 0], color='tab:blue', label=f'{n_predictions} predictions')

ax.errorbar(t_array, F_array[i, :], label='Fourier', color='tab:orange')
ax.errorbar(t_array, lower[i, :], color='black', fmt='--', label='95% CI')
ax.errorbar(t_array, upper[i, :], color='black', fmt='--')
ax.errorbar(t_array, mean[i, :], color='black', fmt='-', label='prediction mean')
ax.errorbar(t_array, data_list[-1][i, :], label='observed', color='tab:red', fmt='o', markersize=3)

ax.set_xlabel('t')
ax.set_ylabel('y(t)')
ax.set_xlim(0, 380)
ax.set_ylim(-10, 40)
ax.set_title('2009')
ax.legend(loc=1)
fig.savefig('Figures/mean_temperature_2009_fourier_var_prediction_t365.png', dpi=600)

In [None]:
# cloud cover (index 0 in `variables`), first 50 days of 2009
i = 0
variable_name = variables[i]
fig, ax = plt.subplots()

# one faint line per simulated trajectory
for trajectory in prediction_matrix[:, i, :]:
    ax.errorbar(t_array, trajectory, color='tab:blue', alpha=0.1)

# single marker-less point carrying the ensemble label
# (presumably a legend proxy with full opacity)
ax.errorbar(t_array[-1], prediction_matrix[0, i, 0], color='tab:blue', label=f'{n_predictions} predictions')

ax.errorbar(t_array, F_array[i, :], label='Fourier', color='tab:orange')
ax.errorbar(t_array, lower[i, :], color='black', fmt='--', label='95% CI')
ax.errorbar(t_array, upper[i, :], color='black', fmt='--')
ax.errorbar(t_array, mean[i, :], color='black', fmt='-', label='prediction mean')
ax.errorbar(t_array, data_list[-1][i, :], label='observed', color='tab:red', fmt='o')

ax.set_xlabel('t (days)')
ax.set_ylabel('Cloud coverage (okta)')
ax.set_xlim(0, 50)
ax.set_ylim(-2, 14)
ax.set_title('')
ax.legend(loc=1)
fig.savefig('Figures/cloud_cover_2009_fourier_var_prediction_t50.png', dpi=600)

In [None]:
# cloud cover (index 0 in `variables`), whole year 2009
i = 0
variable_name = variables[i]
fig, ax = plt.subplots()

# one faint line per simulated trajectory
for trajectory in prediction_matrix[:, i, :]:
    ax.errorbar(t_array, trajectory, color='tab:blue', alpha=0.1)

# single marker-less point carrying the ensemble label
# (presumably a legend proxy with full opacity)
ax.errorbar(t_array[-1], prediction_matrix[0, i, 0], color='tab:blue', label=f'{n_predictions} predictions')

ax.errorbar(t_array, F_array[i, :], label='Fourier', color='tab:orange')
ax.errorbar(t_array, lower[i, :], color='black', fmt='--', label='95% CI')
ax.errorbar(t_array, upper[i, :], color='black', fmt='--')
ax.errorbar(t_array, mean[i, :], color='black', fmt='-', label='prediction mean')
ax.errorbar(t_array, data_list[-1][i, :], label='observed', color='tab:red', fmt='o', markersize=3)

ax.set_xlabel('t (days)')
ax.set_ylabel('Cloud coverage (okta)')
ax.set_xlim(0, 400)
ax.set_ylim(-2, 18)
ax.set_title('')
ax.legend(loc=1)
fig.savefig('Figures/cloud_cover_2009_fourier_var_prediction_t365.png', dpi=600)