In [120]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy

In [121]:
df_bilt = pd.read_csv('Data/de_bilt_weather.csv')

# Shift days so that the first day starts from zero
df_bilt['days'] = df_bilt['days'] - df_bilt['days'].min()

variables = ['cloud_cover', 'wind_speed', 'wind_gust', 'humidity',
             'pressure', 'global_radiation', 'precipitation', 'sunshine', 
             'temp_mean', 'temp_min', 'temp_max'] 

n_var = len(variables)

In [122]:
#Training on year of data, testing on 10 days
train_range = 365
test_range = 10

In [123]:
# Extracting data with shape (n_var, train_range+test_range)
data = []
for variable_name in variables:
    data.append(df_bilt[variable_name][:train_range + test_range])
data = np.vstack(data)

In [124]:
# Spliting training and test sets
train_data = data[:, :train_range]   
test_data = data[:, train_range:]  

In [125]:
#Cost function for the optimizer
def var_initial_norm(params, data, means, n_var):
    c = params[:n_var]
    matrix = params[n_var:].reshape(n_var, n_var)
    prediction = c[:, None] + matrix @ data[:, :-1]
    residuals = data[:, 1:] - prediction
    return np.linalg.norm(residuals / means[:, None])

In [126]:
#Initialize c and M to zero
params = np.zeros(n_var + n_var**2)
means = np.mean(train_data, axis=1)

#Scipy's optimizer fits model parameters by minimizing cost function
result = scipy.optimize.minimize(var_initial_norm, params, method='Powell', args=(train_data, means, n_var))

#Represents the fitted intercepts and coefficient matrix
c_var = result.x[:n_var]
M_var = result.x[n_var:].reshape(n_var, n_var)

# Calculate residuals
residuals = train_data[:, 1:] - (c_var[:, None] + M_var @ train_data[:, :-1])

#Generating uncertainty in predictions
std_var = np.std(residuals, axis=1)

In [127]:
#Produces predictions for next day given today, fits model plus normally distributed noise for uncertainty.
def weather_var(x, c, M, std):
    return c + M @ x + np.random.normal(0, std)

#Time indices for training and testing
t_train_data = np.arange(train_range)
t_test_data = np.arange(train_range, train_range + test_range)

#Number of simulation runs to capture uncertainty
prediction_list = []
n_predictions = 1000

for j in range(n_predictions):
    #First test day is initial condition
    prediction = [test_data[:, 0]]
    #Predict forward for remaining test days
    for _ in t_test_data[1:]:
        prediction.append(weather_var(prediction[-1], c_var, M_var, std_var))
    prediction = np.array(prediction).T
    prediction_list.append(prediction)

#List of predictions converting to array with shape (n_predictions, n_var, test_range)
prediction_matrix = np.array(prediction_list)
mean = np.mean(prediction_matrix, axis=0)

In [128]:
#Placing mean of predictions in a DF with variables as columns and days as rows
predictions_df = pd.DataFrame(mean.T, columns=variables)

#Correlation matrix of the VARs predictions
corr_matrix = predictions_df.corr()

#Correlation threshold
threshold = 0.5

# Identifying pairs of variables with high correlation
high_corr_pairs = []
for i, var1 in enumerate(variables):
    for var2 in variables[i + 1:]:
        corr_value = corr_matrix.loc[var1, var2]
        if abs(corr_value) > threshold:
            high_corr_pairs.append((var1, var2, corr_value))

In [129]:
#Creating dict from the test period of days, actual variable, and var_forecast of variable
days_test = df_bilt['days'].iloc[train_range:train_range + test_range].values
data_dict = {"days": days_test}

# Adding actual observed values for each variable
for i, variable_name in enumerate(variables):
    data_dict[f"actual_{variable_name}"] = test_data[i, :]

#Adding VAR mean predictions for each variable
for i, variable_name in enumerate(variables):
    data_dict[f"var_forecast_{variable_name}"] = mean[i, :]

df_var = pd.DataFrame(data_dict)

In [130]:
#Importing Standalone LRM mean prediciton values 
files = [
    'Testing/LRM_Data/cloud_cover.csv',
    'Testing/LRM_Data/global_radiation.csv',
    'Testing/LRM_Data/humidity.csv',
    'Testing/LRM_Data/precipitation.csv',
    'Testing/LRM_Data/pressure.csv',
    'Testing/LRM_Data/sunshine.csv',
    'Testing/LRM_Data/temp_max.csv',
    'Testing/LRM_Data/temp_mean.csv',
    'Testing/LRM_Data/temp_min.csv',
    'Testing/LRM_Data/wind_gust.csv',
    'Testing/LRM_Data/wind_speed.csv',
]

df_lrm_merged = None
#Merging each LRM file on days
for i, f in enumerate(files):
    df_temp = pd.read_csv(f)
    if i == 0:
        df_lrm_merged = df_temp
    else:
        df_lrm_merged = pd.merge(df_lrm_merged, df_temp, on = 'days')

#Merging VAR results with df_lrm_merged on days
df_merged = pd.merge(df_var, df_lrm_merged, on='days', suffixes=("", "_lrm"))

# Rename VAR actual columns to standard names
rename_dict = {}
for var in variables:
    old_name = f"actual_{var}_var"
    new_name = f"actual_{var}"
    if old_name in df_merged.columns:
        rename_dict[old_name] = new_name

df_merged.rename(columns=rename_dict, inplace=True)

#Dropping duplicate actual coloumns
for var in variables:
    col_to_drop = f"actual_{var}_lrm"
    if col_to_drop in df_merged.columns:
        df_merged.drop(columns=col_to_drop, inplace=True)

In [131]:
import math

def norm_cdf(z):
    #cdf of the standard normal variable at point z using the error function (erf).
    return 0.5 * (1 + math.erf(z / math.sqrt(2)))

def diebold_mariano_test(d, alternative = 'two-sided'):
    #Converting np array for vectorized operatoring
    d = np.array(d)
    
    #T is the number of observations of loss differentials in the data
    T = d.size
    d_bar = np.mean(d)
    lag = 1

    #gamma stores autocovariances for the loss differential
    gamma = []
    for k in range(T - 1):
        gamma.append((d[k] - d_bar) * (d[k+1] - d_bar))
    gamma = np.array(gamma)

    #Accounting for potential autocorrelation in the differentials with the Newey-West long-run variance estimate for d.
    var_d = (np.sum((d - d_bar)**2) + 2 * np.sum(gamma[:lag])) / T

    #The Diebold-Mariano (DM) statistic:
    dm_stat = d_bar / np.sqrt(var_d / T)

    # two-sided test: p-value = 2 * (1 - cdf(|dm_stat|))
    if alternative == 'two-sided':
        p_value = 2 * (1 - norm_cdf(abs(dm_stat)))
        # one-sided "less": p-value = cdf(dm_stat)
    elif alternative == 'less':
        p_value = norm_cdf(dm_stat)
    # one-sided "greater": p-value = 1 - cdf(dm_stat)
    elif alternative == 'greater':
        p_value = 1 - norm_cdf(dm_stat)

    return dm_stat, p_value

def run_dm_test_for_variable(df, variable):
    # Extract data for this variable
    actual = df[f"actual_{variable}"].values
    var_forecast = df[f"var_forecast_{variable}"].values
    lrm_forecast = df[f"lrm_forecast_{variable}"].values
    
    #Find squared errors of predictions
    var_errors = (actual - var_forecast)**2
    lrm_errors = (actual - lrm_forecast)**2
    
    #Loss differential = LRM loss - VAR loss
    d = lrm_errors - var_errors
    
    # Run the DM test
    dm_stat, p_value = diebold_mariano_test(d, alternative='two-sided')
    
    return {"variable": variable, "DM_stat": dm_stat, "p_value": p_value}

#Iterating for each variable
results = []
for var in variables:
    res = run_dm_test_for_variable(df_merged, var)
    results.append(res)

df_results = pd.DataFrame(results)

In [132]:
#Importing the LRM R^2 file for presentation comparison
df_r2 = pd.read_csv("Testing/LRM_Data/R_2.csv", header = 0)

In [133]:
#Merging df_results with df_r2
df_merged_r2 = pd.merge(df_results, df_r2, left_on='variable', right_on='Variable', how='left')

#Creating column LRM Training R^2 > 0.85
df_merged_r2['LRM Training R^2 > 0.85'] = ''
#Mark coloumn with x if condition met
df_merged_r2.loc[df_merged_r2['R^2 On Training Data'] > 0.85, 'LRM Training R^2 > 0.85'] = 'x'

#Identify VAR variables with |r| > 0.5
high_corr_vars = set()
for v1, v2, corr_val in high_corr_pairs:
    high_corr_vars.add(v1)
    high_corr_vars.add(v2)

#Creating column for VAR correlation condition
df_merged_r2['VAR Correlation |r| > 0.5'] = ''
#Mark column with x if condition met
df_merged_r2.loc[df_merged_r2['variable'].isin(high_corr_vars), 'VAR Correlation |r| > 0.5'] = 'x'

final_df = df_merged_r2[['Variable', 'DM_stat', 'p_value', 'LRM Training R^2 > 0.85', 'VAR Correlation |r| > 0.5']]

styled = final_df.style.set_table_styles([{'selector': 'th', 'props': [('text-align', 'left')]}]) \
                  .set_properties(**{'text-align': 'left'}) \
                  .hide(axis='index')
styled

Variable,DM_stat,p_value,LRM Training R^2 > 0.85,VAR Correlation |r| > 0.5
cloud_cover,0.736613,0.461358,,x
wind_speed,1.849628,0.064367,,x
wind_gust,0.894339,0.371141,,x
humidity,3.174162,0.001503,x,x
pressure,-3.635675,0.000277,x,x
global_radiation,4.377473,1.2e-05,x,x
precipitation,1.455811,0.145445,,x
sunshine,2.833858,0.004599,,x
temp_mean,-2.494775,0.012604,x,x
temp_min,-1.381523,0.167118,x,x
