In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.subplots as sp
from influxdb_client import InfluxDBClient, Point
from influxdb_client.client.write_api import SYNCHRONOUS
import json
from datetime import datetime
import plotly.express as px
import tensorflow as tfs

from astral import LocationInfo
from astral.sun import sun
import datetime
import pytz

from scipy.signal import savgol_filter
import plotly.graph_objects as go

import requests


# weather data

In [None]:
# Load the first 10-minute AWS weather export, indexed by its parsed 'timestamp' column.
# NOTE(review): absolute, user-specific path - this cell only runs on the author's machine.
file_path = r"C:\Users\samr0\OneDrive - KU Leuven\Documents\!School\master\Thesis\data\aws_10min.csv"

df = pd.read_csv(file_path, index_col='timestamp', parse_dates=True)
cutoff_timestamp = "2022-06-19 04:20:00"

# Keep everything up to the cutoff (label-based .loc slicing includes the end label).
df = df.loc[:cutoff_timestamp]
df

In [None]:
# Load the second AWS weather export ("rest" file), indexed by its parsed 'timestamp' column.
# NOTE(review): absolute, user-specific path; this cell also rebinds file_path and cutoff_timestamp.
file_path = r"C:\Users\samr0\OneDrive - KU Leuven\Documents\!School\master\Thesis\data\aws_10min_rest.csv"

df2 = pd.read_csv(file_path, index_col='timestamp', parse_dates=True)
cutoff_timestamp = "2022-06-19 04:30:00"

# Keep everything from the cutoff onwards (the start label is included).
df2 = df2.loc[cutoff_timestamp:]
df2

In [None]:
# Stack the two exports row-wise into one frame (rows of df first, then rows of df2).
df_combined = pd.concat([df, df2])
df_combined

In [None]:
# Sanity check: count how often each gap between consecutive index timestamps occurs
# (the first difference is NaT and is dropped).
time_diffs = df_combined.index.to_series().diff().dropna()
print(time_diffs.value_counts())

In [None]:
def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two points (haversine formula).

    All four arguments are angles in degrees. They are passed straight to
    numpy functions, so scalars and array-likes are combined element-wise.
    """
    earth_radius_km = 6371

    # work in radians
    phi1 = np.radians(lat1)
    lam1 = np.radians(lon1)
    phi2 = np.radians(lat2)
    lam2 = np.radians(lon2)

    half_dphi = (phi2 - phi1) / 2.0
    half_dlam = (lam2 - lam1) / 2.0

    hav = np.sin(half_dphi) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam) ** 2
    central_angle = 2 * np.arctan2(np.sqrt(hav), np.sqrt(1 - hav))
    return earth_radius_km * central_angle

In [None]:
# List the distinct station codes present in the combined weather data.
print(df_combined['code'].unique())

In [None]:
# One row per station: the first record seen for each station code.
# .copy() detaches the result from df_combined so the column assignment below
# writes to an independent frame instead of triggering SettingWithCopyWarning.
df_unique_stations = df_combined.drop_duplicates(subset="code", keep="first").copy()

# Pull the two numbers out of the "POINT (a b)" text in the_geom.
# NOTE(review): the first number is stored as 'lat' and the second as 'lon' -
# confirm that the_geom really lists latitude first.
df_unique_stations[['lat', 'lon']] = df_unique_stations['the_geom'].str.extract(r'POINT \(([^ ]+) ([^ ]+)\)').astype(float)

df_unique_stations

In [None]:
#read in the households file with lats and longs + only the ids that are in the database
#json files
# NOTE(review): absolute, user-specific path - this cell only runs on the author's machine.
file_path = r"C:\Users\samr0\OneDrive - KU Leuven\Documents\!School\master\Thesis\data\households_in_database.json"
#read JSON into a dataFrame
df_households = pd.read_json(file_path)

df_households.head()

In [None]:
pd.set_option('display.max_columns', None)

# Household whose position is the reference point for all station distances.
household_id = "7847f5f7"
row = df_households[df_households["id"] == household_id]
if row.empty:
    raise ValueError(f"household {household_id!r} not found in df_households")

# Use plain floats, not one-element Series: with Series inputs every haversine
# call returns a Series, and the distance column only worked by accident.
lat = float(row["latitude"].iloc[0])
lon = float(row["longitude"].iloc[0])

# haversine is built from numpy functions, so it can take the whole station
# columns at once; no row-wise apply is needed.
df_unique_stations = (
    df_unique_stations
    .assign(distance_km=haversine(lat, lon, df_unique_stations['lat'], df_unique_stations['lon']))
    .sort_values("distance_km")
)
df_unique_stations

In [None]:
# Station codes in the current row order of df_unique_stations.
np.array(df_unique_stations["code"])

In [None]:
# add the ghi values of the database
# NOTE(review): absolute, user-specific path - this cell only runs on the author's machine.
file_path = r"C:\Users\samr0\OneDrive - KU Leuven\Documents\!School\master\Thesis\data\meteoStationsDatabaseData.csv"

meteoStationsData = pd.read_csv(file_path, index_col='_time', parse_dates=True)
# Station code = last four characters of nodeId, converted to int.
meteoStationsData["code"] = meteoStationsData["nodeId"].str[-4:].astype(int)
meteoStationsData

In [None]:
# Sanity check: distribution of gaps between consecutive index timestamps of the database export.
# This rebinds the notebook-level name time_diffs.
time_diffs = meteoStationsData.index.to_series().diff().dropna()
print(time_diffs.value_counts())

In [None]:
# Hard-coded list of station codes to keep.
# NOTE(review): presumably the codes present in meteoStationsData - verify against that file.
codesInDatabase = [6434, 6438, 6455, 6459, 6464, 6472, 6477, 6484]
#get only data of stations in the database
df_meteo = df_combined[df_combined["code"].isin(codesInDatabase)]
df_meteo

In [None]:
# Join the AWS weather rows with the database rows on (timestamp, station code).
df_meteo.index = pd.to_datetime(df_meteo.index)
#reset index to merge on both timestamp and code
meteoStationsData_reset = meteoStationsData.reset_index()
df_meteo_reset = df_meteo.reset_index()

# Give the AWS timestamp column the same name as the database one.
df_meteo_reset.rename(columns={'timestamp': '_time'}, inplace=True)

df_meteo_reset['_time'] = pd.to_datetime(df_meteo_reset['_time'])
# tz_localize labels the timestamps as UTC without shifting them.
# NOTE(review): assumes the AWS timestamps are timezone-naive UTC - confirm.
df_meteo_reset['_time'] = df_meteo_reset['_time'].dt.tz_localize('UTC')


#merge on both timestamp and code
# The inner join keeps only (timestamp, code) pairs present in both frames.
merged_df = pd.merge(meteoStationsData_reset, df_meteo_reset, on=['_time', 'code'], how='inner')

#set timestamp back as index
merged_df.set_index('_time', inplace=True)

merged_df

In [None]:
# Rebuild the station table from the merged data: first row per station code,
# with parsed coordinates and the distance to the household position (lat, lon).
# .copy() detaches the result from merged_df so the column assignments below
# write to an independent frame instead of triggering SettingWithCopyWarning.
df_unique_stations = merged_df.drop_duplicates(subset="code", keep="first").copy()
# NOTE(review): the first number in the_geom is stored as 'lat', the second as 'lon' - confirm the order.
df_unique_stations[['lat', 'lon']] = df_unique_stations['the_geom'].str.extract(r'POINT \(([^ ]+) ([^ ]+)\)').astype(float)
df_unique_stations['distance_km'] = df_unique_stations.apply(lambda row: haversine(lat, lon, row['lat'], row['lon']), axis=1)
# Closest station first.
df_unique_stations = df_unique_stations.sort_values("distance_km")
df_unique_stations

# if not including the ghi data
merged_df = df_combined
merged_df = merged_df.rename_axis('_time')
merged_df['_time'] = pd.to_datetime(merged_df.index)
merged_df['_time'] = merged_df['_time'].dt.tz_localize('UTC')
merged_df

In [None]:
#get 3 closest ones and selection
# Takes the first three rows of df_unique_stations; these are the nearest stations
# only because that frame was sorted by distance_km when it was built.
df_closest = df_unique_stations[:3]

df_closest

In [None]:
# Codes of the selected stations as a plain Python list (used with isin below).
unique_codes = df_closest["code"].unique()
unique_codes_list = unique_codes.tolist()
unique_codes_list

In [None]:
#get all data from nearby stations
# Rebinds df_meteo: from here on it holds the merged rows of the selected stations only.
df_meteo = merged_df[merged_df["code"].isin(unique_codes_list)]
df_meteo

In [None]:
# Single wind-speed column: use the 10 m reading and fall back to the 30 m
# average where the 10 m value is missing.
# assign() returns a new frame, so nothing is written into the filtered slice
# that df_meteo came from (avoids SettingWithCopyWarning).
df_meteo = df_meteo.assign(
    wind_speed=df_meteo['wind_speed_10m'].combine_first(df_meteo['wind_speed_avg_30m'])
)

# Drop identifiers, geometry, QC flags, the extra soil/grass temperatures and
# the two source wind columns that wind_speed replaces.
df_meteo = df_meteo.drop(columns = ["FID", "the_geom", "temp_grass_pt100_avg", "temp_soil_avg_5cm",
                                            "temp_soil_avg_10cm", "temp_soil_avg_20cm", "temp_soil_avg_50cm",
                                            "qc_flags", "wind_speed_10m", "wind_speed_avg_30m"])
df_meteo

In [None]:
#add lat, lon and distance_km
df_selection = df_closest[['code', 'lat', 'lon', 'distance_km']]

# Move the '_time' index into a column so it survives the merge.
df_meteo = df_meteo.reset_index()

# Left join: every weather row gets the coordinates and distance of its station.
df_meteo = pd.merge(df_meteo, df_selection, on="code", how="left")

df_meteo = df_meteo.set_index("_time")
df_meteo

In [None]:
#get average weather taking distance into account

#define a function to calculate the weighted average for a given column
def weighted_average(group, weight_column='distance_km'):
    """Inverse-distance weighted mean of every column in ``group``.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to average (here: all station rows sharing one timestamp).
    weight_column : str
        Column holding the distance; each row is weighted by 1 / distance,
        so closer stations count more. The column is excluded from the result.

    Returns
    -------
    pandas.Series
        One weighted mean per remaining column. Missing values are ignored
        per column: each column is normalised by the weights of the rows that
        actually have a value, so a station with a missing reading no longer
        drags the average toward zero. A column with no values at all yields NaN.
    """
    #calculate the weights as the inverse of distance (closer stations get higher weight)
    weights = 1 / group[weight_column]
    values = group.drop(columns=[weight_column])

    #weighted sum per column; sum() skips missing values
    weighted_sum = values.multiply(weights, axis=0).sum()

    #per-column total of the weights that belong to non-missing values only
    weight_totals = values.notna().multiply(weights, axis=0).sum()

    return weighted_sum / weight_totals

#drop the nodeId, not a number
# NOTE(review): re-running this cell raises KeyError because nodeId is already gone.
df_meteo = df_meteo.drop(columns = ["nodeId"])

#group by timestamp and apply the weighted average function to each group
# Every remaining column except distance_km is averaged, including code, lat and lon.
df_meteo_avg_weighted = df_meteo.groupby(df_meteo.index).apply(weighted_average)

df_meteo_avg_weighted

In [None]:
# Sanity check: distribution of gaps between consecutive timestamps after averaging.
time_diffs = df_meteo_avg_weighted.index.to_series().diff().dropna()
print(time_diffs.value_counts())

In [None]:
# Missing values per column in the averaged weather data.
df_meteo_avg_weighted.isna().sum()

In [None]:
# Resample to 15-minute bins; each bin is the mean of the rows that fall into it.
# This is the frame merged with the inverter data further down.
df_weather_resampled = df_meteo_avg_weighted.resample('15min').mean()
df_weather_resampled

In [None]:
# Alternative 15-minute resampling that forward-fills instead of averaging.
# NOTE(review): df_weather_filled is not referenced by the cells shown here.
df_weather_filled = df_meteo_avg_weighted.resample('15min').ffill()

df_weather_filled

In [None]:
# Count missing values per column after resampling (this cell only inspects them).
df_weather_resampled.isna().sum()

In [None]:
#display rows with NaN values
# A row is selected when at least one of its columns is NaN.
nan_rows = df_weather_resampled[df_weather_resampled.isna().any(axis=1)]

#show the rows containing NaN values
print(nan_rows)



# add the data

In [None]:
# Load the 15-minute inverter power data of household 7847f5f7, indexed by '_time'.
# NOTE(review): absolute, user-specific path - this cell only runs on the author's machine.
file_path = r"C:\Users\samr0\OneDrive - KU Leuven\Documents\!School\master\Thesis\data\inverter_power_data_7847f5f7_normalised_15min.csv"

df_data = pd.read_csv(file_path, index_col='_time', parse_dates=True)
#df_data.index = df_data.index.tz_convert('Europe/Brussels')

df_data

# WITH GHI DATA WE LIMIT THE AMOUNT OF AVAILABLE DATA
# USE LEFT JOIN FOR ALL DATA, BUT THEN NAN VALUES

In [None]:
#combine data with weather
df_data = df_data.reset_index()

df_data['_time'] = pd.to_datetime(df_data['_time'])
# Label the timestamps as UTC without shifting them.
# NOTE(review): tz_localize raises if '_time' is already tz-aware - assumes the CSV holds naive UTC times.
df_data['_time'] = df_data['_time'].dt.tz_localize('UTC')
df_data.set_index('_time', inplace=True)


# Right join on the index: one row per weather timestamp; power columns are NaN
# wherever no inverter row exists for that timestamp.
df_data = pd.merge(df_data, df_weather_resampled, how='right', left_index=True, right_index=True)
df_data

In [None]:
from scipy.signal import savgol_filter
import plotly.graph_objects as go

# Smooth the normalised power series with a Savitzky-Golay filter and plot
# original, smoothed and reconstructed curves.
x = df_data.index
y = df_data['normalized_value']

#apply Savitzky-Golay filter
# Window of 60*4+1 = 241 samples, fitted with a 2nd-order polynomial.
# NOTE(review): y may still contain NaN here (rows are dropped only in a later cell),
# which would propagate into the smoothed curve - confirm.
smoothed_y = savgol_filter(y, window_length=60*4+1, polyorder=2)

#store residuals
residuals = y - smoothed_y

#reconstruct the original signal
# smoothed + residuals is y again by construction, so this trace overlays the original.
reconstructed_y = smoothed_y + residuals

fig2 = go.Figure()

fig2.add_trace(go.Scatter(x=x, y=y, mode='lines', name='original Power'))
fig2.add_trace(go.Scatter(x=x, y=smoothed_y, mode='lines', name='smoothed Power'))
fig2.add_trace(go.Scatter(x=x, y=reconstructed_y, mode='lines', name='reconstructed Power'))

#update layout for better visualization
# NOTE(review): the y-axis label says watts, but the plotted column is 'normalized_value'.
fig2.update_layout(
    title='Savitzky-Golay Smoothing and Reconstruction',
    xaxis_title='Time',
    yaxis_title='Mean Actual Power (W)',
    xaxis_rangeslider_visible=True
)

# show the plot
fig2.show()

In [None]:
#extra features
# Calendar features taken from the DatetimeIndex, plus sine/cosine encodings so
# that the end of a cycle sits next to its start (e.g. hour 23 next to hour 0).

df_data['hour'] = df_data.index.hour
df_data['day_of_week'] = df_data.index.dayofweek
df_data['month'] = df_data.index.month

df_data['hour_sin'] = np.sin(2 * np.pi * df_data['hour'] / 24)
df_data['hour_cos'] = np.cos(2 * np.pi * df_data['hour'] / 24)
df_data['day_of_year'] = df_data.index.dayofyear
# Fixed period of 365 days; in a leap year day 366 wraps slightly past one full cycle.
df_data['day_of_year_sin'] = np.sin(2 * np.pi * df_data['day_of_year'] / 365)
df_data['day_of_year_cos'] = np.cos(2 * np.pi * df_data['day_of_year'] / 365)

df_data['minute'] = df_data.index.minute

# Encode the 15-minute intervals within an hour
df_data['minute_sin'] = np.sin(2 * np.pi * df_data['minute'] / 60)
df_data['minute_cos'] = np.cos(2 * np.pi * df_data['minute'] / 60)


#time_windows = ['15min', '30min', '45min']
#for window in time_windows:
#    df_data[f'ma_{window}'] = (
#        df_data['normalized_value']
#        .rolling(window=window, min_periods=1)
#        .mean()
#        .shift(1) #makes it lagged MA
#    )

# Drop rows with NaN values introduced by moving averages
#df_data = df_data.dropna()

In [None]:
# Display the frame with the added time features.
df_data

In [None]:
# Report how many rows have no inverter power, then drop every row that has a NaN in ANY column.
print(df_data["mean_actualPowerTot_W_inverter"].isna().sum())
df_data = df_data.dropna()

In [None]:
from sklearn.preprocessing import MinMaxScaler

#initialize the scaler
scaler = MinMaxScaler()

# Weather and irradiance columns that are rescaled; the power columns are not in this list.
columns_to_normalize = [
    'precip_quantity', 
    'temp_dry_shelter_avg', 
    'temp_soil_avg',
    'wind_direction',
    'wind_gusts_speed', 
    'humidity_rel_shelter_avg', 
    'pressure', 
    'sun_duration', 
    'short_wave_from_sky_avg', 
    'sun_int_avg', 
    'wind_speed',
'diffuseIrradiance_Wpm2',
'directNormalIrradiance_Wpm2',
'globalHorizontalIrradiance_Wpm2'
]

#apply MinMax scaling to the weather/irradiance columns listed above
# NOTE(review): the scaler is fitted on the full frame, before the train/val/test split,
# so its min/max also come from the later validation and test rows.
df_data[columns_to_normalize] = scaler.fit_transform(df_data[columns_to_normalize])
df_data


In [None]:
# Report duplicated timestamps, then keep only the first row for each timestamp.
duplicates = df_data.index.duplicated()
print("number of duplicates: ", duplicates.sum())
df_data = df_data[~df_data.index.duplicated(keep='first')]

In [None]:
#extra shift of 24 hours ago
# shift(freq='D') moves the index labels forward by one day, so after the
# index-aligned assignment each row holds the value from exactly one day earlier.
df_data["normalized_value_shift_24"] = df_data[['normalized_value']].shift(freq='D')
# Rows with no value one day earlier (e.g. the first day) become NaN and are dropped.
df_data = df_data.dropna()
df_data

In [None]:
# Pairwise correlations between all columns; only the column against the actual
# inverter power is plotted, sorted from most positive to most negative.
correlation_matrix = df_data.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix[['mean_actualPowerTot_W_inverter']].sort_values(by='mean_actualPowerTot_W_inverter', ascending=False), 
            annot=True, 
            cmap='coolwarm', 
            vmin=-1, vmax=1,
            cbar_kws={'label': 'Correlation coefficient'})

plt.title("Correlation of features with 'mean_actualPowerTot_W_inverter'")

plt.show()

In [None]:
# Same correlation heatmap with a reduced column set and readable labels.
# drop() and rename() return new frames, so df_data itself is not modified.
df_corr_test = df_data


# Remove columns that should not appear in the figure.
df_corr_test = df_corr_test.drop(columns = ["short_wave_from_sky_avg", "adjusted_P_max", "scaled_adjusted_P_max", "sun_int_avg",
                                           "lat", "lon", "night", "normalized_value", "code", "minute", "hour", "day_of_year"])

# Human-readable names used as axis labels in the heatmap.
df_corr_test = df_corr_test.rename(columns={
    "mean_actualPowerTot_W_inverter": "Mean actual power",
    "globalHorizontalIrradiance_Wpm2": "Global horizontal irradiance",
    "directNormalIrradiance_Wpm2": "Direct normal irradiance",
    "diffuseIrradiance_Wpm2": "Diffuse irradiance",
    "sun_duration": "Sun duration",
    "temp_soil_avg": "Temp soil avg",
    "temp_dry_shelter_avg": "Temp dry shelter avg",
    "day_of_year_sin": "Day of year sin",
    "wind_gusts_speed": "Wind gusts speed",
    "wind_speed": "Wind speed",
    "hour_sin": "Hour sin",
    "normalized_value_shift_24": "Normalized value shift 24",
    "day_of_week": "Day of week",
    "minute_sin": "Minute sin",
    "minute_cos": "Minute cos",
    "precip_quantity": "Precip quantity",
    "wind_direction": "Wind direction",
    "day_of_year_cos": "Day of year cos",
    "hour_cos": "Hour cos",
    "humidity_rel_shelter_avg": "Humidity rel shelter avg"
})


# Rebinds correlation_matrix to the reduced, renamed version.
correlation_matrix = df_corr_test.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix[['Mean actual power']].sort_values(by='Mean actual power', ascending=False), 
            annot=True, 
            cmap='coolwarm', 
            vmin=-1, vmax=1,
            cbar_kws={'label': 'Correlation coefficient'})

plt.title("Correlation of features with 'Mean actual power'")

plt.show()

# train - cv - test split

In [None]:
#parameters
# Window lengths are counted in rows of df_data.
input_steps = 4*8       #rows per input window (8 hours if rows are 15 minutes apart)
output_steps = 1    #predict one timestep ahead
step_size_train = 1     #shift between consecutive training windows
step_size_test = 1      #shift for testing
step_size_val = 1       #shift between consecutive validation windows

#select relevant columns for features and target
# The target column is also the first feature; the recursive forecasting cell
# writes its predictions back into feature position 0.
features = [
    "normalized_value",
    #"mean_actualPowerTot_W_inverter_scaled",
     #"day_of_week", "month", 
    "hour_sin", "hour_cos", 
    #"day_of_year_sin", "day_of_year_cos",
    "minute_sin", "minute_cos",
    "temp_dry_shelter_avg",
    "normalized_value_shift_24",
    'diffuseIrradiance_Wpm2',
    'directNormalIrradiance_Wpm2',
    'globalHorizontalIrradiance_Wpm2'
]

target = "normalized_value"
#target = "mean_actualPowerTot_W_inverter_scaled"

In [None]:
def generate_sliding_window(data, input_steps=1440, output_steps=1, feature_columns=None, target_column=None, step_size=1):
    """
    Generate sliding windows for a given range of data.

    Parameters
    - data: DataFrame to cut into windows (rows are used in their current order).
    - input_steps: Number of timesteps in the input window.
    - output_steps: Number of timesteps to predict.
    - feature_columns: List of feature column names.
    - target_column: Name of the target column.
    - step_size: Shift between consecutive windows.

    Returns four lists of equal length, one entry per window:
    - X: DataFrame slices of the feature columns (input_steps rows each).
    - y: Series slices of the target column (output_steps rows each),
      starting right after the matching input window.
    - X_indices, y_indices: the index labels of those slices.
    If data has fewer than input_steps + output_steps rows, all lists are empty.
    """
    # Select the columns once; doing it inside the loop copied the frame per window.
    feature_frame = data[feature_columns]
    target_series = data[target_column]
    index = data.index

    X, y, X_indices, y_indices = [], [], [], []
    n_starts = len(data) - input_steps - output_steps + 1
    for start in range(0, n_starts, step_size):
        split = start + input_steps      # first row after the input window
        stop = split + output_steps      # first row after the output window

        # input: feature columns over the input window
        X.append(feature_frame.iloc[start:split])
        X_indices.append(index[start:split])  #store corresponding indices

        # target: target column for the output window
        y.append(target_series.iloc[split:stop])
        y_indices.append(index[split:stop])  #store corresponding indices

    print("Generated sliding windows - X.size:", len(X), "y.size:", len(y))
    return X, y, X_indices, y_indices

In [None]:
#define train / validation / test sizes, counted in 15-minute rows
train_size = 20 * 24 * 4  # 20 days in 15 minutes
val_size = 5 * 24 * 4    # 5 days in 15 minutes
test_size = 5 * 24 * 4    # 5 days in 15 minutes

# Build consecutive, non-overlapping (train, val, test) folds.
# Each fold takes train_size + val_size + test_size rows and the next fold starts
# right after it. The range bound includes val_size as well, so a fold is only
# created when all three parts fit; without it the last fold could end up with a
# truncated or empty test part.
fold_size = train_size + val_size + test_size
splits = []
for i in range(0, len(df_data) - fold_size + 1, fold_size):
    val_start = i + train_size
    test_start = val_start + val_size
    train_data = df_data.iloc[i:val_start]
    val_data = df_data.iloc[val_start:test_start]
    test_data = df_data.iloc[test_start:test_start + test_size]
    splits.append((train_data, val_data, test_data))
print("number of splits: ", len(splits))

In [None]:
import tensorflow as tf
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
tf.random.set_seed(1234)
def build_model(input_steps, feature_count):
    """Build and compile the stacked-LSTM regressor.

    The network takes windows of shape (input_steps, feature_count), passes them
    through three LSTM layers (128, 64 and 32 units) and ends in a single-unit
    dense layer. It is compiled with Adam (learning rate 0.0001), MSE loss and
    MAE as an extra metric.
    """
    layers = tf.keras.layers
    stack = [
        layers.Input(shape=(input_steps, feature_count)),
        # the first two LSTM layers return full sequences so the next LSTM can consume them
        layers.LSTM(128, activation='tanh', return_sequences=True),
        layers.LSTM(64, activation='tanh', return_sequences=True),
        layers.LSTM(32, activation='tanh'),
        layers.Dense(1),  #predicting one timestep
    ]
    model = tf.keras.Sequential(stack)

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
    model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
    return model

In [None]:
#data split
# Cut every fold into sliding windows and pool the windows of all folds into
# one train set, one validation set and one test set.

#storage for training, validation, and test data
all_train_X, all_train_X_indices, all_train_y, all_train_y_indices = [], [], [], []
all_val_X, all_val_X_indices, all_val_y, all_val_y_indices = [], [], [], []
all_test_X, all_test_X_indices, all_test_y, all_test_y_indices = [], [], [], []

# loop over all folds to collect training data
fold = 0
for fold, (train_data, val_data, test_data) in enumerate(splits):
    print(f"Processing Fold {fold + 1}")

    #generate sliding windows for training
    df_train_X, df_train_y, df_train_X_indices, df_train_y_indices = generate_sliding_window(train_data, input_steps, output_steps, features, target, step_size_train)
    
    #generate sliding windows for validation
    df_val_X, df_val_y, df_val_X_indices, df_val_y_indices = generate_sliding_window(val_data, input_steps, output_steps, features, target, step_size_val)
    
    #generate sliding windows for testing
    df_test_X, df_test_y, df_test_X_indices, df_test_y_indices = generate_sliding_window(test_data, input_steps, output_steps, features, target, step_size_test)

    # Each window is reduced to a plain numpy array via .values.
    # The loop variable 'df' lives only inside each generator expression and
    # does not overwrite the notebook-level df.

    #append training data
    all_train_X.extend(df[features].values for df in df_train_X)
    all_train_X_indices.extend(df.values for df in df_train_X_indices)
    all_train_y.extend(df.values for df in df_train_y)
    all_train_y_indices.extend(df.values for df in df_train_y_indices)

    #append validation data
    all_val_X.extend(df[features].values for df in df_val_X)
    all_val_X_indices.extend(df.values for df in df_val_X_indices)
    all_val_y.extend(df.values for df in df_val_y)
    all_val_y_indices.extend(df.values for df in df_val_y_indices)

    #append test data
    all_test_X.extend(df[features].values for df in df_test_X)
    all_test_X_indices.extend(df.values for df in df_test_X_indices)
    all_test_y.extend(df.values for df in df_test_y)
    all_test_y_indices.extend(df.values for df in df_test_y_indices)

#convert lists to numpy arrays
# X arrays stack to (windows, input_steps, len(features)); y arrays to (windows, output_steps).
all_train_X, all_train_y = np.array(all_train_X), np.array(all_train_y)
all_train_X_indices, all_train_y_indices = np.array(all_train_X_indices), np.array(all_train_y_indices)

all_val_X, all_val_y = np.array(all_val_X), np.array(all_val_y)
all_val_X_indices, all_val_y_indices = np.array(all_val_X_indices), np.array(all_val_y_indices)

all_test_X, all_test_y = np.array(all_test_X), np.array(all_test_y)
all_test_X_indices, all_test_y_indices = np.array(all_test_X_indices), np.array(all_test_y_indices)

In [None]:
# Cast all window arrays to float32.
all_train_X = all_train_X.astype(np.float32)
all_train_y = all_train_y.astype(np.float32)

all_val_X = all_val_X.astype(np.float32)
all_val_y = all_val_y.astype(np.float32)

all_test_X = all_test_X.astype(np.float32)
all_test_y = all_test_y.astype(np.float32)

In [None]:
# Sanity check on the training inputs only: both counts should be 0.
print(np.isnan(all_train_X).sum())  # Count NaNs
print(np.isinf(all_train_X).sum())  # Count Infs

# benchmarks

In [None]:
#initialize a dictionary to store benchmark results
benchmark_results = {}

#function to compute and store evaluation metrics
def evaluate_benchmark(name, y_true, y_pred):
    """Compute MAE, RMSE, MAPE and R², store them under ``name`` and print them.

    Parameters
    ----------
    name : str
        Key under which the metrics are stored in the module-level
        ``benchmark_results`` dictionary (an existing entry is overwritten).
    y_true, y_pred : array-like
        Actual and predicted values of equal length. Any sequence works
        (list, numpy array, pandas Series); values are compared by position.

    Notes
    -----
    MAPE is computed only over positions where ``y_true`` is non-zero and is
    reported in percent; it is NaN when every ``y_true`` value is zero.
    Nothing is returned.
    """
    #work on flat float arrays so masking is positional and plain lists are accepted
    y_true = np.asarray(y_true, dtype=float).reshape(-1)
    y_pred = np.asarray(y_pred, dtype=float).reshape(-1)

    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    #avoid zero division in MAPE
    valid_mask = y_true != 0
    if valid_mask.any():
        mape = np.mean(np.abs((y_true[valid_mask] - y_pred[valid_mask]) / y_true[valid_mask])) * 100
    else:
        #no non-zero actual value: MAPE is undefined
        mape = np.nan
    r2 = r2_score(y_true, y_pred)

    #store results in dictionary
    benchmark_results[name] = {
        'MAE': mae,
        'RMSE': rmse,
        'MAPE': mape,
        'R²': r2
    }

    #print results
    print(f"  Benchmark: {name}")
    print(f"  Test MAPE: {mape:.2f}%")
    print(f"  Test R²: {r2:.4f}")
    print(f"  Test MAE: {mae:.4f}")
    print(f"  Test RMSE: {rmse:.4f}")
    print("-" * 40)

In [None]:
# Table of test targets: one row per flattened target value, indexed by its timestamp.
all_test_y_flat = all_test_y.reshape(-1)  #ensure it's a 1D array
all_test_y_indices_flat = all_test_y_indices.reshape(-1)  #flatten indices to match

df_test = pd.DataFrame({
    "actual": all_test_y_flat,
}, index=pd.Index(all_test_y_indices_flat, name="timestamp"))

df_test

In [None]:
#benchmark 1: the ideal profile
#shift the 'adjusted_P_max' column forward by one day
# shift(freq='D') moves the index labels, not the values: each timestamp ends up
# holding the adjusted_P_max from exactly one day earlier.
df_shifted = df_data[['adjusted_P_max']].shift(freq='D')

df_shifted.head()

In [None]:
# Benchmark 1 table: test timestamps joined with yesterday's ideal profile and the actual power.
df_benchmark1 = df_test


# reset_index returns a new frame, so df_test itself is left untouched.
df_benchmark1 = df_benchmark1.reset_index()
df_benchmark1['timestamp'] = pd.to_datetime(df_benchmark1['timestamp'])
# Label the timestamps as UTC so they can be matched against the index of df_data.
# NOTE(review): assumes the stored test timestamps are timezone-naive UTC - confirm.
df_benchmark1['timestamp'] = df_benchmark1['timestamp'].dt.tz_localize('UTC')
df_benchmark1.set_index('timestamp', inplace=True)



# Left joins keep every test timestamp; a missing match shows up as NaN.
df_benchmark1 = df_benchmark1.merge(df_shifted[['adjusted_P_max']], left_index=True, right_index=True, how='left')
df_benchmark1 = df_benchmark1.merge(df_data[['mean_actualPowerTot_W_inverter']], left_index=True, right_index=True, how='left')
df_benchmark1

In [None]:
# Report missing values in the two benchmark columns and show the affected rows.
print(df_benchmark1[['mean_actualPowerTot_W_inverter', 'adjusted_P_max']].isna().sum())
print(df_benchmark1[df_benchmark1['adjusted_P_max'].isna()])
#original author's note: the summer-time clock change skips an hour, which leaves NaNs
# Fill the missing predictions with 0. The filled column is assigned back instead
# of calling fillna(inplace=True) on the df[...] selection: that chained form acts
# on a temporary object and leaves the frame unchanged under pandas copy-on-write.
df_benchmark1['adjusted_P_max'] = df_benchmark1['adjusted_P_max'].fillna(0)

In [None]:
# Score benchmark 1: actual inverter power against the ideal profile of the previous day.
evaluate_benchmark("15min_ideal_profile", df_benchmark1['mean_actualPowerTot_W_inverter'] ,df_benchmark1['adjusted_P_max'])

In [None]:
# Plot benchmark 1 (prediction vs. actual power); the x-axis is fixed to 2022-05-05 .. 2022-05-10.
fig_benchmark1 = go.Figure()

fig_benchmark1.add_trace(go.Scatter(x=df_benchmark1.index, y=df_benchmark1['adjusted_P_max'], mode='lines', name='prediction'))
fig_benchmark1.add_trace(go.Scatter(x=df_benchmark1.index, y=df_benchmark1['mean_actualPowerTot_W_inverter'], mode='lines', name='actual'))

#update layout for better visualization
fig_benchmark1.update_layout(
    title='benchmark 1',
    xaxis_title='Time',
    yaxis_title='Mean Actual Power (W)',
    #xaxis_rangeslider_visible=True,
    margin=dict(t=150),  # Increase top margin to fit legend and title
    legend=dict(
        orientation="h",  # horizontal layout
        y=1,           # place it above the plot area
        x=0.5,
        xanchor='center',
        yanchor='bottom'
    ),
    plot_bgcolor='white',
    xaxis=dict(
        showgrid=True,
        gridcolor='lightgray',
        range=['2022-05-05', '2022-05-10']
    ),
    yaxis=dict(
        showgrid=True,
        gridcolor='lightgray'
    )
)

# show the plot
fig_benchmark1.show()

In [None]:
#benchmark 2: 1 day shift
# Persistence baseline: shift(freq='D') moves the index labels forward by one day,
# so each timestamp holds the actual power measured exactly one day earlier.
df_shifted_day = df_data[['mean_actualPowerTot_W_inverter']].shift(freq='D')
df_shifted_day = df_shifted_day.rename(columns={'mean_actualPowerTot_W_inverter': 'mean_actualPowerTot_W_inverter_shifted_1D'})
df_shifted_day                         

In [None]:
# Benchmark 2 table: test timestamps joined with yesterday's actual power and today's actual power.
df_benchmark2 = df_test


# reset_index returns a new frame, so df_test itself is left untouched.
df_benchmark2 = df_benchmark2.reset_index()
df_benchmark2['timestamp'] = pd.to_datetime(df_benchmark2['timestamp'])
# Label the timestamps as UTC so they can be matched against the index of df_data.
# NOTE(review): assumes the stored test timestamps are timezone-naive UTC - confirm.
df_benchmark2['timestamp'] = df_benchmark2['timestamp'].dt.tz_localize('UTC')
df_benchmark2.set_index('timestamp', inplace=True)


# Left joins keep every test timestamp; a missing match shows up as NaN.
df_benchmark2 = df_benchmark2.merge(df_shifted_day[['mean_actualPowerTot_W_inverter_shifted_1D']], left_index=True, right_index=True, how='left')
df_benchmark2 = df_benchmark2.merge(df_data[['mean_actualPowerTot_W_inverter']], left_index=True, right_index=True, how='left')
df_benchmark2

In [None]:
# Report missing values in the two benchmark columns and show the affected rows.
print(df_benchmark2[['mean_actualPowerTot_W_inverter_shifted_1D', 'mean_actualPowerTot_W_inverter']].isna().sum())
print(df_benchmark2[df_benchmark2['mean_actualPowerTot_W_inverter_shifted_1D'].isna()])
#original author's note: the summer-time clock change skips an hour, which leaves NaNs
# Fill the missing predictions with 0. The filled column is assigned back instead
# of calling fillna(inplace=True) on the df[...] selection: that chained form acts
# on a temporary object and leaves the frame unchanged under pandas copy-on-write.
df_benchmark2['mean_actualPowerTot_W_inverter_shifted_1D'] = df_benchmark2['mean_actualPowerTot_W_inverter_shifted_1D'].fillna(0)

In [None]:
# Score benchmark 2: actual inverter power against the power measured one day earlier.
evaluate_benchmark("15min_1day_shift", df_benchmark2['mean_actualPowerTot_W_inverter'] ,df_benchmark2['mean_actualPowerTot_W_inverter_shifted_1D'])

In [None]:
# Plot benchmark 2 (prediction vs. actual power); the x-axis is fixed to 2022-05-05 .. 2022-05-10.
fig_benchmark2 = go.Figure()

fig_benchmark2.add_trace(go.Scatter(x=df_benchmark2.index, y=df_benchmark2['mean_actualPowerTot_W_inverter_shifted_1D'], mode='lines', name='prediction'))
fig_benchmark2.add_trace(go.Scatter(x=df_benchmark2.index, y=df_benchmark2['mean_actualPowerTot_W_inverter'], mode='lines', name='actual'))

#update layout for better visualization
fig_benchmark2.update_layout(
    title='benchmark 2',
    xaxis_title='Time',
    yaxis_title='Mean Actual Power (W)',
    #xaxis_rangeslider_visible=True,
    margin=dict(t=150),  # Increase top margin to fit legend and title
    legend=dict(
        orientation="h",  # horizontal layout
        y=1,           # place it above the plot area
        x=0.5,
        xanchor='center',
        yanchor='bottom'
    ),
    plot_bgcolor='white',
    xaxis=dict(
        showgrid=True,
        gridcolor='lightgray',
        range=['2022-05-05', '2022-05-10']
    ),
    yaxis=dict(
        showgrid=True,
        gridcolor='lightgray'
    )
)

fig_benchmark2.show()

In [None]:
#benchmark 3: autoregressive
# Table of training targets: one row per flattened target value, indexed by its timestamp.
all_train_y_flat = all_train_y.reshape(-1)  #ensure it's a 1D array
all_train_y_indices_flat = all_train_y_indices.reshape(-1)  #flatten indices to match

df_train = pd.DataFrame({
    "actual": all_train_y_flat,
}, index=pd.Index(all_train_y_indices_flat, name="timestamp"))

df_train

In [None]:
#benchmark 3: autoregressief model

from statsmodels.tsa.ar_model import AutoReg

# NOTE(review): this overwrites the notebook-level input_steps (set to 4*8 in the
# parameters cell). Any windowing or model-building cell run after this one sees
# 16, not 32 - consider a separate name for the AR lag count.
input_steps = 4*4

#prepare training data
train_series = df_train['actual']
test_series = df_test['actual']

#fit an autoregressive model
# The number of lags equals input_steps (16 rows).
model = AutoReg(train_series, lags=input_steps, old_names=False)
model_fit = model.fit()

#print model summary
print(model_fit.summary())

In [None]:
#set the number of hours to predict ahead
prediction_horizon = 24  #number of hours to predict ahead

# Coefficients of the fitted AR model: intercept first, then one coefficient per
# lag (lag 1 ... lag p).
# NOTE(review): this layout assumes the model was fitted with a constant trend and
# no seasonal or exogenous terms - confirm if the fitting cell changes.
ar_params = np.asarray(model_fit.params, dtype=float)
ar_intercept = ar_params[0]
ar_lag_coefs = ar_params[1:]
n_lags = len(ar_lag_coefs)

#initialize lists to store predictions for the whole test set
all_predictions_recursive = []
all_predictions_indices = []

#loop over the test set, starting one forecast every 24*4 windows
amount_of_times = 0
for test_idx in range(0, len(all_test_X), 24*4):
    starting_window = all_test_X[test_idx]
    starting_window_indeces = all_test_X_indices[test_idx]
    #print(starting_window_indeces)

    if len(starting_window) < n_lags:
        raise ValueError(f"input window has {len(starting_window)} rows but the AR model needs {n_lags} lags")

    input_window = starting_window
    input_window_indices = starting_window_indeces

    predictions_recursive = []
    predictions_indices = []
    sin_recursive = []
    cos_recursive = []

    for i in range(prediction_horizon * 4):  #total steps in 15minutes intervals
        #predict the next 15-minute step from the last n_lags target values of the
        #current window (column 0 of the window is the target). The previous call
        #to model_fit.predict ignored the window and returned the fitted value at
        #one fixed position of the training series.
        recent_targets = input_window[-n_lags:, 0][::-1]  #most recent first, matching lag 1..p
        prediction = float(ar_intercept + np.dot(ar_lag_coefs, recent_targets))
        predictions_recursive.append(prediction)

        #update time indices for predictions
        next_index = input_window_indices[-1] + pd.Timedelta(minutes=15)
        predictions_indices.append(next_index)

        #update the input window for the next step (shift window to the left)
        #np.roll returns a new array, so the stored test window is not modified
        input_window = np.roll(input_window, -1, axis=0)

        #update the last element of the window with the prediction
        input_window[-1, 0] = prediction

        #update hour sin and cos
        next_hour = next_index.hour
        input_window[-1, 1] = np.sin(2 * np.pi * next_hour / 24)
        input_window[-1, 2] = np.cos(2 * np.pi * next_hour / 24)

        #append sin and cos values
        sin_recursive.append(input_window[-1, 1])
        cos_recursive.append(input_window[-1, 2])

        #update indices
        input_window_indices = np.roll(input_window_indices, -1, axis=0)
        input_window_indices[-1] = next_index

        # print progress every 10 steps and at the last step
        if i % 10 == 0 or i == prediction_horizon * 4 - 1:
            print(f"Prediction progress for test set {test_idx + 1}: Step {i + 1} / {prediction_horizon * 4} ({(i + 1) / (prediction_horizon * 4) * 100:.2f}%)")


    #convert predictions and indices to numpy arrays
    predictions_recursive = np.array(predictions_recursive)
    predictions_indices = np.array(predictions_indices)

    #store predictions for this test example
    all_predictions_recursive.append(predictions_recursive)
    all_predictions_indices.append(predictions_indices)

    amount_of_times += 1

    #if(amount_of_times >= 10):
    #    break


#convert all predictions to arrays for easier handling
all_predictions_recursive = np.array(all_predictions_recursive)
all_predictions_indices = np.array(all_predictions_indices)

print("Prediction completed for all test samples.")


In [None]:
# Flatten all recursive forecasts into one table: one row per predicted step, indexed by its timestamp.
df_results_all = pd.DataFrame({
    "predicted": all_predictions_recursive.reshape(-1)
}, index=pd.Index(all_predictions_indices.reshape(-1), name="timestamp"))

df_results_all

In [None]:
#bring back to normal data
#read in the normalization profile factors

df_results_all = df_results_all.reset_index()
df_results_all['timestamp'] = pd.to_datetime(df_results_all['timestamp'])
# Label the timestamps as UTC so they can be matched against the index of df_data.
# NOTE(review): re-running this cell fails here, because df_results_all is then already tz-aware.
df_results_all['timestamp'] = df_results_all['timestamp'].dt.tz_localize('UTC')
df_results_all.set_index('timestamp', inplace=True)


#merge the two DataFrames on the 'time' index
df_merged_all = df_results_all.merge(df_data[['adjusted_P_max', 'mean_actualPowerTot_W_inverter']], left_index=True, right_index=True, how='left')
#lose some data from full_adjusted_df near end since the df_data doesn't have full last day

#check for missing values (NaN) in adjusted_P_max
if df_merged_all['adjusted_P_max'].isna().any():
    print("Warning: Some values are missing in the normalization profile.")


# currently cut of a part that goes infinite
# NOTE(review): only rows with 0 < predicted < 5 are kept, so the metrics computed
# later from df_merged_all cover just this subset of the forecasts.

df_merged_all = df_merged_all[(df_merged_all["predicted"] > 0) & (df_merged_all["predicted"] < 5)]


#denormalize the predicted value by multiplying it with the 'adjusted_P_max' column
# NOTE(review): this assigns a column on a filtered frame and may emit SettingWithCopyWarning.
df_merged_all['denormalized_value_predicted'] = df_merged_all['predicted'] * df_merged_all['adjusted_P_max']


df_merged_all

In [None]:
# Drop the forecast rows that have no normalisation profile value.
df_merged_all = df_merged_all.dropna(subset=['adjusted_P_max'])

#verify that there are no more NaNs
nan_count_after_drop = df_merged_all['adjusted_P_max'].isna().sum()
print(f"Number of NaN values in 'adjusted_P_max' after drop: {nan_count_after_drop}")

In [None]:
# Plot the denormalised AR forecast against the actual power; the x-axis is fixed to 2022-05-05 .. 2022-05-10.
fig_all_denormalised = go.Figure()

fig_all_denormalised.add_trace(go.Scatter(x=df_merged_all.index, y=df_merged_all['denormalized_value_predicted'], mode='lines', name='predicted'))
fig_all_denormalised.add_trace(go.Scatter(x=df_merged_all.index, y=df_merged_all['mean_actualPowerTot_W_inverter'], mode='lines', name='actual'))

# Update layout for better visualization
fig_all_denormalised.update_layout(
    title='denormalised',
    xaxis_title='Time',
    yaxis_title='Mean Actual Power (W)',
    #xaxis_rangeslider_visible=True,
    margin=dict(t=150),  # Increase top margin to fit legend and title
    legend=dict(
        orientation="h",  # horizontal layout
        y=1,           # place it above the plot area
        x=0.5,
        xanchor='center',
        yanchor='bottom'
    ),
    plot_bgcolor='white',
    xaxis=dict(
        showgrid=True,
        gridcolor='lightgray',
        range=['2022-05-05', '2022-05-10']
    ),
    yaxis=dict(
        showgrid=True,
        gridcolor='lightgray'
    )
)

fig_all_denormalised.show()

In [None]:
# Score benchmark 3: actual inverter power against the denormalised AR forecast.
evaluate_benchmark("15min_autoregressive", df_merged_all['mean_actualPowerTot_W_inverter'] ,df_merged_all['denormalized_value_predicted'])

In [None]:
#function to compute evaluation metrics
def evaluate_benchmark2(y_true, y_pred):
    """Compute MAPE, R², MAE and RMSE and return them as a dictionary.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values of equal length. Any sequence works
        (list, numpy array, pandas Series); values are compared by position.

    Returns
    -------
    dict
        Keys "MAPE" (percent, rounded to 2 decimals), "R²", "MAE" and "RMSE"
        (rounded to 4 decimals). MAPE only uses positions where ``y_true`` is
        non-zero and is NaN when every ``y_true`` value is zero.
    """
    #work on flat float arrays so masking is positional and plain lists are accepted
    y_true = np.asarray(y_true, dtype=float).reshape(-1)
    y_pred = np.asarray(y_pred, dtype=float).reshape(-1)

    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    #avoid zero division in MAPE
    valid_mask = y_true != 0
    if valid_mask.any():
        mape = np.mean(np.abs((y_true[valid_mask] - y_pred[valid_mask]) / y_true[valid_mask])) * 100
    else:
        #no non-zero actual value: MAPE is undefined
        mape = np.nan
    r2 = r2_score(y_true, y_pred)

    results = {
        "MAPE": round(mape, 2),
        "R²": round(r2, 4),
        "MAE": round(mae, 4),
        "RMSE": round(rmse, 4),
    }

    return results

In [None]:
#benchmark 4: average production at the same time of day over the previous 1 to 30 days

In [None]:
#for every lookback length from 1 to 30 days, predict each timestamp as the mean of
#the values observed at the same time of day over the previous `days` days, then
#score that prediction on the test timestamps
benchmark_results = {}

actual_col = 'mean_actualPowerTot_W_inverter'
#the column keeps its historical "7D" name although the lookback length varies,
#because the plotting cell below reads it under this name
avg_col = 'mean_actualPowerTot_W_inverter_7D_timestamp_avg'

#test frame indexed by UTC timestamps; it does not depend on the lookback, so build it once
df_test_utc = (
    df_test.reset_index()
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']).dt.tz_localize('UTC'))
    .set_index('timestamp')
)

#running sum / count of the lagged values: the window for `days` is the window for
#`days - 1` plus one extra lag, so each lag is looked up only once
actual_series = df_data[actual_col]
running_sum = np.zeros(len(actual_series))
running_count = np.zeros(len(actual_series))

for days in range(1, 31):
    #value observed exactly `days` days before each timestamp (NaN when that timestamp is absent)
    lagged = actual_series.reindex(actual_series.index - pd.Timedelta(days=days)).to_numpy(dtype=float)
    observed = ~np.isnan(lagged)
    running_sum += np.where(observed, lagged, 0.0)
    running_count += observed

    #mean over the available lags (same result as np.nanmean up to float summation order);
    #stays NaN where none of the lagged timestamps has a value
    avg_values = np.full(len(actual_series), np.nan)
    np.divide(running_sum, running_count, out=avg_values, where=running_count > 0)

    #no extra one-day shift is applied: the lags already start one day back
    df_avg_shifted = df_data[[actual_col]].copy()
    df_avg_shifted[avg_col] = avg_values
    df_shifted_7d_avg = df_avg_shifted[[avg_col]]

    #attach the prediction and the actual values to the test timestamps
    df_benchmark4 = df_test_utc.merge(df_shifted_7d_avg, left_index=True, right_index=True, how='left')
    df_benchmark4 = df_benchmark4.merge(df_data[[actual_col]], left_index=True, right_index=True, how='left')

    #report missing values, then treat a missing prediction as 0
    print(df_benchmark4[[avg_col, actual_col]].isna().sum())
    #assign the result back: an in-place fillna on a selected column is chained
    #assignment and does not update the frame under pandas Copy-on-Write
    df_benchmark4[avg_col] = df_benchmark4[avg_col].fillna(0)

    #evaluate benchmark
    benchmark_results[days] = evaluate_benchmark2(df_benchmark4[actual_col], df_benchmark4[avg_col])

#output all the benchmark results for each days range (1 to 30)
print("Benchmark results for 1 to 30 days averages:")
for days, result in benchmark_results.items():
    print(f"Days: {days}, Result: {result}")

In [None]:
#compare the benchmark-4 prediction with the measured inverter power;
#df_benchmark4 is the frame left over from the last iteration of the lookback loop
fig_benchmark4 = go.Figure(
    data=[
        go.Scatter(x=df_benchmark4.index, y=df_benchmark4[column], mode='lines', name=label)
        for column, label in (
            ('mean_actualPowerTot_W_inverter_7D_timestamp_avg', 'prediction'),
            ('mean_actualPowerTot_W_inverter', 'actual'),
        )
    ]
)

fig_benchmark4.update_layout(
    title='benchmark 4',
    #larger top margin so the legend and the title both fit above the plot
    margin=dict(t=150),
    #horizontal legend, centred, sitting on the top edge of the plot area
    legend=dict(orientation="h", x=0.5, y=1, xanchor='center', yanchor='bottom'),
    plot_bgcolor='white',
    xaxis=dict(
        title='Time',
        showgrid=True,
        gridcolor='lightgray',
        #initial zoom window on the time axis
        range=['2022-05-05', '2022-05-10'],
    ),
    yaxis=dict(
        title='Mean Actual Power (W)',
        showgrid=True,
        gridcolor='lightgray',
    ),
)

fig_benchmark4.show()

In [None]:
#plot how one evaluation metric of benchmark 4 changes with the lookback length
#(numpy and matplotlib.pyplot are already imported in the notebook's import cell)
metric_name = 'R²'  #key of the plotted metric inside each benchmark_results entry
days_range = range(1, 31)

#one value per lookback; NaN where the lookback or the metric is missing
metric_values = []
for days in days_range:
    value = (benchmark_results.get(days) or {}).get(metric_name)
    metric_values.append(np.nan if value is None else value)
mae_values = metric_values  #previous name of this list, kept in case another cell still reads it

fig_lookback, ax_lookback = plt.subplots(figsize=(10, 6))
ax_lookback.plot(days_range, metric_values, marker='o', color='b', linestyle='-', label=metric_name)

#adding text labels to each point
for day, value in zip(days_range, metric_values):
    if not np.isnan(value):  #only label valid values
        ax_lookback.text(day, value, f"{value:.2f}", fontsize=10, ha='right', va='bottom')

#legend, y-axis label and title all come from metric_name so they always match the plotted data
ax_lookback.set_xlabel('Days Back (1 to 30)', fontsize=12)
ax_lookback.set_ylabel(metric_name, fontsize=12)
ax_lookback.set_title(f'{metric_name} for Different Lookback Periods (1 to 30 days)', fontsize=14)
ax_lookback.grid(True)
ax_lookback.legend()

plt.show()