In [2]:
import lightgbm as lgb
from sklearn.multioutput import MultiOutputRegressor
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as snsa
import ipywidgets as widgets
import torch
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [3]:
# Load Training Data
data_path = "../data/cache/wind_farm_data/wind_data_train_seq24_pred3_num5_normrobust_minmax_normalize_modetrain_clusternearest.pt"
loaded_data = torch.load(data_path)
input_sequences = loaded_data['input_sequences_tensor'].numpy()
ground_truth = loaded_data['ground_truth_tensor'].numpy()
input_sequences = input_sequences[:, :-3, :]

KeyboardInterrupt: 

In [None]:
# Load Test Data
test_data_path = "../data/cache/wind_farm_data/wind_data_test_seq24_pred3_num5_normrobust_minmax_normalize_modetest_clusternearest.pt"
loaded_test_data = torch.load(test_data_path)
input_sequences_test = loaded_test_data['input_sequences_tensor'].numpy()
ground_truth_test = loaded_test_data['ground_truth_tensor'].numpy()
input_sequences_test = input_sequences_test[:, :-3, :]

In [None]:
# Load Weather Data for Training
weather_data_path = "../data/cache/weather/wind_data_train_seq24_pred3_num5_normrobust_minmax_normalize_modetrain_clusternearest_temp2m_rh2m_wind100m_winddir100m.pt"
loaded_weather_train = torch.load(weather_data_path)
weather_features_train = loaded_weather_train['weather_data_tensor'].numpy()

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [None]:
# Load Weather Data for Test
weather_data_test_path = "../data/cache/weather/wind_data_test_seq24_pred3_num5_normrobust_minmax_normalize_modetest_clusternearest_temp2m_rh2m_wind100m_winddir100m.pt"
loaded_weather_test = torch.load(weather_data_test_path)
weather_features_test = loaded_weather_test['weather_data_tensor'].numpy()

In [None]:
# Load Spatial Data for Training
spatial_data_path = "../data/cache/spatial/train_seq24_pred3_num5_clusternearest.pt"
spatial_data = torch.load(spatial_data_path)
correlation = spatial_data['correlation'].numpy()
distance = spatial_data['distance'].numpy()

In [None]:
# Load Spatial Data for Test
spatial_data_test_path = "../data/cache/spatial/test_seq24_pred3_num5_clusternearest.pt"
spatial_data_test = torch.load(spatial_data_test_path)
correlation_test = spatial_data_test['correlation'].numpy()
distance_test = spatial_data_test['distance'].numpy()

In [None]:
print(input_sequences.shape)
print(ground_truth.shape)
print(input_sequences_test.shape)
print(ground_truth_test.shape)
print(weather_features_train.shape)
print(weather_features_test.shape)
print(correlation.shape)
print(distance.shape)
print(correlation_test.shape)
print(distance_test.shape)

NameError: name 'input_sequences' is not defined

In [None]:
# Flatten the weather data for training and testing
weather_features_train_flat = weather_features_train.reshape(weather_features_train.shape[0], -1)
weather_features_test_flat = weather_features_test.reshape(weather_features_test.shape[0], -1)

In [None]:
# Prepare Training and Testing Data
X_flat = input_sequences.reshape(input_sequences.shape[0], -1)
X_test_flat = input_sequences_test.reshape(input_sequences_test.shape[0], -1)


In [None]:
y_train_flat = ground_truth.reshape(ground_truth.shape[0], -1)
y_test_flat = ground_truth_test.reshape(ground_truth_test.shape[0], -1)
y_train_flat = np.array(y_train_flat)
y_test_flat = np.array(y_test_flat)

In [None]:
print(weather_features_train_flat.shape)
print(weather_features_test_flat.shape)
print(X_flat.shape)
print(X_test_flat.shape)
print(correlation.shape)
print(distance.shape)
print(correlation_test.shape)
print(distance_test.shape)

(192401, 540)
(66410, 540)
(192401, 120)
(66410, 120)
(192401, 10)
(192401, 10)
(66410, 10)
(66410, 10)


In [None]:
X_augmented_train = np.concatenate([X_flat, weather_features_train_flat, correlation, distance], axis=1)
X_augmented_test = np.concatenate([X_test_flat, weather_features_test_flat, correlation_test, distance_test], axis=1)

In [None]:
train_data = lgb.Dataset(X_augmented_train, label=y_train_flat)
test_data = lgb.Dataset(X_augmented_test, label=y_test_flat)

In [None]:
# Define your model parameters
params = {
    "boosting_type": "gbdt",
    "objective": "regression",
    "metric": {"l2", "l1"},
    "num_leaves": 128,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "verbose": 0,
}

In [4]:
from sklearn.model_selection import GridSearchCV
import time

# Define your model
lgb_model = lgb.LGBMRegressor()

# Setup a parameter grid to explore
param_grid = {
    'learning_rate': [0.01, 0.05],
    'n_estimators': [100, 200],
    'num_leaves': [31, 51],
    'feature_fraction': [0.8, 0.9],
    'bagging_fraction': [0.7, 0.9],
    'bagging_freq': [5, 9]
}

start_time = time.time()
grid_search = GridSearchCV(estimator=lgb_model, param_grid=param_grid, cv=2, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)

model = MultiOutputRegressor(grid_search)

model.fit(X_augmented_train, y_train_flat)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Grid search took {elapsed_time:.2f} seconds.")

best_params = model.estimator.best_params_
print("Best parameters found: ", best_params)

y_pred = model.predict(X_augmented_test)

mse_values = [mean_squared_error(y_test_flat[i], y_pred[i]) for i in range(3)]
mae_values = [mean_absolute_error(y_test_flat[i], y_pred[i]) for i in range(3)]

overall_mse = np.mean(mse_values)
overall_mae = np.mean(mae_values)

error_metrics_df = pd.DataFrame({
    'Metric': ['MSE', 'MAE'],
    'Target 1': [mse_values[0], mae_values[0]],
    'Target 2': [mse_values[1], mae_values[1]],
    'Target 3': [mse_values[2], mae_values[2]],
    'Overall': [overall_mse, overall_mae]
})
print(error_metrics_df)


NameError: name 'X_augmented_train' is not defined

In [None]:
lgb_model = lgb.LGBMRegressor(learning_rate=0.05, n_estimators=100)
model = MultiOutputRegressor(lgb_model)
model.fit(X_augmented_train, y_train_flat)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.356277 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 173385
[LightGBM] [Info] Number of data points in the train set: 192401, number of used features: 680
[LightGBM] [Info] Start training from score 0.498758
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.302086 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 173385
[LightGBM] [Info] Number of data points in the train set: 192401, number of used features: 680
[LightGBM] [Info] Start training from score 0.502442
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.325555 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 173385
[LightGBM] [Info] Number of data points in the train set: 192401, number of used features: 680
[LightGBM] [Inf

In [None]:
y_pred = model.predict(X_augmented_test)

In [None]:
mse_values = []
mae_values = []
rmse_values = []
mbe_values = []
maape_values = []

for i in range(3):
    mse = mean_squared_error(y_test_flat[i], y_pred[i])
    mae = mean_absolute_error(y_test_flat[i], y_pred[i])
    rmse = np.sqrt(mse)
    mbe = np.mean(y_pred[i] - y_test_flat[i])
    maape = np.mean(np.arctan(np.abs((y_test_flat[i] - y_pred[i]) / y_test_flat[i])))
        
    mse_values.append(mse)
    mae_values.append(mae)
    rmse_values.append(rmse)
    mbe_values.append(mbe)
    maape_values.append(maape)

overall_mse = np.mean(mse_values)
overall_mae = np.mean(mae_values)
overall_rmse = np.mean(rmse_values)
overall_mbe = np.mean(mbe_values)
overall_maape = np.mean(maape_values)

error_metrics_df = pd.DataFrame({
    'Metric': ['MSE', 'RMSE', 'MAE', 'MBE', 'MAAPE'],
    'Target 1': [mse_values[0], rmse_values[0], mae_values[0], mbe_values[0], maape_values[0]],
    'Target 2': [mse_values[1], rmse_values[1], mae_values[1], mbe_values[1], maape_values[1]],
    'Target 3': [mse_values[2], rmse_values[2], mae_values[2], mbe_values[2], maape_values[2]],
    'Overall': [overall_mse, overall_rmse, overall_mae, overall_mbe, overall_maape]
})

print(error_metrics_df)

  Metric  Target 1  Target 2  Target 3   Overall
0    MSE  0.060854  0.004654  0.039385  0.034964
1   RMSE  0.246686  0.068221  0.198458  0.171121
2    MAE  0.189209  0.050974  0.140561  0.126915
3    MBE  0.180252  0.007922  0.139162  0.109112
4  MAAPE  0.581898  0.182977  0.762676  0.509184


  maape = np.mean(np.arctan(np.abs((y_test_flat[i] - y_pred[i]) / y_test_flat[i])))
  maape = np.mean(np.arctan(np.abs((y_test_flat[i] - y_pred[i]) / y_test_flat[i])))


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import ipywidgets as widgets
from ipywidgets import Layout, Button, HBox, VBox
from sklearn.metrics import mean_squared_error, mean_absolute_error
from IPython.display import display

sns.set_theme(style="darkgrid", palette="mako")
sns.set_context("talk")

y_pred_reshaped = y_pred.reshape(-1, 3, 5)
y_test_reshaped = y_test_flat.reshape(-1, 3, 5)

# Widget setup
example_index_slider = widgets.IntSlider(value=0, min=0, max=len(y_test_reshaped)-1, step=1, description='Example Index:', readout=True, style={'description_width': 'initial'})
output_plot = widgets.Output()

def plot_results(example_index):
    output_plot.clear_output()
    with output_plot:
        for feature in range(5):  # Assuming 5 features as per your setup
            fig, ax = plt.subplots(figsize=(17, 1.5))
            historical_series = input_sequences[example_index, :, feature]  # Last 24 historical points

            # Plotting historical data
            x_historical_series = list(range(1, 25))
            sns.lineplot(x=x_historical_series, y=historical_series, marker='o', dashes=False, color='#165DB1', ax=ax)

            # Concatenating the last historical point for continuity in the plot
            full_ground_truth_series = np.concatenate([historical_series[-1:], y_test_reshaped[example_index, :, feature]])
            full_prediction_series = np.concatenate([historical_series[-1:], y_pred_reshaped[example_index, :, feature]])

            # Extended x-axis for future predictions
            x_extended_series = list(range(24, 28))

            # Plotting actual vs. predicted values
            sns.lineplot(x=x_extended_series, y=full_ground_truth_series, marker='o', dashes=True, color='#165DB1', ax=ax)
            sns.lineplot(x=x_extended_series, y=full_prediction_series, marker='o', dashes=True, color='#C680BB', ax=ax)

            ax.set_xlabel('')
            ax.set_ylabel('')
            ax.set_xticks([])
            ax.set_yticks([])

            # Calculating and displaying error metrics for each feature
            mae = mean_absolute_error(y_test_reshaped[example_index, :, feature], y_pred_reshaped[example_index, :, feature])
            mse = mean_squared_error(y_test_reshaped[example_index, :, feature], y_pred_reshaped[example_index, :, feature])

            metrics_text = f"Feature {feature+1}\nMAE: {mae:.5f}\nMSE: {mse:.5f}"
            ax.text(1.05, 0.5, metrics_text, transform=ax.transAxes, fontsize=15, verticalalignment='center', bbox=dict(facecolor='white', edgecolor='black', boxstyle='round,pad=1'))

            sns.despine()
            plt.show()

def example_index_changed(change):
    plot_results(change['new'])

example_index_slider.observe(example_index_changed, names='value')

display(example_index_slider, output_plot)
plot_results(example_index_slider.value)


IntSlider(value=0, description='Example Index:', max=66409, style=SliderStyle(description_width='initial'))

Output()