## Import Libraries

In [1]:

# ## Import Libraries


import time

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, recall_score
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from utils.dl_helper_functions import (
    create_sequences,
    load_picture_lagged_data,
    scale_data,
)
from xgboost import XGBRegressor

# Run configuration. Durations are expressed in hours.
HORIZON = 24 * 3  # forecast horizon: 3 days
INITIAL_TRAINING_SIZE = 24 * 183  # initial training window: 6 months = 4392 h
SEQUENCE_LENGTH = 24  # input sequence length: 1 day
DTYPE_NUMPY = np.float32  # dtype used for the numpy arrays
n_jobs = -1  # -1 = use all available CPU cores


# # Load Data
X, y_lagged, y, common_time = load_picture_lagged_data(
    return_common_time=True,
    verbose=False,
    grid_size=5,
    n_jobs=n_jobs,
    dtype=DTYPE_NUMPY,
    pca=True,
)


Interpolating pressure_msl: 100%|██████████| 20184/20184 [00:17<00:00, 1146.65it/s]
Interpolating surface_pressure: 100%|██████████| 20184/20184 [00:04<00:00, 4944.00it/s]
Interpolating wind_gusts_10m: 100%|██████████| 20184/20184 [00:03<00:00, 5142.02it/s]
Interpolating wind_u: 100%|██████████| 20184/20184 [00:03<00:00, 5327.08it/s]
Interpolating wind_v: 100%|██████████| 20184/20184 [00:04<00:00, 4914.09it/s]
100%|██████████| 20161/20161 [00:02<00:00, 9351.95it/s]
100%|██████████| 20161/20161 [00:02<00:00, 9859.40it/s] 
100%|██████████| 20161/20161 [00:02<00:00, 9133.35it/s]
100%|██████████| 20161/20161 [00:02<00:00, 9750.01it/s] 
100%|██████████| 20161/20161 [00:02<00:00, 9201.82it/s]
100%|██████████| 20161/20161 [00:02<00:00, 9273.01it/s] 
100%|██████████| 20161/20161 [00:02<00:00, 9933.11it/s] 
100%|██████████| 20161/20161 [00:02<00:00, 9077.65it/s] 


                 time  bottomT_PC_1  sla_PC_1  sla_PC_2   so_PC_1   so_PC_2  \
0 2022-12-09 22:00:00      0.634133  0.925309 -0.457366 -0.100705 -0.897528   
1 2022-12-09 23:00:00      0.637294  0.940802 -0.442214 -0.111268 -0.912123   
2 2022-12-10 00:00:00      0.639464  0.945689 -0.414826 -0.122708 -0.920838   
3 2022-12-10 01:00:00      0.638936  0.965772 -0.330261 -0.133696 -0.925385   
4 2022-12-10 02:00:00      0.641412  0.952780 -0.323189 -0.143205 -0.932978   

    so_PC_3  sob_PC_1  sob_PC_2  thetao_PC_1   uo_PC_1   uo_PC_2   uo_PC_3  \
0  2.029672  0.280786 -0.082470    -0.553418  0.653585 -0.046777  0.145592   
1  2.036069  0.293755 -0.104575    -0.554901  0.482343 -0.024300  0.008832   
2  2.038312  0.307333 -0.125272    -0.556919  0.236408 -0.041806 -0.088139   
3  2.038564  0.321025 -0.144447    -0.555702  0.027601 -0.015676 -0.083613   
4  2.051886  0.334548 -0.161840    -0.557908 -0.064479  0.086782 -0.048168   

    uo_PC_4   uo_PC_5   vo_PC_1   vo_PC_2   vo_PC_3   vo

In [None]:


# Assumptions (set up front)
DTYPE_NUMPY = np.float32
n_jobs = -1

# Prepare the data: cast features, lagged targets and targets to the working dtype
X, y_lagged, y = (array.astype(DTYPE_NUMPY) for array in (X, y_lagged, y))

# Cross-validation time points: one labelled timestamp per fold
folds = {
    label: pd.Timestamp(stamp)
    for label, stamp in (
        ("Surge1", "2023-02-25 16:00:00"),
        ("Surge2", "2023-04-01 09:00:00"),
        ("Surge3", "2023-10-07 20:00:00"),
        ("Surge4", "2023-10-20 21:00:00"),
        ("Surge5", "2024-01-03 01:00:00"),
        ("Surge6", "2024-02-09 15:00:00"),
        ("Surge7", "2024-12-09 10:00:00"),
        ("normal1", "2023-07-01 14:00:00"),
        ("normal2", "2024-04-01 18:00:00"),
        ("normal3", "2025-01-01 12:00:00"),
    )
}


def custom_score(y_true=None, y_pred=None, bins=(1, 2.00), alpha=0.7):
    """Blend a class-recall penalty with the mean squared error (lower is better).

    Each target column is discretized with ``np.digitize`` using ``bins``; the
    macro-averaged recall of the predicted classes is computed per column and
    averaged over all columns. The returned value is
    ``alpha * (1 - mean_recall) + (1 - alpha) * mse``.

    Args:
        y_true: Ground-truth values, shape ``(n_samples, n_outputs)`` or
            ``(n_samples,)`` (treated as a single output column).
        y_pred: Predictions with the same shape as ``y_true``.
        bins: Monotonic bin edges handed to ``np.digitize``. With the default
            two edges every value falls into one of three classes.
        alpha: Weight of the recall term; ``1 - alpha`` weights the MSE.

    Returns:
        The combined score as a float.

    Raises:
        ValueError: If ``y_true`` or ``y_pred`` is missing, or their shapes differ.

    NOTE(review): the bin edges are applied to the values exactly as passed in.
    If callers hand over standardized targets, the edges are interpreted in
    standardized units - confirm this is intended.
    """
    if y_true is None or y_pred is None:
        raise ValueError("custom_score requires both y_true and y_pred")

    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )

    # Accept a single target given as a 1-D vector by viewing it as one column.
    if y_true.ndim == 1:
        y_true = y_true.reshape(-1, 1)
        y_pred = y_pred.reshape(-1, 1)

    # Per-column macro recall of the discretized predictions.
    recalls = [
        recall_score(
            np.digitize(y_true[:, col], bins=bins),
            np.digitize(y_pred[:, col], bins=bins),
            average="macro",
        )
        for col in range(y_true.shape[1])
    ]

    mean_recall = np.mean(recalls)
    mse = mean_squared_error(y_true, y_pred)
    return alpha * (1 - mean_recall) + (1 - alpha) * mse


def get_model(name, trial_params=None):
    """Create the multi-output regressor identified by ``name``.

    The base estimator is chosen by ``name`` and then wrapped in a
    ``MultiOutputRegressor``. Both the wrapper and (where supported) the base
    estimator use the module-level ``n_jobs`` setting.

    Args:
        name: One of ``"RandomForest"``, ``"SVR"``, ``"XGBoost"``, ``"LGBM"``
            or ``"Linear"``.
        trial_params: Optional keyword arguments forwarded to the base
            estimator. They are not used for ``"Linear"``.

    Returns:
        An unfitted ``MultiOutputRegressor``.

    Raises:
        ValueError: If ``name`` is not one of the supported model names.
    """
    estimator_kwargs = {} if trial_params is None else trial_params

    # Pick the base estimator first; the wrapping step is shared by all models.
    if name == "RandomForest":
        base_estimator = RandomForestRegressor(random_state=42, n_jobs=n_jobs, **estimator_kwargs)
    elif name == "SVR":
        base_estimator = SVR(**estimator_kwargs)
    elif name == "XGBoost":
        base_estimator = XGBRegressor(random_state=42, n_jobs=n_jobs, **estimator_kwargs)
    elif name == "LGBM":
        base_estimator = LGBMRegressor(random_state=42, n_jobs=n_jobs, **estimator_kwargs)
    elif name == "Linear":
        base_estimator = LinearRegression(n_jobs=n_jobs)
    else:
        raise ValueError(f"Unbekanntes Modell: {name}")

    return MultiOutputRegressor(base_estimator, n_jobs=n_jobs)


def _first_index_of(common_time, timestamp, fold_name):
    """Return the first position of ``timestamp`` in ``common_time`` or raise IndexError."""
    matches = np.flatnonzero(np.asarray(common_time == timestamp))
    if matches.size == 0:
        raise IndexError(
            f"Fold '{fold_name}': timestamp {timestamp} not found in common_time"
        )
    return matches[0]


def _trim_edges(array, gap):
    """Drop ``gap`` entries from both ends of ``array`` (a no-op for ``gap == 0``)."""
    return array[gap:max(len(array) - gap, 0)]


def cross_validation_loop(model_name, folds, X, y_lagged, y, common_time, time_delta, trial_params=None, gap=168):
    """Evaluate one model configuration on every fold and return the fold scores.

    For each fold timestamp ``t`` the window ``[t - time_delta, t + time_delta)``
    is cut out as test data and overwritten with NaN in a copy that serves as
    training data. Both parts are converted to sequences, scaled, flattened to
    2-D feature matrices, and the model built by ``get_model`` is fitted on the
    training part and scored on the test part with ``custom_score``.

    Args:
        model_name: Model identifier understood by ``get_model``.
        folds: Mapping of fold name to the fold's centre timestamp.
        X: Feature array, indexed along its first axis like ``common_time``.
        y_lagged: Lagged-target array aligned with ``X``.
        y: Target array aligned with ``X``.
        common_time: Timestamps aligned with the first axis of the arrays.
        time_delta: Half-width of the test window around each fold timestamp.
        trial_params: Optional hyperparameters forwarded to ``get_model``.
        gap: Number of test sequences dropped at each end of the test window
            so that the scored part keeps a distance from the training data.
            Defaults to 168, the value that was previously hard-coded.

    Returns:
        List with one ``custom_score`` value per fold, in ``folds`` order.

    Raises:
        IndexError: If a window boundary is not present in ``common_time``.
        ValueError: If no test sequences remain after trimming ``gap``.
    """
    fold_results = []

    for surge_name, fold in folds.items():
        idx_start_cutoff = _first_index_of(common_time, fold - time_delta, surge_name)
        idx_end_cutoff = _first_index_of(common_time, fold + time_delta, surge_name)
        window = slice(idx_start_cutoff, idx_end_cutoff)

        X_test = X[window]
        y_lagged_test = y_lagged[window]
        y_test = y[window]

        # Train on copies in which the test window is blanked out with NaN.
        # Presumably create_sequences discards sequences that contain NaN -
        # TODO confirm in utils.dl_helper_functions.
        X_train = X.copy()
        y_lagged_train = y_lagged.copy()
        y_train = y.copy()

        X_train[window] = np.nan
        y_lagged_train[window] = np.nan
        y_train[window] = np.nan

        # NOTE(review): the last argument is hard-coded to 24 although the
        # notebook config defines HORIZON = 24 * 3 - confirm which is intended.
        X_train, y_lagged_train, y_train = create_sequences(X_train, y_lagged_train, y_train, SEQUENCE_LENGTH, 24)
        X_test, y_lagged_test, y_test = create_sequences(X_test, y_lagged_test, y_test, SEQUENCE_LENGTH, 24)

        # Keep only the inner part of the test window.
        X_test = _trim_edges(X_test, gap)
        y_lagged_test = _trim_edges(y_lagged_test, gap)
        y_test = _trim_edges(y_test, gap)
        if len(X_test) == 0:
            raise ValueError(
                f"Fold '{surge_name}': no test sequences left after trimming gap={gap}"
            )

        # Fresh scalers per fold; scale_data receives no validation split (None).
        scaler_X = StandardScaler()
        scaler_y = StandardScaler()

        data = scale_data(scaler_X, scaler_y,
                          X_train, y_lagged_train, y_train,
                          None, None, None,
                          X_test, y_lagged_test, y_test,
                          dtype=DTYPE_NUMPY, verbose=False)

        X_train, y_lagged_train, y_train, _, _, _, X_test, y_lagged_test, y_test = data

        # Flatten each sample and append the lagged targets as extra features.
        X_train = np.hstack([X_train.reshape(X_train.shape[0], -1), y_lagged_train.reshape(y_lagged_train.shape[0], -1)])
        X_test = np.hstack([X_test.reshape(X_test.shape[0], -1), y_lagged_test.reshape(y_lagged_test.shape[0], -1)])

        model = get_model(model_name, trial_params=trial_params)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # NOTE(review): y_test / y_pred are scored as returned by scale_data; if
        # targets are standardized there, custom_score's bin edges apply to
        # standardized values - confirm.
        score = custom_score(y_test, y_pred)
        fold_results.append(score)

    return fold_results


def objective(trial: optuna.trial.Trial):
    """Optuna objective: sample hyperparameters and return the mean CV score.

    The model family is read from the module-level ``model_name``. The data
    (``folds``, ``X``, ``y_lagged``, ``y``, ``common_time``) are module-level
    globals too, so all of them must exist before the study is run. Models
    without a search space (e.g. ``"Linear"``) are evaluated with empty params.

    Args:
        trial: Optuna trial used to sample the search space.

    Returns:
        Mean of the per-fold scores returned by ``cross_validation_loop``.
    """
    if model_name == "RandomForest":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 500),
            "max_depth": trial.suggest_int("max_depth", 3, 100),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 50),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 100),
            # "auto" was removed in scikit-learn 1.3; for regressors it meant
            # "use all features", which is spelled 1.0 and valid in every version.
            # An existing study that already recorded "auto" has an incompatible
            # categorical distribution - start a fresh study/storage in that case.
            "max_features": trial.suggest_categorical("max_features", [1.0, "sqrt", "log2"]),
            "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
        }
    elif model_name == "SVR":
        # Sample the kernel once and reuse it for the conditional parameters.
        kernel = trial.suggest_categorical("kernel", ["rbf", "linear", "poly"])
        params = {
            # suggest_loguniform is deprecated and has no ``log`` argument;
            # suggest_float(..., log=True) is the supported equivalent.
            "C": trial.suggest_float("C", 0.01, 50, log=True),
            "epsilon": trial.suggest_float("epsilon", 0.01, 1.0, log=True),
            "kernel": kernel,
        }
        # Only pass kernel-specific parameters when they apply: SVR rejects
        # ``degree=None`` / ``gamma=None``, so the keys are omitted otherwise.
        if kernel == "poly":
            params["degree"] = trial.suggest_int("degree", 1, 2)
        elif kernel == "rbf":
            params["gamma"] = trial.suggest_categorical("gamma", ["scale", "auto"])
    elif model_name == "XGBoost":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 500),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "subsample": trial.suggest_float("subsample", 0.1, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
            "gamma": trial.suggest_float("gamma", 0.0, 5.0),
            "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 10.0),
            "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 10.0),
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
            "verbosity": 0
        }
    elif model_name == "LGBM":
        # NOTE(review): LightGBM documents 0.0 < feature_fraction <= 1.0, so the
        # lower bound 0.0 for colsample_bytree is outside the valid range, and
        # subsample only takes effect together with a non-zero subsample_freq -
        # confirm whether this search space is intended.
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 500),
            "max_depth": trial.suggest_int("max_depth", 3, 100),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.9, log=True),
            "num_leaves": trial.suggest_int("num_leaves", 20, 100),
            "min_child_samples": trial.suggest_int("min_child_samples", 5, 200),
            "subsample": trial.suggest_float("subsample", 0.2, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.0, 1.0),
            "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 15.0),
            "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 15.0),
            "verbosity": -1
        }
    else:
        params = {}

    # Each fold holds out four weeks (168 h * 4) on either side of its timestamp.
    scores = cross_validation_loop(model_name, folds, X, y_lagged, y, common_time, pd.Timedelta(hours=168 * 4), params)
    score = np.mean(scores)
    print(f"Finished trial with model {model_name} and params {params}, score: {score}")
    return score



model_name = "Linear"  # e.g. "SVR", "XGBoost", etc.
storage = f"sqlite:///Versuch3_{model_name}.db"  # SQLite database used by Optuna
n_trials = 2  # number of trials for the hyperparameter optimization

# Create the Optuna study (or load it if it already exists) and run the search
study = optuna.create_study(
    study_name=model_name,
    storage=storage,
    direction="minimize",
    load_if_exists=True,
)
study.optimize(objective, n_jobs=1, n_trials=n_trials)

print("Beste Parameter:", study.best_params)
print("Bester Score:", study.best_value)


[I 2025-07-05 16:37:29,589] A new study created in RDB with name: Linear
[I 2025-07-05 16:39:11,259] Trial 0 finished with value: 0.2395572236324822 and parameters: {}. Best is trial 0 with value: 0.2395572236324822.


Finished trial with model Linear and params {}, score: 0.2395572236324822


[I 2025-07-05 16:39:13,018] Trial 1 finished with value: 0.2395572236324822 and parameters: {}. Best is trial 0 with value: 0.2395572236324822.


Finished trial with model Linear and params {}, score: 0.2395572236324822
Beste Parameter: {}
Bester Score: 0.2395572236324822
