# Vergleich von Zeitreihen-ML-Modellen mit Optuna-Hyperparameter-Tuning

Dieses Notebook vergleicht drei Modelle – RandomForestRegressor, Support Vector Regression (SVR) und ARIMA – auf einem Zeitreihen-Datensatz. Für jedes Modell werden die Hyperparameter mit Optuna optimiert und die Modelle mit TimeSeriesSplit-Kreuzvalidierung bewertet. Die wichtigsten Metriken (RMSE, R², MAPE) werden tabellarisch gegenübergestellt.

In [1]:
# Bibliotheken importieren
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
import optuna
from statsmodels.tsa.arima.model import ARIMA
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## 1. Beispieldatensatz erzeugen

In [2]:
# Example time series: linear trend + seasonality + Gaussian noise
np.random.seed(42)  # seed the global NumPy RNG so the noise below is reproducible
n_samples = 200
time = np.arange(n_samples)
trend = 0.1 * time
seasonality = 2 * np.sin(time / 10)
noise = np.random.normal(scale=1, size=n_samples)
y = trend + seasonality + noise
# Single-column feature matrix holding the time index
X = time[:, np.newaxis]

## 2. Hilfsfunktionen für Metriken und Crossvalidation

In [3]:
# Helper functions for the evaluation metrics
def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def mape(y_true, y_pred):
    """Return the mean absolute percentage error of the predictions.

    The value comes straight from scikit-learn's
    ``mean_absolute_percentage_error`` and is not rescaled here.
    """
    return mean_absolute_percentage_error(y_true=y_true, y_pred=y_pred)

def r2(y_true, y_pred):
    """Return the R² (coefficient of determination) score of the predictions."""
    score = r2_score(y_true, y_pred)
    return score

## 3. Optuna-Hyperparameter-Tuning für die Modelle

In [4]:
# Cross-validated Optuna tuning for RandomForestRegressor
def tune_rf(X, y, n_splits=5, n_trials=20):
    """Search RandomForestRegressor hyperparameters with Optuna.

    Every trial is scored by the mean RMSE over the folds produced by a
    ``TimeSeriesSplit``; the study minimizes that score.

    Args:
        X: Feature array, indexed with the index arrays the splitter yields.
        y: Target array, indexed the same way.
        n_splits: Number of ``TimeSeriesSplit`` folds.
        n_trials: Number of Optuna trials to run.

    Returns:
        ``study.best_params`` of the finished study. The ``random_state``
        fixed inside the objective is not suggested by the trial and is
        therefore not expected to be part of this dict.
    """
    splitter = TimeSeriesSplit(n_splits=n_splits)

    def objective(trial):
        # Keep the suggestion order stable: n_estimators, max_depth,
        # min_samples_split, min_samples_leaf, max_features.
        n_estimators = trial.suggest_int("n_estimators", 50, 200)
        max_depth = trial.suggest_int("max_depth", 2, 20)
        min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
        min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
        max_features = trial.suggest_categorical("max_features", ["sqrt", "log2"])

        forest = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_features=max_features,
            random_state=42,
        )

        def fold_rmse(fit_rows, eval_rows):
            # The same estimator object is refitted on every fold.
            forest.fit(X[fit_rows], y[fit_rows])
            return rmse(y[eval_rows], forest.predict(X[eval_rows]))

        fold_errors = [
            fold_rmse(fit_rows, eval_rows)
            for fit_rows, eval_rows in splitter.split(X)
        ]
        return np.mean(fold_errors)

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials)
    return study.best_params



# Cross-validated Optuna tuning for SVR
def tune_svr(X, y, n_splits=5, n_trials=20):
    """Search SVR hyperparameters with Optuna.

    Every trial is scored by the mean RMSE over the folds produced by a
    ``TimeSeriesSplit``; the study minimizes that score.

    Args:
        X: Feature array, indexed with the index arrays the splitter yields.
        y: Target array, indexed the same way.
        n_splits: Number of ``TimeSeriesSplit`` folds.
        n_trials: Number of Optuna trials to run.

    Returns:
        ``study.best_params`` of the finished study (keys ``C``,
        ``epsilon`` and ``kernel``).
    """
    tscv = TimeSeriesSplit(n_splits=n_splits)

    def objective(trial):
        params = {
            # suggest_float(..., log=True) replaces the deprecated
            # suggest_loguniform and samples from the same log-scaled range.
            "C": trial.suggest_float("C", 1e-2, 1e2, log=True),
            "epsilon": trial.suggest_float("epsilon", 1e-3, 1.0, log=True),
            "kernel": trial.suggest_categorical("kernel", ["rbf", "linear"]),
        }
        model = SVR(**params)
        scores = []
        for train_idx, test_idx in tscv.split(X):
            model.fit(X[train_idx], y[train_idx])
            y_pred = model.predict(X[test_idx])
            scores.append(rmse(y[test_idx], y_pred))
        return np.mean(scores)

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials)
    return study.best_params



# Cross-validated Optuna tuning for ARIMA (univariate model)
def tune_arima(y, n_splits=5, n_trials=20):
    """Search the ARIMA order ``(p, d, q)`` with Optuna.

    For each fold of a ``TimeSeriesSplit`` the model is fitted on the training
    part and forecasts as many steps as the test part is long; the trial score
    is the mean RMSE over the folds, which the study minimizes.

    Args:
        y: Series values, indexed with the index arrays the splitter yields.
        n_splits: Number of ``TimeSeriesSplit`` folds.
        n_trials: Number of Optuna trials to run.

    Returns:
        ``study.best_params`` of the finished study (keys ``p``, ``d``, ``q``).
    """
    tscv = TimeSeriesSplit(n_splits=n_splits)

    def objective(trial):
        p = trial.suggest_int("p", 0, 4)
        d = trial.suggest_int("d", 0, 2)
        q = trial.suggest_int("q", 0, 4)
        scores = []
        for train_idx, test_idx in tscv.split(y):
            y_train, y_test = y[train_idx], y[test_idx]
            try:
                model = ARIMA(y_train, order=(p, d, q)).fit()
                y_pred = model.forecast(steps=len(y_test))
                scores.append(rmse(y_test, y_pred))
            except Exception:
                # Deliberately broad and best-effort: an order that cannot be
                # fitted or scored on any fold is rated as the worst possible
                # trial. One infinite fold already makes the mean infinite, so
                # stop here instead of fitting the remaining folds.
                return np.inf
        return np.mean(scores)

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials)
    return study.best_params

## 4. Modelle evaluieren (TimeSeriesSplit)

In [5]:
# Fit a model with its best parameters and compute the cross-validated scores
def evaluate_model(model, X, y, n_splits=5):
    """Score an estimator with TimeSeriesSplit cross-validation.

    The estimator is refitted on the training part of every fold and scored
    on the matching test part.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X: Feature array, indexed with the index arrays the splitter yields.
        y: Target array, indexed the same way.
        n_splits: Number of ``TimeSeriesSplit`` folds.

    Returns:
        Tuple ``(mean RMSE, mean R², mean MAPE)`` across the folds.
    """
    splitter = TimeSeriesSplit(n_splits=n_splits)
    per_fold = []
    for fit_rows, eval_rows in splitter.split(X):
        model.fit(X[fit_rows], y[fit_rows])
        predicted = model.predict(X[eval_rows])
        actual = y[eval_rows]
        per_fold.append(
            (rmse(actual, predicted), r2(actual, predicted), mape(actual, predicted))
        )
    # Average each metric column separately: RMSE, R², MAPE.
    return tuple(
        np.mean([fold_metrics[column] for fold_metrics in per_fold])
        for column in range(3)
    )

def evaluate_arima(y, order, n_splits=5):
    """Score an ARIMA order with TimeSeriesSplit cross-validation.

    For every fold the model is fitted on the training part and forecasts as
    many steps as the test part is long. A fold whose fit, forecast or scoring
    raises is recorded as NaN for all three metrics, reported through a
    ``RuntimeWarning``, and ignored by the NaN-aware averages.

    Args:
        y: Series values, indexed with the index arrays the splitter yields.
        order: ARIMA ``(p, d, q)`` order passed to ``ARIMA(..., order=order)``.
        n_splits: Number of ``TimeSeriesSplit`` folds.

    Returns:
        Tuple ``(mean RMSE, mean R², mean MAPE)`` over the folds that succeeded.
    """
    import warnings  # local import so the block does not depend on file-level imports

    tscv = TimeSeriesSplit(n_splits=n_splits)
    rmses, r2s, mapes = [], [], []
    for fold, (train_idx, test_idx) in enumerate(tscv.split(y)):
        y_train, y_test = y[train_idx], y[test_idx]
        try:
            model = ARIMA(y_train, order=order).fit()
            y_pred = model.forecast(steps=len(y_test))
            # Compute all three metrics before storing any of them, so a
            # failure halfway through cannot leave the lists misaligned.
            fold_scores = (
                rmse(y_test, y_pred),
                r2(y_test, y_pred),
                mape(y_test, y_pred),
            )
        except Exception as exc:
            # Best-effort evaluation: keep going, but make the dropped fold
            # visible because it changes what the averages are computed over.
            warnings.warn(
                f"ARIMA{order} failed on fold {fold}: {exc!r}; "
                "fold excluded from the averaged metrics",
                RuntimeWarning,
                stacklevel=2,
            )
            fold_scores = (np.nan, np.nan, np.nan)
        rmses.append(fold_scores[0])
        r2s.append(fold_scores[1])
        mapes.append(fold_scores[2])
    return np.nanmean(rmses), np.nanmean(r2s), np.nanmean(mapes)

## 5. Hyperparameter-Tuning und Modellvergleich

In [6]:
# Tune the hyperparameters of each model
rf_params = tune_rf(X, y, n_splits=5, n_trials=20)
svr_params = tune_svr(X, y, n_splits=5, n_trials=20)
arima_params = tune_arima(y, n_splits=5, n_trials=20)

# The tuned parameters only contain the values Optuna suggested. Re-apply the
# random_state that was fixed during tuning so the final forest is evaluated
# under the same, reproducible configuration.
rf = RandomForestRegressor(random_state=42, **rf_params)
svr = SVR(**svr_params)
arima_order = (arima_params['p'], arima_params['d'], arima_params['q'])

# Cross-validated (RMSE, R2, MAPE) scores of the tuned models
rf_scores = evaluate_model(rf, X, y, n_splits=5)
svr_scores = evaluate_model(svr, X, y, n_splits=5)
arima_scores = evaluate_arima(y, arima_order, n_splits=5)

[I 2025-05-26 12:57:07,269] A new study created in memory with name: no-name-3a470124-6035-451a-a244-40441c6e333c
[I 2025-05-26 12:57:07,920] Trial 0 finished with value: 2.977473786552076 and parameters: {'n_estimators': 154, 'max_depth': 12, 'min_samples_split': 10, 'min_samples_leaf': 9, 'max_features': 'log2'}. Best is trial 0 with value: 2.977473786552076.
[I 2025-05-26 12:57:08,406] Trial 1 finished with value: 2.926398428871103 and parameters: {'n_estimators': 123, 'max_depth': 3, 'min_samples_split': 4, 'min_samples_leaf': 7, 'max_features': 'sqrt'}. Best is trial 1 with value: 2.926398428871103.
[I 2025-05-26 12:57:09,309] Trial 2 finished with value: 2.9014837004159757 and parameters: {'n_estimators': 198, 'max_depth': 4, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_features': 'log2'}. Best is trial 2 with value: 2.9014837004159757.
[I 2025-05-26 12:57:09,665] Trial 3 finished with value: 2.714903057376528 and parameters: {'n_estimators': 87, 'max_depth': 19, 'min_samp

## 6. Ergebnisse als DataFrame

In [7]:
# Collect the scores in a DataFrame: one column per model, one row per metric
metric_names = ["RMSE", "R2", "MAPE"]
scores_by_model = {
    "RandomForest": rf_scores,
    "SVR": svr_scores,
    "ARIMA": arima_scores,
}
results = pd.DataFrame(scores_by_model, index=metric_names)
results

Unnamed: 0,RandomForest,SVR,ARIMA
RMSE,2.506029,2.105226,2.091506
R2,-1.818571,-2.422599,-2.55984
MAPE,0.242748,0.296997,0.242234


**Hinweis:** Die Ausführung kann je nach Hardware und n_trials einige Minuten dauern.