In [1]:
%run ../../Environment/environment.ipynb

e:\Studienarbeit_GitHub\Studienarbeit\Agents\PPO
Notebook ausgeführt


# 1. Bibliotheken importieren

In [2]:
import numpy as np
import pandas as pd
import optuna
import joblib
import random
from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.callbacks import CheckpointCallback
from sklearn.preprocessing import StandardScaler
from stable_baselines3.common.vec_env import DummyVecEnv
import torch
from torch import nn

# 2. Seed setzen

In [3]:
seed = 42
SEED  = seed % (2**32 - 1)
print(f"SEED: {SEED}")

SEED: 42


# 3. Daten einlesen

In [4]:
# -------------------------------
# CSV Datem einlesen
# -------------------------------
train_data = pd.read_csv("../../Transform_data/stand_data/2023-2018_stand_data.csv")
train_data.drop('datetime', axis=1, inplace=True)

test_data = pd.read_csv("../../Transform_data/stand_data/2025-2024_stand_data.csv")
test_data.drop('datetime', axis=1, inplace=True)

if(train_data is not None and test_data is not None):
    print("Daten erfolgreich eingelesen")

Daten erfolgreich eingelesen


# 4. Parallele Umgebungen erstellen für das Training

In [5]:
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.vec_env import VecNormalize

def create_env():
    return TradingEnv(
        data=train_data,
        initial_cash=10_000,
        window_size=336,
        scaler_path="../../Transform_data/scaler.pkl",
        default_seed=SEED
    )

n_envs = 4  # Mehr parallele Umgebungen (8, 16 oder sogar 32 testen!)
env = SubprocVecEnv([create_env for _ in range(n_envs)])

env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0)
env.training = True  # Sicherstellen, dass Normalisierung aktiv ist

# 5. Hyperparameter Evaluierung

In [6]:
# Für Hyperparameter-Tuning nutzen wir test_data als Validierungsdatensatz.
valid_data = test_data.copy()

# -------------------------------
# Environment-Erstellung
# -------------------------------
def make_env(data):
    def _init():
        return TradingEnv(
            data=data,
            initial_cash=10_000,
            window_size=336,
            scaler_path="../../Transform_data/scaler.pkl",
            default_seed=SEED
        )
    return _init

# -------------------------------
# Evaluation Helper Function
# -------------------------------
def evaluate_agent(model, env, n_eval_episodes=5):
    episode_rewards = []
    for _ in range(n_eval_episodes):
        obs = env.reset()
        done = False
        total_reward = 0.0
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, _ = env.step(action)
            total_reward += reward
        episode_rewards.append(total_reward)
    return np.mean(episode_rewards)

In [7]:
# -------------------------------
# Hyperparameter Tuning with Optuna
# -------------------------------
def objective(trial):
    # Hyperparameter-Sampling
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-2, log=True)
    gamma = trial.suggest_float("gamma", 0.90, 0.9999)
    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128, 256])
    n_steps = trial.suggest_categorical("n_steps", [128, 256, 512])  # keine 1024
    ent_coef = trial.suggest_float("ent_coef", 1e-6, 0.01, log=True)
    clip_range = trial.suggest_float("clip_range", 0.1, 0.4)
    gae_lambda = trial.suggest_float("gae_lambda", 0.8, 0.99)

    # Train-Environment mit SubprocVecEnv und VecNormalize
    n_envs = 4
    env_train_raw = SubprocVecEnv([make_env(train_data) for _ in range(n_envs)])
    env_train = VecNormalize(env_train_raw, norm_obs=True, norm_reward=True, clip_obs=10.0)
    env_train.training = True

    # Validation-Environment mit DummyVecEnv (nur 1 Env)
    env_valid_raw = DummyVecEnv([make_env(valid_data)])
    env_valid = VecNormalize(env_valid_raw, norm_obs=True, norm_reward=True, clip_obs=10.0)
    env_valid.training = False
    env_valid.norm_reward = False

    # PPO Agent
    model = PPO(
        "MlpPolicy",
        env_train,
        learning_rate=learning_rate,
        gamma=gamma,
        batch_size=batch_size,
        n_steps=n_steps,
        ent_coef=ent_coef,
        clip_range=clip_range,
        gae_lambda=gae_lambda,
        verbose=0,
        seed=SEED,
        policy_kwargs=dict(
            net_arch=dict(pi=[128, 128], vf=[128, 128]),
            activation_fn=nn.ReLU,
        )
    )

    # Training
    model.learn(total_timesteps=10_000, log_interval=1)

    # Evaluation
    mean_reward = evaluate_agent(model, env_valid, n_eval_episodes=5)

    return mean_reward

In [8]:
# -------------------------------
# Optuna-Optimierung starten
# -------------------------------
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

print("Best hyperparameters:", study.best_trial.params)

[I 2025-04-06 19:59:07,717] A new study created in memory with name: no-name-3c8384b9-49ac-4c5f-911f-eef615846674


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 20:01:20,897] Trial 0 finished with value: 2.9310214519500732 and parameters: {'learning_rate': 2.5641121461446072e-05, 'gamma': 0.9787420180414022, 'batch_size': 64, 'n_steps': 128, 'ent_coef': 1.472739259713409e-06, 'clip_range': 0.15238102512913426, 'gae_lambda': 0.9786570351310979}. Best is trial 0 with value: 2.9310214519500732.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 20:03:36,783] Trial 1 finished with value: 2.9682915210723877 and parameters: {'learning_rate': 0.0026318231496464234, 'gamma': 0.9121417756631408, 'batch_size': 64, 'n_steps': 256, 'ent_coef': 1.2544509602022392e-05, 'clip_range': 0.1540633153737754, 'gae_lambda': 0.8479590972875454}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 20:06:10,000] Trial 2 finished with value: 2.9682915210723877 and parameters: {'learning_rate': 0.004499712023548036, 'gamma': 0.9173332761277821, 'batch_size': 32, 'n_steps': 128, 'ent_coef': 1.7765289447935076e-05, 'clip_range': 0.24481214598440212, 'gae_lambda': 0.8831159228703745}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 20:08:09,655] Trial 3 finished with value: 2.968198299407959 and parameters: {'learning_rate': 1.8715559496789196e-05, 'gamma': 0.9422828755400818, 'batch_size': 128, 'n_steps': 512, 'ent_coef': 0.00015434096818301758, 'clip_range': 0.34271089924029924, 'gae_lambda': 0.9199009554852527}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 20:10:12,177] Trial 4 finished with value: 2.530527114868164 and parameters: {'learning_rate': 0.0002231152683865242, 'gamma': 0.9824259655530623, 'batch_size': 128, 'n_steps': 512, 'ent_coef': 0.0006218085749244544, 'clip_range': 0.13620367108779688, 'gae_lambda': 0.8849080684878003}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 20:12:06,198] Trial 5 finished with value: 0.014640425331890583 and parameters: {'learning_rate': 0.0011775478260792852, 'gamma': 0.9263723333699962, 'batch_size': 256, 'n_steps': 128, 'ent_coef': 8.423195193074967e-06, 'clip_range': 0.19045755408838605, 'gae_lambda': 0.8897956293018239}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 20:14:03,823] Trial 6 finished with value: 2.9680628776550293 and parameters: {'learning_rate': 1.2471796650907275e-05, 'gamma': 0.9286195955702644, 'batch_size': 128, 'n_steps': 128, 'ent_coef': 3.445632075917661e-05, 'clip_range': 0.17294614568560385, 'gae_lambda': 0.8911476078720459}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 20:16:31,037] Trial 7 finished with value: 2.9676411151885986 and parameters: {'learning_rate': 0.006725627631830887, 'gamma': 0.9593517831523087, 'batch_size': 32, 'n_steps': 128, 'ent_coef': 0.00033047522682535786, 'clip_range': 0.3754197596846286, 'gae_lambda': 0.8198693300768374}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 20:18:59,765] Trial 8 finished with value: 0.10970045626163483 and parameters: {'learning_rate': 0.001385363971373073, 'gamma': 0.9908761814885763, 'batch_size': 32, 'n_steps': 128, 'ent_coef': 2.1244238816995376e-05, 'clip_range': 0.30685190260377904, 'gae_lambda': 0.8551764800523429}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 20:21:01,992] Trial 9 finished with value: 0.646863579750061 and parameters: {'learning_rate': 5.0280448760843996e-05, 'gamma': 0.9826576724047187, 'batch_size': 128, 'n_steps': 512, 'ent_coef': 1.026346244207273e-05, 'clip_range': 0.19590494029908612, 'gae_lambda': 0.9298149297124719}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 20:23:27,697] Trial 10 finished with value: 1.1415804624557495 and parameters: {'learning_rate': 0.00022376535689516678, 'gamma': 0.901122835365871, 'batch_size': 64, 'n_steps': 256, 'ent_coef': 0.0050005506468407965, 'clip_range': 0.10893941501513285, 'gae_lambda': 0.8230141343121694}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 20:26:10,058] Trial 11 finished with value: 2.9682915210723877 and parameters: {'learning_rate': 0.006033829610787992, 'gamma': 0.9015907317280041, 'batch_size': 32, 'n_steps': 256, 'ent_coef': 1.298453135331137e-06, 'clip_range': 0.2638921410788241, 'gae_lambda': 0.8548284133649248}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 20:28:32,207] Trial 12 finished with value: 2.968198299407959 and parameters: {'learning_rate': 0.0017403188401304665, 'gamma': 0.9193932722874055, 'batch_size': 64, 'n_steps': 256, 'ent_coef': 4.639055544219663e-06, 'clip_range': 0.2449869110968159, 'gae_lambda': 0.849928641868926}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 20:30:29,377] Trial 13 finished with value: 2.9652585983276367 and parameters: {'learning_rate': 0.002436237491825937, 'gamma': 0.9150744958818118, 'batch_size': 256, 'n_steps': 256, 'ent_coef': 3.880807317596929e-05, 'clip_range': 0.24265004761649947, 'gae_lambda': 0.8033257531159885}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 20:32:32,247] Trial 14 finished with value: 2.9681944847106934 and parameters: {'learning_rate': 0.0005338593750557636, 'gamma': 0.9444107646954513, 'batch_size': 64, 'n_steps': 256, 'ent_coef': 5.515257584181629e-05, 'clip_range': 0.2981704535914932, 'gae_lambda': 0.9297797568039041}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 20:34:53,710] Trial 15 finished with value: 2.9680628776550293 and parameters: {'learning_rate': 0.009809753255462244, 'gamma': 0.9579651211849725, 'batch_size': 32, 'n_steps': 128, 'ent_coef': 2.675856596110062e-06, 'clip_range': 0.2212167185549731, 'gae_lambda': 0.8684275339486107}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 20:36:56,997] Trial 16 finished with value: 2.9680914878845215 and parameters: {'learning_rate': 0.003503163882183801, 'gamma': 0.9119987760998791, 'batch_size': 64, 'n_steps': 256, 'ent_coef': 0.00013477648997457233, 'clip_range': 0.10301317018306416, 'gae_lambda': 0.9568995286881609}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 20:39:17,606] Trial 17 finished with value: 2.9682915210723877 and parameters: {'learning_rate': 0.0005923216451081773, 'gamma': 0.9310493403714521, 'batch_size': 32, 'n_steps': 128, 'ent_coef': 0.0013003811028396985, 'clip_range': 0.2952522829653075, 'gae_lambda': 0.837903510747559}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 20:41:05,588] Trial 18 finished with value: 0.01377163641154766 and parameters: {'learning_rate': 0.0001001609774672876, 'gamma': 0.9379478798488483, 'batch_size': 256, 'n_steps': 256, 'ent_coef': 1.2443274518234372e-05, 'clip_range': 0.22109962350864737, 'gae_lambda': 0.9117380658578145}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 20:43:08,057] Trial 19 finished with value: 2.9682915210723877 and parameters: {'learning_rate': 0.003906322264006951, 'gamma': 0.90815069769428, 'batch_size': 64, 'n_steps': 512, 'ent_coef': 4.7639527161893005e-06, 'clip_range': 0.13956885730299246, 'gae_lambda': 0.8712716639017961}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 20:45:29,472] Trial 20 finished with value: 2.9664487838745117 and parameters: {'learning_rate': 0.0007618751748428752, 'gamma': 0.9554619502869061, 'batch_size': 32, 'n_steps': 256, 'ent_coef': 8.457987765164485e-05, 'clip_range': 0.27300109498842395, 'gae_lambda': 0.8300741994059024}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 20:47:51,344] Trial 21 finished with value: 0.005205322988331318 and parameters: {'learning_rate': 0.005075229683632593, 'gamma': 0.9005729850536718, 'batch_size': 32, 'n_steps': 256, 'ent_coef': 1.4696232376944321e-06, 'clip_range': 0.26785578661468906, 'gae_lambda': 0.8544940073837207}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 20:50:11,694] Trial 22 finished with value: 0.0 and parameters: {'learning_rate': 0.008312115957374466, 'gamma': 0.9170711414464395, 'batch_size': 32, 'n_steps': 256, 'ent_coef': 1.208648911062281e-06, 'clip_range': 0.34009075983559645, 'gae_lambda': 0.8683759026677292}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 20:52:32,553] Trial 23 finished with value: 2.9682915210723877 and parameters: {'learning_rate': 0.002905735125314139, 'gamma': 0.9069836201331662, 'batch_size': 32, 'n_steps': 256, 'ent_coef': 3.873759687587415e-06, 'clip_range': 0.21604502613689514, 'gae_lambda': 0.8418419143408502}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 20:54:53,151] Trial 24 finished with value: 2.9682915210723877 and parameters: {'learning_rate': 0.0051674995957138896, 'gamma': 0.9265516539887152, 'batch_size': 32, 'n_steps': 256, 'ent_coef': 1.8052315259405793e-05, 'clip_range': 0.26979889446773286, 'gae_lambda': 0.9065136937817955}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 20:57:14,148] Trial 25 finished with value: 2.968217134475708 and parameters: {'learning_rate': 0.0012362701534982833, 'gamma': 0.9206692186521679, 'batch_size': 32, 'n_steps': 128, 'ent_coef': 6.8746081379778584e-06, 'clip_range': 0.3280812533600763, 'gae_lambda': 0.8781136710938896}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 20:59:16,321] Trial 26 finished with value: 2.9682915210723877 and parameters: {'learning_rate': 0.0022190062392630724, 'gamma': 0.9065477005116953, 'batch_size': 64, 'n_steps': 256, 'ent_coef': 2.1793895156133645e-06, 'clip_range': 0.17284988770621548, 'gae_lambda': 0.8074180607126702}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 21:01:14,675] Trial 27 finished with value: 2.9682915210723877 and parameters: {'learning_rate': 0.005067771639653459, 'gamma': 0.9353452965865809, 'batch_size': 256, 'n_steps': 256, 'ent_coef': 2.0053307464209155e-05, 'clip_range': 0.3920005624883931, 'gae_lambda': 0.8625943735330622}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 21:04:05,613] Trial 28 finished with value: 2.9678757190704346 and parameters: {'learning_rate': 0.0004167660101073776, 'gamma': 0.971388453516941, 'batch_size': 32, 'n_steps': 512, 'ent_coef': 1.0116548017030543e-06, 'clip_range': 0.2412908958496473, 'gae_lambda': 0.8998563405402077}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 21:06:18,305] Trial 29 finished with value: 2.9088590145111084 and parameters: {'learning_rate': 0.0009503845669280962, 'gamma': 0.9217208797019403, 'batch_size': 64, 'n_steps': 128, 'ent_coef': 2.0599339687566087e-06, 'clip_range': 0.15739967194557786, 'gae_lambda': 0.9529198925402758}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 21:08:20,524] Trial 30 finished with value: 2.968217134475708 and parameters: {'learning_rate': 0.0020937809765009846, 'gamma': 0.9121310549926691, 'batch_size': 64, 'n_steps': 128, 'ent_coef': 7.425434987926803e-05, 'clip_range': 0.2015914632012604, 'gae_lambda': 0.8430132584349243}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 21:10:42,138] Trial 31 finished with value: 2.9682915210723877 and parameters: {'learning_rate': 0.0006011578880457148, 'gamma': 0.9314464006540591, 'batch_size': 32, 'n_steps': 128, 'ent_coef': 0.002115251692044771, 'clip_range': 0.29495034301081247, 'gae_lambda': 0.8334706614230303}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 21:12:59,595] Trial 32 finished with value: 2.9682915210723877 and parameters: {'learning_rate': 0.00012978042250230852, 'gamma': 0.9041637933785269, 'batch_size': 32, 'n_steps': 128, 'ent_coef': 0.0009320482693288449, 'clip_range': 0.31619359613176407, 'gae_lambda': 0.8151968197033053}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 21:15:17,944] Trial 33 finished with value: 2.9682915210723877 and parameters: {'learning_rate': 0.00361565400603009, 'gamma': 0.9473487108122831, 'batch_size': 32, 'n_steps': 128, 'ent_coef': 0.009873885236035395, 'clip_range': 0.36233797544226415, 'gae_lambda': 0.836343103029641}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 21:17:08,866] Trial 34 finished with value: 2.9682915210723877 and parameters: {'learning_rate': 0.0002945921769236891, 'gamma': 0.9237289548438415, 'batch_size': 128, 'n_steps': 128, 'ent_coef': 0.00014788530142693931, 'clip_range': 0.2652303385540081, 'gae_lambda': 0.8790023522120174}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 21:19:29,282] Trial 35 finished with value: 2.967740058898926 and parameters: {'learning_rate': 0.006536350089205032, 'gamma': 0.9117687180735002, 'batch_size': 32, 'n_steps': 512, 'ent_coef': 0.0003925538551580069, 'clip_range': 0.285739439205685, 'gae_lambda': 0.8543866710985538}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 21:21:23,144] Trial 36 finished with value: 1.201308012008667 and parameters: {'learning_rate': 0.0013965167800663782, 'gamma': 0.9333409130467135, 'batch_size': 128, 'n_steps': 128, 'ent_coef': 0.0015215336344771959, 'clip_range': 0.12414399448075533, 'gae_lambda': 0.884355690855486}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 21:23:40,655] Trial 37 finished with value: 2.968217134475708 and parameters: {'learning_rate': 0.0008948606392905729, 'gamma': 0.9402952469769489, 'batch_size': 32, 'n_steps': 128, 'ent_coef': 0.00021044279022720158, 'clip_range': 0.31836675248570534, 'gae_lambda': 0.987393150270538}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 21:25:26,512] Trial 38 finished with value: 2.9680628776550293 and parameters: {'learning_rate': 3.1852016614729774e-05, 'gamma': 0.9271404854720248, 'batch_size': 256, 'n_steps': 128, 'ent_coef': 3.767494985016857e-05, 'clip_range': 0.35973020649500814, 'gae_lambda': 0.8951930934822605}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 21:27:57,457] Trial 39 finished with value: 2.9680628776550293 and parameters: {'learning_rate': 0.009940298873129798, 'gamma': 0.917672337146942, 'batch_size': 32, 'n_steps': 512, 'ent_coef': 1.3879260690541916e-05, 'clip_range': 0.16888943113870547, 'gae_lambda': 0.8615741840891851}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 21:29:49,455] Trial 40 finished with value: 2.96819806098938 and parameters: {'learning_rate': 0.00013808553675527866, 'gamma': 0.9106746500361418, 'batch_size': 128, 'n_steps': 256, 'ent_coef': 0.00026576203640944345, 'clip_range': 0.25371723422349945, 'gae_lambda': 0.8244694198840533}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 21:31:53,131] Trial 41 finished with value: 2.966846227645874 and parameters: {'learning_rate': 0.003510905947948657, 'gamma': 0.9061694514519919, 'batch_size': 64, 'n_steps': 512, 'ent_coef': 7.2121282604034075e-06, 'clip_range': 0.13050980622866826, 'gae_lambda': 0.870183043348153}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 21:33:53,295] Trial 42 finished with value: 2.968195915222168 and parameters: {'learning_rate': 0.004909025904270617, 'gamma': 0.9012660335378486, 'batch_size': 64, 'n_steps': 512, 'ent_coef': 4.008964028957614e-06, 'clip_range': 0.1438516983469397, 'gae_lambda': 0.8459020855629414}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 21:35:57,654] Trial 43 finished with value: 2.96366286277771 and parameters: {'learning_rate': 0.001579384709253325, 'gamma': 0.9091314605402151, 'batch_size': 64, 'n_steps': 512, 'ent_coef': 6.401899984211594e-06, 'clip_range': 0.1513463437294796, 'gae_lambda': 0.8751782925404068}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 21:38:08,231] Trial 44 finished with value: 2.9680628776550293 and parameters: {'learning_rate': 0.006901943183399691, 'gamma': 0.914677540917494, 'batch_size': 64, 'n_steps': 512, 'ent_coef': 2.8481900758130693e-05, 'clip_range': 0.1857479930001208, 'gae_lambda': 0.8599286763094662}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 21:40:09,201] Trial 45 finished with value: 2.9682915210723877 and parameters: {'learning_rate': 0.0027580623152548677, 'gamma': 0.9234409040696626, 'batch_size': 64, 'n_steps': 512, 'ent_coef': 2.8438526236375848e-06, 'clip_range': 0.11629139749710607, 'gae_lambda': 0.8908311248048891}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 21:42:16,091] Trial 46 finished with value: 0.0 and parameters: {'learning_rate': 0.004130486104811497, 'gamma': 0.9165766365693785, 'batch_size': 64, 'n_steps': 128, 'ent_coef': 1.008190816096552e-05, 'clip_range': 0.28886701812490123, 'gae_lambda': 0.8145208145650116}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 21:44:41,149] Trial 47 finished with value: 2.968216896057129 and parameters: {'learning_rate': 0.0018703704040841547, 'gamma': 0.9978536146331979, 'batch_size': 32, 'n_steps': 256, 'ent_coef': 5.002433035013547e-06, 'clip_range': 0.23319060429061306, 'gae_lambda': 0.849384490048853}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 21:46:30,738] Trial 48 finished with value: 0.0 and parameters: {'learning_rate': 0.006268993671996071, 'gamma': 0.9000374168969688, 'batch_size': 256, 'n_steps': 256, 'ent_coef': 1.6189846336731218e-06, 'clip_range': 0.2118313724736177, 'gae_lambda': 0.8850930703706829}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42


[I 2025-04-06 21:48:56,252] Trial 49 finished with value: 2.9680628776550293 and parameters: {'learning_rate': 1.0161288381546387e-05, 'gamma': 0.9664916228616379, 'batch_size': 32, 'n_steps': 128, 'ent_coef': 5.2432697315807034e-05, 'clip_range': 0.1852565658739072, 'gae_lambda': 0.8286965570074128}. Best is trial 1 with value: 2.9682915210723877.


Seed in the environment: 42
Best hyperparameters: {'learning_rate': 0.0026318231496464234, 'gamma': 0.9121417756631408, 'batch_size': 64, 'n_steps': 256, 'ent_coef': 1.2544509602022392e-05, 'clip_range': 0.1540633153737754, 'gae_lambda': 0.8479590972875454}


Best hyperparameters: {'learning_rate': 0.0026318231496464234, 'gamma': 0.9121417756631408, 'batch_size': 64, 'n_steps': 256, 'ent_coef': 1.2544509602022392e-05, 'clip_range': 0.1540633153737754, 'gae_lambda': 0.8479590972875454}

# 6. Erstellen des Agenten

In [9]:
# Erstelle den PPO-Agenten ohne Hyperparametern
model_without = PPO("MlpPolicy", env, seed=SEED, verbose=1)

Using cpu device


In [14]:
# Mittleres Neuronales Netz
policy_kwargs = dict(
    net_arch=[dict(pi=[128, 128], vf=[128, 128])],  # Zwei Layer mit 128 Neuronen
    activation_fn=nn.ReLU,  # Verwende ReLU als Aktivierungsfunktion
)

# Erstelle den PPO-Agenten mit Optuna-Hyperparametern
model_optuna = PPO(
    "MlpPolicy",
    env,
    learning_rate=0.0026318231496464234,
    gamma=0.9121417756631408,
    batch_size=64,
    n_steps=256,
    ent_coef=1.2544509602022392e-05,
    clip_range=0.1540633153737754,
    gae_lambda=0.8479590972875454,
    policy_kwargs=policy_kwargs,
    verbose=1,
    seed=SEED,
    device="cuda",
)

Using cpu device


In [15]:
# Erstelle den PPO-Agenten mit einem kleineren Netzwerk und Custom parametern
policy_kwargs_small = dict(
    net_arch=[dict(pi=[64, 64], vf=[64, 64])],         # Einfaches Netz
    activation_fn=nn.Tanh,     # Testweise Tanh statt ReLU
)

model_custom_small = PPO(
    "MlpPolicy",
    env,
    learning_rate=0.0003,         # Standard bei SB3
    gamma=0.95,                   # Etwas kürzerer Zeithorizont
    batch_size=64,
    n_steps=512,                  # Etwas mehr Kontext
    ent_coef=0.01,                # Höhere Entropiestrafe → mehr Exploration
    clip_range=0.2,
    gae_lambda=0.92,
    policy_kwargs=policy_kwargs_small,
    verbose=1,
    seed=SEED,
    device="cuda",
)

Using cpu device


In [16]:
# Erstelle den PPO-Agenten mit einem größeren Netzwerk und Custom parametern
policy_kwargs_deep = dict(
    net_arch=[dict(pi=[256, 256, 128], vf=[256, 256, 128])],  # Tieferes Netz
    activation_fn=nn.ReLU,
)

model_custom_deep = PPO(
    "MlpPolicy",
    env,
    learning_rate=0.0001,        # Sehr vorsichtige Lernrate
    gamma=0.99,
    batch_size=128,
    n_steps=1024,                # Längere Rollouts
    ent_coef=0.0001,             # Wenig Exploration
    clip_range=0.25,
    gae_lambda=0.95,
    policy_kwargs=policy_kwargs_deep,
    verbose=1,
    seed=SEED,
    device="cuda",
)

Using cpu device


# 7. Modell trainieren und speichern

In [None]:
model_without.learn(total_timesteps=200_000)
model_without.save("PPO_Model_Without_200K")

-----------------------------
| time/              |      |
|    fps             | 949  |
|    iterations      | 1    |
|    time_elapsed    | 8    |
|    total_timesteps | 8192 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 501         |
|    iterations           | 2           |
|    time_elapsed         | 32          |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.119216524 |
|    clip_fraction        | 0.626       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.78       |
|    explained_variance   | 0.0557      |
|    learning_rate        | 0.0003      |
|    loss                 | -0.109      |
|    n_updates            | 140         |
|    policy_gradient_loss | -0.056      |
|    value_loss           | 0.00788     |
-----------------------------------------
----------------------------------

In [None]:
model_optuna.learn(total_timesteps=200_000)
model_optuna.save("PPO_Model_Optuna_200K")

In [None]:
model_custom_small.learn(total_timesteps=200_000)
model_custom_small.save("PPO_Model_Custom_Small_200K")

In [None]:
model_custom_deep.learn(total_timesteps=200_000)
model_custom_deep.save("PPO_Model_Custom_Deep_200K")