In [None]:
%run ../../Environment/environment_withPortfolio.ipynb
%run ../../Environment/environment_withoutPortfolio.ipynb

# 1. Bibliotheken importieren

In [None]:
# Standardbibliotheken
import random
from collections import Counter

# Wissenschaftliche Bibliotheken
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Preprocessing & Modellpersistenz
from sklearn.preprocessing import StandardScaler
import joblib

# PyTorch (für benutzerdefinierte Netzwerke)
import torch
from torch import nn

# Reinforcement Learning (Stable Baselines 3)
from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3.common.vec_env import DummyVecEnv

# Gym Umgebung
import gym

# Hyperparameter-Tuning
import optuna

# 2. Konfiguration setzen

In [None]:
# Base seed for the whole notebook. It is reduced modulo (2**32 - 1), so SEED
# always lies in the range [0, 2**32 - 2].
seed = 42
SEED  = seed % (2**32 - 1)
print(f"SEED: {SEED}")

# Passed to the trading environment as `initial_cash`.
INITIAL_CASH = 1

# Passed to the trading environment as `window_size`.
# NOTE(review): presumably the number of past time steps per observation — confirm
# against the environment implementation.
WINDOW_SIZE = 336

# Passed to the trading environment as `scaler_path`.
SCALER_PATH = "../../Transform_data/scaler.pkl"

# Select which environment class the rest of the notebook uses under the name
# `TradingEnv`. Swap the commented line to train with the portfolio variant.
#TradingEnv = TradingEnv_withPortfolio
TradingEnv = TradingEnv_withoutPortfolio

# 3. Daten einlesen

In [None]:
# -------------------------------
# Load the CSV data
# -------------------------------
# The 'datetime' column is dropped directly after reading, without in-place
# mutation, so each frame is produced by a single expression.
train_data = pd.read_csv(
    "../../Transform_data/stand_data/2023-2018_stand_data.csv"
).drop(columns="datetime")

test_data = pd.read_csv(
    "../../Transform_data/stand_data/2025-2024_stand_data.csv"
).drop(columns="datetime")

# pd.read_csv raises on failure and never returns None, so an `is not None`
# check can never fail. Verify instead that both files actually contained rows.
for _name, _frame in (("train_data", train_data), ("test_data", test_data)):
    if _frame.empty:
        raise ValueError(f"{_name} contains no rows")

print("Daten erfolgreich eingelesen")

# 4. Parallele Umgebungen erstellen für das Training

In [None]:
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.vec_env import VecNormalize

def create_env():
    """Build one trading environment on the training data.

    Takes no arguments so it can be handed to a vectorized-environment
    constructor as a factory. All settings are read from the notebook-level
    configuration (train_data, INITIAL_CASH, WINDOW_SIZE, SCALER_PATH, SEED)
    at call time.

    Returns:
        A new ``TradingEnv`` instance.
    """
    env_settings = {
        "data": train_data,
        "initial_cash": INITIAL_CASH,
        "window_size": WINDOW_SIZE,
        "scaler_path": SCALER_PATH,
        "default_seed": SEED,
    }
    return TradingEnv(**env_settings)

# Number of environments stepped in parallel, one subprocess each
# (8, 16 or even 32 are worth trying).
n_envs = 4

# Every worker is built by the same factory; the vectorized env is wrapped so
# observations and rewards are normalized, with observations clipped to +/-10.
env = VecNormalize(
    SubprocVecEnv([create_env] * n_envs),
    norm_obs=True,
    norm_reward=True,
    clip_obs=10.0,
)

# Explicitly set the training flag on the normalization wrapper.
env.training = True

# 5. Hyperparameter Evaluierung

In [None]:
# For hyperparameter tuning, a copy of test_data serves as the validation set.
# NOTE(review): if test_data is also used for the final evaluation, the reported
# test performance will be optimistically biased by this tuning — confirm, or
# carve a separate validation split out of the training period.
valid_data = test_data.copy()

# -------------------------------
# Environment factory
# -------------------------------
def make_env(data):
    """Return a zero-argument callable that builds a ``TradingEnv`` on ``data``.

    Args:
        data: Dataset handed to the environment as its ``data`` argument.

    Returns:
        A function taking no arguments. Each call creates a fresh environment
        using the notebook-level INITIAL_CASH, WINDOW_SIZE, SCALER_PATH and
        SEED, which are looked up when the function is called.
    """
    def _factory():
        env_settings = {
            "data": data,
            "initial_cash": INITIAL_CASH,
            "window_size": WINDOW_SIZE,
            "scaler_path": SCALER_PATH,
            "default_seed": SEED,
        }
        return TradingEnv(**env_settings)

    return _factory

# -------------------------------
# Evaluation helper function
# -------------------------------
def evaluate_agent(model, env, n_eval_episodes=5):
    """Run ``model`` deterministically on ``env`` and return the mean episode reward.

    Works with a plain environment whose ``step`` returns scalar ``reward`` and
    ``done``, and with a vectorized environment that returns one-element arrays
    for a single sub-environment. Rewards and done flags are reduced to Python
    scalars, so the accumulated totals and the return value are plain floats
    rather than NumPy arrays.

    Args:
        model: Object exposing ``predict(obs, deterministic=True)`` that
            returns ``(action, state)``.
        env: Environment exposing ``reset()`` returning the observation and
            ``step(action)`` returning ``(obs, reward, done, info)``.
        n_eval_episodes: Number of episodes to run.

    Returns:
        float: Mean of the per-episode reward sums, or ``nan`` when
        ``n_eval_episodes`` is 0.

    Note:
        If ``env`` wraps several sub-environments, the per-step reward is
        averaged over them and the episode ends as soon as any of them reports
        done. The helper is intended for a single environment.
    """
    episode_rewards = []
    for _ in range(n_eval_episodes):
        obs = env.reset()
        done = False
        total_reward = 0.0
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, step_done, _ = env.step(action)
            # Vectorized envs return arrays; collapse them to scalars.
            total_reward += float(np.mean(reward))
            done = bool(np.any(step_done))
        episode_rewards.append(total_reward)

    if not episode_rewards:
        # Same value np.mean([]) would give, without the RuntimeWarning.
        return float("nan")
    return float(np.mean(episode_rewards))

In [None]:
# -------------------------------
# Hyperparameter tuning with Optuna
# -------------------------------
def objective(trial):
    """Optuna objective: train a PPO agent briefly and score it on validation data.

    Samples PPO hyperparameters from ``trial``, trains for 10,000 time steps on
    four parallel training environments, then evaluates the policy
    deterministically for five episodes on a single validation environment.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The mean validation episode reward from ``evaluate_agent``.

    Notes:
        * The validation wrapper is frozen (``training = False``) and would
          otherwise keep its initial normalization statistics. The observation
          statistics learned during training are therefore copied over before
          evaluation, so the policy sees observations on the scale it was
          trained on.
        * Both vectorized environments are closed when the trial ends, also on
          failure, so worker processes do not pile up across trials.
    """
    # Hyperparameter sampling
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-2, log=True)
    gamma = trial.suggest_float("gamma", 0.90, 0.9999)
    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128, 256])
    n_steps = trial.suggest_categorical("n_steps", [128, 256, 512])  # no 1024
    ent_coef = trial.suggest_float("ent_coef", 1e-6, 0.01, log=True)
    clip_range = trial.suggest_float("clip_range", 0.1, 0.4)
    gae_lambda = trial.suggest_float("gae_lambda", 0.8, 0.99)

    # Training environment: subprocess workers wrapped with normalization
    n_envs = 4
    env_train_raw = SubprocVecEnv([make_env(train_data) for _ in range(n_envs)])
    env_train = VecNormalize(env_train_raw, norm_obs=True, norm_reward=True, clip_obs=10.0)
    env_valid = None

    try:
        env_train.training = True

        # Validation environment: one in-process env, statistics frozen and
        # rewards left unnormalized so the score is in raw reward units.
        env_valid_raw = DummyVecEnv([make_env(valid_data)])
        env_valid = VecNormalize(env_valid_raw, norm_obs=True, norm_reward=True, clip_obs=10.0)
        env_valid.training = False
        env_valid.norm_reward = False

        # PPO agent
        model = PPO(
            "MlpPolicy",
            env_train,
            learning_rate=learning_rate,
            gamma=gamma,
            batch_size=batch_size,
            n_steps=n_steps,
            ent_coef=ent_coef,
            clip_range=clip_range,
            gae_lambda=gae_lambda,
            verbose=0,
            seed=SEED,
            policy_kwargs=dict(
                net_arch=dict(pi=[128, 128], vf=[128, 128]),
                activation_fn=nn.ReLU,
            )
        )

        # Training
        model.learn(total_timesteps=10_000, log_interval=1)

        # Evaluate with the observation statistics gathered during training.
        env_valid.obs_rms = env_train.obs_rms
        mean_reward = evaluate_agent(model, env_valid, n_eval_episodes=5)

        return mean_reward
    finally:
        # Release worker processes and environment resources for this trial.
        if env_valid is not None:
            env_valid.close()
        env_train.close()

In [None]:
# -------------------------------
# Start the Optuna optimization
# -------------------------------
def run_optuna(n_trials=50):
    """Run the hyperparameter search and report the best parameters.

    The study maximizes ``objective``. The sampler is seeded with the
    notebook-level SEED so that hyperparameter suggestions are reproducible
    between runs, in line with the seeding used elsewhere in the notebook.

    Args:
        n_trials: Number of trials to run. Defaults to 50.

    Returns:
        The finished Optuna study, so the best parameters can be read
        programmatically instead of being copied from the printed output.
    """
    sampler = optuna.samplers.TPESampler(seed=SEED)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(objective, n_trials=n_trials)

    print("Best hyperparameters:", study.best_trial.params)
    return study

#run_optuna()

Best hyperparameters: {'learning_rate': 0.004230571749056885, 'gamma': 0.9570686121852459, 'batch_size': 64, 'n_steps': 256, 'ent_coef': 2.143685006303078e-06, 'clip_range': 0.2700826948221078, 'gae_lambda': 0.8247379749162164}

# 6. Erstellen des Agenten

In [None]:
# Baseline PPO agent: only the policy type, environment, seed and verbosity are
# given, so every other hyperparameter keeps the library default.
model_without = PPO(
    policy="MlpPolicy",
    env=env,
    seed=SEED,
    verbose=1,
)

In [None]:
# Medium-sized network: two hidden layers of 128 units each for the policy (pi)
# and the value function (vf), with ReLU activations.
# net_arch is passed as a plain dict, matching the form used in the Optuna
# objective; the list-wrapped form `[dict(...)]` is deprecated in
# Stable-Baselines3.
policy_kwargs = dict(
    net_arch=dict(pi=[128, 128], vf=[128, 128]),
    activation_fn=nn.ReLU,
)

# PPO agent using the best hyperparameters reported by the Optuna search.
model_optuna = PPO(
    "MlpPolicy",
    env,
    learning_rate=0.004230571749056885,
    gamma=0.9570686121852459,
    batch_size=64,
    n_steps=256,
    ent_coef=2.143685006303078e-06,
    clip_range=0.2700826948221078,
    gae_lambda=0.8247379749162164,
    policy_kwargs=policy_kwargs,
    verbose=1,
    seed=SEED,
    # "auto" uses the GPU when one is available and falls back to the CPU
    # otherwise, instead of failing on machines without CUDA.
    device="auto",
)

In [None]:
# PPO agent with a smaller network and hand-picked hyperparameters.
# net_arch is passed as a plain dict, matching the form used in the Optuna
# objective; the list-wrapped form `[dict(...)]` is deprecated in
# Stable-Baselines3.
policy_kwargs_small = dict(
    net_arch=dict(pi=[64, 64], vf=[64, 64]),  # simple network
    activation_fn=nn.Tanh,                    # trying Tanh instead of ReLU
)

model_custom_small = PPO(
    "MlpPolicy",
    env,
    learning_rate=0.0003,         # SB3 default
    gamma=0.95,                   # somewhat shorter time horizon
    batch_size=64,
    n_steps=512,                  # somewhat longer rollouts
    ent_coef=0.01,                # larger entropy coefficient -> more exploration
    clip_range=0.2,
    gae_lambda=0.92,
    policy_kwargs=policy_kwargs_small,
    verbose=1,
    seed=SEED,
    # "auto" uses the GPU when one is available and falls back to the CPU
    # otherwise, instead of failing on machines without CUDA.
    device="auto",
)

In [None]:
# PPO agent with a larger network and hand-picked hyperparameters.
# net_arch is passed as a plain dict, matching the form used in the Optuna
# objective; the list-wrapped form `[dict(...)]` is deprecated in
# Stable-Baselines3.
policy_kwargs_deep = dict(
    net_arch=dict(pi=[256, 256, 128], vf=[256, 256, 128]),  # deeper network
    activation_fn=nn.ReLU,
)

model_custom_deep = PPO(
    "MlpPolicy",
    env,
    learning_rate=0.0001,        # very cautious learning rate
    gamma=0.99,
    batch_size=128,
    n_steps=1024,                # longer rollouts
    ent_coef=0.0001,             # little exploration
    clip_range=0.25,
    gae_lambda=0.95,
    policy_kwargs=policy_kwargs_deep,
    verbose=1,
    seed=SEED,
    # "auto" uses the GPU when one is available and falls back to the CPU
    # otherwise, instead of failing on machines without CUDA.
    device="auto",
)

# 7. Modell trainieren und speichern

In [None]:
# Train the baseline agent and save it.
# NOTE(review): the file name says "200K" but training runs for 100,000 steps —
# confirm which is intended; the name is kept so existing loaders still work.
model_without.learn(total_timesteps=100_000)
model_without.save("Dueck_Without_1_EUR_200K")
# The policy was trained on observations normalized by `env`. Save those running
# statistics next to the model so they can be restored for evaluation.
env.save("Dueck_Without_1_EUR_200K_vecnormalize.pkl")

In [None]:
# Train the Optuna-tuned agent and save it.
# NOTE(review): the file name says "200K" but training runs for 100,000 steps —
# confirm which is intended; the name is kept so existing loaders still work.
model_optuna.learn(total_timesteps=100_000)
model_optuna.save("Dueck_Without_Optuna_1_EUR_200K")
# The policy was trained on observations normalized by `env`. Save those running
# statistics next to the model so they can be restored for evaluation.
env.save("Dueck_Without_Optuna_1_EUR_200K_vecnormalize.pkl")

In [None]:
# Train the small custom agent and save it.
# NOTE(review): the file name says "200K" but training runs for 100,000 steps —
# confirm which is intended; the name is kept so existing loaders still work.
model_custom_small.learn(total_timesteps=100_000)
model_custom_small.save("Dueck_Without_Custom_Small_1_EUR_200K")
# The policy was trained on observations normalized by `env`. Save those running
# statistics next to the model so they can be restored for evaluation.
env.save("Dueck_Without_Custom_Small_1_EUR_200K_vecnormalize.pkl")

In [None]:
# Train the deep custom agent and save it.
# NOTE(review): the file name says "200K" but training runs for 100,000 steps —
# confirm which is intended; the name is kept so existing loaders still work.
model_custom_deep.learn(total_timesteps=100_000)
model_custom_deep.save("Dueck_Without_Custom_Deep_1_EUR_200K")
# The policy was trained on observations normalized by `env`. Save those running
# statistics next to the model so they can be restored for evaluation.
env.save("Dueck_Without_Custom_Deep_1_EUR_200K_vecnormalize.pkl")