In [None]:
# Execute both environment notebooks inside this kernel's namespace.
# NOTE(review): they presumably define TradingEnv_withPortfolio and
# TradingEnv_withoutPortfolio, which the configuration cell refers to — confirm.
%run ../../Environment/environment_withPortfolio.ipynb
%run ../../Environment/environment_withoutPortfolio.ipynb

In [None]:
import gym
import optuna
import numpy as np
import pandas as pd
from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.callbacks import CheckpointCallback
from sklearn.preprocessing import StandardScaler
from stable_baselines3.common.vec_env import DummyVecEnv

In [None]:
# --- Reproducibility ---------------------------------------------------------
# Base seed; SEED is the value handed to the environments and the agents.
seed = 42
SEED = seed % (2**32 - 1)
print(f"SEED: {SEED}")

# --- Simulation parameters ---------------------------------------------------
INITIAL_CASH = 1  # passed to the environment as initial_cash
WINDOW_SIZE = 336  # passed to the environment as window_size
SCALER_PATH = "../../Transform_data/scaler.pkl"  # path of the scaler file

# --- Environment selection ---------------------------------------------------
# Exactly one of the two assignments below should be active; it decides which
# environment class the rest of the notebook instantiates as `TradingEnv`.
#TradingEnv = TradingEnv_withPortfolio
TradingEnv = TradingEnv_withoutPortfolio

# CSV-Daten einlesen


In [None]:

# Training split: read the CSV and discard the 'datetime' column in one chain
# (no in-place mutation of the freshly loaded frame).
train_data = pd.read_csv(
    "../../Transform_data/stand_data/2023-2018_stand_data.csv"
).drop(columns="datetime")

# Test split: loaded exactly the same way.
test_data = pd.read_csv(
    "../../Transform_data/stand_data/2025-2024_stand_data.csv"
).drop(columns="datetime")

print("✅ Trainings- und Testdaten erfolgreich geladen.")

# TradingEnv erstellen 

In [None]:
# Training environment built on the training split. The scaler location comes
# from the SCALER_PATH constant of the configuration cell instead of a
# duplicated string literal, so changing the constant actually takes effect.
env = TradingEnv(
    data=train_data,
    initial_cash=INITIAL_CASH,
    window_size=WINDOW_SIZE,
    scaler_path=SCALER_PATH,
    default_seed=SEED,
)

print("✅ Environment erfolgreich erstellt.")

# Optuna: Hyperparameter-Tuning


In [None]:


# Hyperparameter tuning uses test_data as the validation data set.
# NOTE(review): the same test_data is also used for the test backtest at the
# end of this notebook, so tuning on it leaks test information into model
# selection — consider holding out a separate validation split instead.
valid_data = test_data.copy(deep=True)

# -------------------------------
# Evaluation Helper Function
# -------------------------------
def evaluate_agent(model, env, n_eval_episodes=5):
    """Run ``model`` deterministically on ``env`` and return the mean episode reward.

    Both reset/step conventions are accepted:

    * ``reset()`` returning ``(obs, info)`` or just ``obs``;
    * ``step()`` returning ``(obs, reward, terminated, truncated, info)`` or
      ``(obs, reward, done, info)``.

    Args:
        model: Object exposing ``predict(obs, deterministic=True)`` that
            returns ``(action, state)``.
        env: Environment to evaluate on (a single environment, or a vectorized
            wrapper around one environment).
        n_eval_episodes: Number of episodes to run.

    Returns:
        float: Mean of the per-episode summed rewards.
    """
    episode_rewards = []
    for _ in range(n_eval_episodes):
        reset_result = env.reset()
        # The newer API returns (obs, info); older/vectorized APIs return obs only.
        obs = reset_result[0] if isinstance(reset_result, tuple) else reset_result

        done = False
        total_reward = 0.0
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            step_result = env.step(action)
            if len(step_result) == 5:
                obs, reward, terminated, truncated, _ = step_result
                # An episode ends when it terminates OR is truncated; looking
                # only at `terminated` would never leave the loop on truncation.
                done = bool(np.any(terminated)) or bool(np.any(truncated))
            else:
                obs, reward, step_done, _ = step_result
                done = bool(np.any(step_done))
            # Vectorized wrappers return length-1 arrays; reduce to a plain float.
            total_reward += float(np.sum(reward))
        episode_rewards.append(total_reward)
    return float(np.mean(episode_rewards))

# -------------------------------
# Hyperparameter Tuning with Optuna (für A2C)
# -------------------------------
def objective(trial):
    """Optuna objective: train an A2C agent with sampled hyperparameters and score it.

    Args:
        trial: Optuna trial used to sample the A2C hyperparameters.

    Returns:
        The mean episode reward reported by ``evaluate_agent`` over 5 episodes
        on an environment built from ``valid_data``.
    """
    from contextlib import closing

    # Sample hyperparameters for A2C (order kept stable so sampler sequences match).
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-2, log=True)
    gamma = trial.suggest_float("gamma", 0.90, 0.9999)
    n_steps = trial.suggest_categorical("n_steps", [5, 10, 16, 32, 64])
    ent_coef = trial.suggest_float("ent_coef", 1e-6, 0.01, log=True)
    vf_coef = trial.suggest_float("vf_coef", 0.1, 1.0)
    max_grad_norm = trial.suggest_float("max_grad_norm", 0.3, 1.0)
    gae_lambda = trial.suggest_float("gae_lambda", 0.8, 0.99)

    def _make_vec_env(data):
        """Wrap a fresh TradingEnv over ``data`` in a single-env DummyVecEnv."""
        return DummyVecEnv([lambda: TradingEnv(
            data=data,
            initial_cash=INITIAL_CASH,
            window_size=WINDOW_SIZE,
            scaler_path=SCALER_PATH,
            default_seed=SEED,
        )])

    # `closing` guarantees both environments are closed after every trial, even
    # when training or evaluation raises, so trials do not leak environments.
    with closing(_make_vec_env(train_data)) as env_train, \
            closing(_make_vec_env(valid_data)) as env_valid:
        # A2C model configured with the sampled hyperparameters
        model = A2C(
            "MlpPolicy",
            env_train,
            learning_rate=learning_rate,
            gamma=gamma,
            n_steps=n_steps,
            ent_coef=ent_coef,
            vf_coef=vf_coef,
            max_grad_norm=max_grad_norm,
            gae_lambda=gae_lambda,
            verbose=0,
            seed=SEED,
        )

        # Training
        model.learn(total_timesteps=10000, log_interval=1)

        # Evaluation on the validation environment
        return evaluate_agent(model, env_valid, n_eval_episodes=5)

# Optuna-Studie starten
#study = optuna.create_study(direction="maximize")
#study.optimize(objective, n_trials=50)
#print("Best hyperparameters:", study.best_trial.params)



In [None]:
version = 2  # <---- select the agent variant to train

# Variant presets: version -> (extra A2C keyword arguments, save name).
# Arguments shared by every variant (policy, env, seed, verbose) are supplied
# once at the construction site below.
_a2c_presets = {
    # Model 1 – default A2C agent (baseline)
    1: ({}, "A2C_Model1_100K"),
    # Model 2 – Optuna-optimized parameters
    2: (
        {
            "learning_rate": 0.002181000085419467,
            "gamma": 0.9375026778731048,
            "n_steps": 5,
            "ent_coef": 4.348163518300396e-06,
            "vf_coef": 0.3865716709856257,
            "max_grad_norm": 0.5566804279536217,
            "gae_lambda": 0.8321522058182133,
        },
        "A2C_Model2_100K",
    ),
    # Model 3 – agent should explore more
    3: (
        {
            "learning_rate": 0.0007,        # moderate
            "n_steps": 5,                   # short rollouts -> more frequent feedback
            "gamma": 0.99,
            "gae_lambda": 0.95,
            "ent_coef": 0.05,               # very high entropy -> explores more
            "vf_coef": 0.5,
            "max_grad_norm": 0.5,
            "use_rms_prop": True,
            "normalize_advantage": True,
        },
        "A2C_Model3_100K",
    ),
    # Model 4 – less erratic behavior, focus on stable policy updates
    4: (
        {
            "learning_rate": 0.0001,        # much lower
            "n_steps": 20,                  # longer rollouts
            "gamma": 0.95,                  # more conservative weighting of future rewards
            "gae_lambda": 0.9,
            "ent_coef": 0.0001,             # almost no exploration pressure
            "vf_coef": 0.25,
            "max_grad_norm": 0.3,
            "use_rms_prop": True,
            "normalize_advantage": True,
        },
        "A2C_Model4_100K",
    ),
}

if version not in _a2c_presets:
    raise ValueError("Ungültige Agenten-Version: nur 1 bis 4 erlaubt")

_a2c_kwargs, _a2c_save_name = _a2c_presets[version]

# Build, train and persist the selected variant.
model = A2C("MlpPolicy", env, verbose=1, seed=SEED, **_a2c_kwargs)
model.learn(total_timesteps=100_000)
model.save(_a2c_save_name)

# Backtesting

## Trainingsdaten

In [None]:
# Backtest on the same environment instance the agent was trained with.
training_env = env

obs, info = training_env.reset(seed=SEED)
done = False

# Actions chosen at each step of the episode
action_list = []

while not done:
    # Choose the action deterministically
    action, _states = model.predict(obs, deterministic=True)
    action = int(action)  # alternatively: action = action.item()
    obs, reward, terminated, truncated, info = training_env.step(action)
    # The episode is over when it terminates OR is truncated; checking only the
    # third return value would keep stepping past a truncated episode.
    done = bool(terminated or truncated)
    action_list.append(action)

# Render the final state (e.g. as a plot); the render mode can be adjusted.
training_env.render(mode='human')
print(action_list)

## Testdaten

In [None]:
# Fresh environment over the test split; the scaler location comes from the
# SCALER_PATH constant of the configuration cell rather than a repeated literal.
test_env = TradingEnv(
    data=test_data,
    initial_cash=INITIAL_CASH,
    window_size=WINDOW_SIZE,
    scaler_path=SCALER_PATH,
    default_seed=SEED,
)

obs, info = test_env.reset(seed=SEED)
done = False

# Actions chosen at each step of the episode
action_list = []

while not done:
    # Choose the action deterministically
    action, _states = model.predict(obs, deterministic=True)
    action = int(action)  # alternatively: action = action.item()
    obs, reward, terminated, truncated, info = test_env.step(action)
    # The episode is over when it terminates OR is truncated; checking only the
    # third return value would keep stepping past a truncated episode.
    done = bool(terminated or truncated)
    action_list.append(action)

# Render the final state (e.g. as a plot); the render mode can be adjusted.
test_env.render(mode='human')
print(action_list)