In [None]:
# Execute the two environment notebooks inside this kernel's namespace.
# Later cells refer to TradingEnv_withPortfolio / TradingEnv_withoutPortfolio,
# which are presumably defined by these notebooks -- TODO confirm.
%run ../../Environment/environment_withPortfolio.ipynb
%run ../../Environment/environment_withoutPortfolio.ipynb

In [None]:
import os
import sys

# Resolve the project root (two directories above the current working directory)
# so that project packages can be imported from this notebook.
project_root = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))

# Register the path only once: re-running this cell must not keep appending
# duplicate entries to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

print("Updated Python path:", sys.path)  # Debugging check

In [None]:
# Shell escape: export the notebook DQN_Agent.ipynb as a plain Python script via nbconvert.
# NOTE(review): presumably DQN_Agent.ipynb is this notebook itself -- confirm.
!jupyter nbconvert --to script DQN_Agent.ipynb

# 1. Bibliotheken importieren

In [None]:
# Standardbibliotheken
import random
from collections import Counter

# Wissenschaftliche Bibliotheken
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Preprocessing & Modellpersistenz
from sklearn.preprocessing import StandardScaler
import joblib

# PyTorch (für benutzerdefinierte Netzwerke)
import torch
from torch import nn

# Reinforcement Learning (Stable Baselines 3)
from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3.common.vec_env import DummyVecEnv

# Gym Umgebung
import gym

# Hyperparameter-Tuning
import optuna

# 2. Daten setzen

In [None]:
# --- Reproducibility ---
# The raw seed is reduced modulo 2**32 - 1 before it is handed to the
# environments and the agent.
seed = 42
SEED = seed % ((1 << 32) - 1)
print(f"SEED: {SEED}")

# --- Environment settings (passed to the TradingEnv constructor) ---
SCALER_PATH = "../../Transform_data/scaler.pkl"
WINDOW_SIZE = 336
INITIAL_CASH = 1

# Environment class used throughout the notebook.
# Alternative: TradingEnv = TradingEnv_withPortfolio
TradingEnv = TradingEnv_withoutPortfolio

# 3. Daten einlesen

In [None]:
# -------------------------------
# Read CSV data
# -------------------------------
# The 'datetime' column is removed right after loading. drop(columns=...) returns
# a new frame instead of mutating in place, which keeps the cell chainable.
train_data = pd.read_csv(
    "../../Transform_data/stand_data/2023-2018_stand_data.csv"
).drop(columns="datetime")

test_data = pd.read_csv(
    "../../Transform_data/stand_data/2025-2024_stand_data.csv"
).drop(columns="datetime")

# pd.read_csv never returns None (it raises on failure), so a None check is
# always true. Validate that the frames actually contain rows instead.
if train_data.empty or test_data.empty:
    raise ValueError("Loaded training or test data is empty")
print("Daten erfolgreich eingelesen")

# 4. Umgebungen erstellen für das Training

In [None]:
# -------------------------------
# Build the training environment from train_data
# -------------------------------
# All settings come from the configuration cell.
env = TradingEnv(
    data=train_data,
    window_size=WINDOW_SIZE,
    initial_cash=INITIAL_CASH,
    default_seed=SEED,
    scaler_path=SCALER_PATH,
)

if not (env is None):
    print("Environment created successfully")

# 5. Hyperparameter-Evaluierung

In [None]:
# For hyperparameter tuning, a copy of test_data serves as the validation dataset.
# NOTE(review): test_data is also used for the final test run further below, so the
# tuned hyperparameters have already seen the test set -- consider a separate split.
valid_data = test_data.copy()

# -------------------------------
# Evaluation Helper Function
# -------------------------------
def evaluate_agent(model, env, n_eval_episodes=5):
    """
    Evaluate the model over a number of episodes.

    Works with both the legacy gym API (``reset() -> obs``,
    ``step() -> (obs, reward, done, info)``) and the gymnasium API
    (``reset() -> (obs, info)``,
    ``step() -> (obs, reward, terminated, truncated, info)``).

    Parameters:
    - model: Object exposing ``predict(obs, deterministic=True)`` that returns
      ``(action, state)``.
    - env: Environment to roll the model out in.
    - n_eval_episodes: Number of episodes to run.

    Returns:
    - The average cumulative reward over all episodes.
    """
    episode_rewards = []
    for _ in range(n_eval_episodes):
        # reset() returns (obs, info) in gymnasium and just obs in legacy gym
        reset_result = env.reset()
        if isinstance(reset_result, tuple):
            obs, info = reset_result
        else:
            obs = reset_result
        done = False
        total_reward = 0.0
        while not done:
            action, _states = model.predict(obs, deterministic=True)
            step_result = env.step(action)
            if len(step_result) == 5:
                obs, reward, terminated, truncated, info = step_result
                # An episode is over when it terminates OR is truncated;
                # ignoring `truncated` would keep stepping a finished episode.
                done = bool(terminated or truncated)
            else:
                obs, reward, done, info = step_result
            total_reward += reward
        episode_rewards.append(total_reward)
    return np.mean(episode_rewards)

In [None]:
def objective(trial):
    """
    Optuna objective: train a DQN agent with sampled hyperparameters and score it.

    The agent is trained for 50,000 timesteps on ``train_data`` and then run for
    three deterministic episodes on ``valid_data``. The score is the mean episode
    reward plus the mean annualized Sharpe ratio of the portfolio values found in
    the step infos (0.0 if no Sharpe ratio could be computed).

    Relies on the notebook-level names ``os``, ``optuna``, ``train_data``,
    ``valid_data``, ``SEED``, ``INITIAL_CASH`` and ``WINDOW_SIZE``.

    Parameters:
    - trial: Optuna trial used to sample hyperparameters and to report the score.

    Returns:
    - The score to maximize (float).

    Raises:
    - optuna.exceptions.TrialPruned: if the pruner rejects the reported score.
    """
    from stable_baselines3 import DQN
    from stable_baselines3.common.vec_env import DummyVecEnv
    from Environment.environment_withPortfolio import TradingEnv_withPortfolio
    import numpy as np
    import gc

    # NOTE(review): tuning always runs on TradingEnv_withPortfolio, while the
    # notebook-level TradingEnv alias points to TradingEnv_withoutPortfolio --
    # confirm that this difference is intended.
    TradingEnv = TradingEnv_withPortfolio
    try:
        BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        # __file__ is not defined inside a notebook kernel
        BASE_DIR = os.getcwd()
    scaler_path = os.path.join(BASE_DIR, '..', '..', 'Transform_data', 'scaler.pkl')

    # === Hyperparameter sampling ===
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-2, log=True)
    gamma = trial.suggest_float("gamma", 0.90, 0.9999)
    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])
    net_arch = trial.suggest_categorical("net_arch", [[64, 64], [128, 128], [256, 256]])
    tau = trial.suggest_float("tau", 0.01, 1.0)
    exploration_fraction = trial.suggest_float("exploration_fraction", 0.1, 0.5)
    target_update_interval = trial.suggest_int("target_update_interval", 100, 1000)

    policy_kwargs = dict(net_arch=net_arch)

    # === Create training and validation environments ===
    env_train = DummyVecEnv([lambda: TradingEnv(
        data=train_data,
        initial_cash=INITIAL_CASH,
        window_size=WINDOW_SIZE,
        scaler_path=scaler_path,
        default_seed=SEED
    )])

    env_valid = DummyVecEnv([lambda: TradingEnv(
        data=valid_data,
        initial_cash=INITIAL_CASH,
        window_size=WINDOW_SIZE,
        scaler_path=scaler_path,
        default_seed=SEED
    )])

    try:
        # === Create the model ===
        model = DQN(
            "MlpPolicy",
            env_train,
            learning_rate=learning_rate,
            gamma=gamma,
            batch_size=batch_size,
            tau=tau,
            exploration_fraction=exploration_fraction,
            target_update_interval=target_update_interval,
            buffer_size=10_000,  # small replay buffer to save memory
            verbose=0,
            seed=SEED,
            policy_kwargs=policy_kwargs
        )

        # === Train the model ===
        model.learn(total_timesteps=50_000)

        # === Evaluate on the validation environment ===
        rewards = []
        sharpe_ratios = []

        for _ in range(3):
            obs = env_valid.reset()
            done = False
            total_reward = 0.0
            portfolio = []

            while not done:
                action, _states = model.predict(obs, deterministic=True)
                # A VecEnv returns batched results with one entry per wrapped
                # environment; in particular `infos` is a list of dicts, not a
                # dict. Only a single environment is wrapped here.
                obs, step_rewards, dones, infos = env_valid.step(action)
                total_reward += float(step_rewards[0])
                done = bool(dones[0])

                info = infos[0]
                if "portfolio_value" in info:
                    portfolio.append(info["portfolio_value"])

            rewards.append(total_reward)

            # === Annualized Sharpe ratio (8760 periods per year) ===
            if len(portfolio) > 2:
                values = np.asarray(portfolio, dtype=float)
                returns = np.diff(values) / values[:-1]
                volatility = returns.std()
                # Skip degenerate episodes (constant or non-finite returns)
                if np.isfinite(volatility) and volatility != 0:
                    sharpe_ratios.append(returns.mean() / volatility * np.sqrt(8760))

        # === Compute the score ===
        avg_reward = np.mean(rewards)
        avg_sharpe = np.mean(sharpe_ratios) if sharpe_ratios else 0.0
        score = float(avg_reward + avg_sharpe)

        # === Pruning check ===
        # The score is reported once (step 0) after training has finished, so a
        # pruned trial has already spent its full training budget.
        trial.report(score, step=0)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

        return score
    finally:
        # === Free resources, also when the trial is pruned or fails ===
        env_train.close()
        env_valid.close()
        gc.collect()

In [None]:
# Seed the sampler so that the sequence of suggested hyperparameters is
# reproducible across kernel restarts; without an explicit sampler the study
# uses an unseeded one.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=50)
print("Best hyperparameters:", study.best_trial.params)

# 6. Erstellen des Agenten

In [None]:
# -------------------------------
# Initialize the DQN agent using the best hyperparameters from the study
# -------------------------------
# The tuned parameters map directly onto DQN keyword arguments, except for
# net_arch, which has to be passed through policy_kwargs.
best_params = dict(study.best_trial.params)
policy_kwargs = dict(activation_fn=nn.ReLU)
if "net_arch" in best_params:
    policy_kwargs["net_arch"] = list(best_params.pop("net_arch"))

model = DQN(
    policy="MlpPolicy",
    env=env,
    buffer_size=50000,
    seed=SEED,
    policy_kwargs=policy_kwargs,
    **best_params
)

# 7. Modell trainieren und speichern

In [None]:
# -------------------------------
# Training
# -------------------------------
# Train for 10000 timesteps with log_interval=1.
model.learn(log_interval=1, total_timesteps=10000)

# Persist the trained model
model.save("DQN_final_verglich")

In [None]:
# -------------------------------
# Test run: execute the trained agent for one episode (training environment)
# -------------------------------
training_env = env

# reset() returns (obs, info) in gymnasium and just obs in legacy gym
reset_result = training_env.reset(seed=SEED)
if isinstance(reset_result, tuple):
    obs, info = reset_result
else:
    obs = reset_result
done = False

# Actions chosen by the agent, in order
action_list = []

while not done:
    action, _states = model.predict(obs, deterministic=True)
    action = int(action)  # convert the predicted action to a plain int
    step_result = training_env.step(action)
    if len(step_result) == 5:
        obs, reward, terminated, truncated, info = step_result
        # The episode is over when it terminates OR is truncated; ignoring
        # `truncated` would keep stepping an environment that already finished.
        done = bool(terminated or truncated)
    else:
        obs, reward, done, info = step_result
    action_list.append(action)

# Render the final state of the environment
training_env.render(mode='human')
print(action_list)

In [None]:

# -------------------------------
# Plot action distribution
# -------------------------------
# Count how often each of the action ids 0..8 appears in action_list;
# ids that never occur are plotted with a frequency of 0.
action_counts = Counter(action_list)
actions = [*range(9)]
counts = [action_counts[action_id] for action_id in actions]

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(actions, counts, tick_label=actions)
ax.set_xlabel("Action")
ax.set_ylabel("Frequency")
ax.set_title("Agent Action Distribution")
ax.grid(axis='y')
plt.show()

In [None]:
# -------------------------------
# Test run: execute the trained agent for one episode (test data)
# -------------------------------
# The test environment is built with the same class and settings as the training
# environment (only the data differs), so it matches the setup the model was
# trained with.
test_env = TradingEnv(
    data=test_data,
    initial_cash=INITIAL_CASH,
    window_size=WINDOW_SIZE,
    scaler_path=SCALER_PATH,
    default_seed=SEED
)

obs, info = test_env.reset(seed=SEED)
done = False

# Actions chosen by the agent, in order
action_list = []

while not done:
    # Pick the action deterministically
    action, _states = model.predict(obs, deterministic=True)
    action = int(action)  # convert the predicted action to a plain int
    obs, reward, terminated, truncated, info = test_env.step(action)
    # The episode is over when it terminates OR is truncated
    done = bool(terminated or truncated)
    action_list.append(action)

# Render the final state of the environment
test_env.render(mode='human')
print(action_list)

In [None]:

# -------------------------------
# Plot action distribution
# -------------------------------
# Count how often each of the action ids 0..8 appears in action_list;
# ids that never occur are plotted with a frequency of 0.
action_counts = Counter(action_list)
actions = [*range(9)]
counts = [action_counts[action_id] for action_id in actions]

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(actions, counts, tick_label=actions)
ax.set_xlabel("Action")
ax.set_ylabel("Frequency")
ax.set_title("Agent Action Distribution")
ax.grid(axis='y')
plt.show()

In [None]:
import numpy as np

def compute_sharpe_ratio(portfolio_values, risk_free_rate=0.0, periods_per_year=8760):
    """
    Compute the annualized Sharpe Ratio from a portfolio value history.

    Parameters:
    - portfolio_values: List or array of portfolio values over time.
    - risk_free_rate: Annual risk-free rate (default: 0).
    - periods_per_year: Number of periods in one year (default: 8760 for hourly data).

    Returns:
    - Sharpe ratio (annualized). Returns 0.0 when the ratio is undefined, i.e.
      for fewer than two values or when the excess returns have zero (or
      non-finite) standard deviation.
    """
    portfolio_values = np.asarray(portfolio_values, dtype=float)
    # At least two values are needed to form a single return
    if portfolio_values.size < 2:
        return 0.0

    # Period-to-period returns
    returns = np.diff(portfolio_values) / portfolio_values[:-1]
    # Excess returns over the per-period risk-free rate
    excess_returns = returns - risk_free_rate / periods_per_year

    volatility = np.std(excess_returns)
    # Avoid dividing by zero (constant returns) or propagating nan/inf
    if not np.isfinite(volatility) or volatility == 0:
        return 0.0

    # Annualize the Sharpe Ratio
    return np.mean(excess_returns) / volatility * np.sqrt(periods_per_year)

def compute_max_drawdown(portfolio_values):
    """
    Return the largest peak-to-trough decline of a portfolio value history.

    Parameters:
    - portfolio_values: List or array of portfolio values over time.

    Returns:
    - Maximum drawdown as a non-positive fraction of the running peak
      (e.g., -0.2 means a 20% drawdown).
    """
    values = np.array(portfolio_values)
    # Highest value seen up to and including each point in time
    running_peak = np.maximum.accumulate(values)
    # Relative distance to that peak; the most negative entry is the result
    return np.min((values - running_peak) / running_peak)

def compute_annualized_return(portfolio_values, periods_per_year=8760):
    """
    Compute the annualized return (CAGR) based on the portfolio value history.

    Parameters:
    - portfolio_values: List or array of portfolio values over time.
    - periods_per_year: Number of periods in one year.

    Returns:
    - Annualized return as a decimal (e.g., 0.12 for 12% per year).
      Returns 0.0 when fewer than two values are given (no period elapsed).
    """
    portfolio_values = np.asarray(portfolio_values, dtype=float)
    # N observed values span N - 1 periods (the same count np.diff would yield)
    total_periods = portfolio_values.size - 1
    if total_periods < 1:
        return 0.0

    total_return = portfolio_values[-1] / portfolio_values[0]
    return total_return ** (periods_per_year / total_periods) - 1

def compute_win_loss_rate(portfolio_values):
    """
    Share of winning and losing periods in a portfolio value history.

    A period counts as a win when its return is strictly positive; periods with
    a zero or negative return count as losses.

    Parameters:
    - portfolio_values: List or array of portfolio values over time.

    Returns:
    - A tuple (win_rate, loss_rate) where each value is between 0 and 1.
      Without any countable period the result is (0, 1).
    """
    values = np.array(portfolio_values)
    period_returns = np.diff(values) / values[:-1]

    n_wins = (period_returns > 0).sum()
    n_counted = n_wins + (period_returns <= 0).sum()

    if n_counted > 0:
        win_rate = n_wins / n_counted
    else:
        win_rate = 0
    return win_rate, 1 - win_rate

def compute_backtest_metrics(portfolio_values, risk_free_rate=0.0, periods_per_year=8760):
    """
    Bundle the backtesting metrics of a portfolio value history into one dict.

    Delegates to compute_sharpe_ratio, compute_max_drawdown,
    compute_annualized_return and compute_win_loss_rate.

    Parameters:
    - portfolio_values: List or array of portfolio values over time.
    - risk_free_rate: Annual risk-free rate (default: 0).
    - periods_per_year: Number of periods in one year.

    Returns:
    - Dictionary with the keys "Sharpe Ratio", "Maximum Drawdown",
      "Annualized Return", "Win Rate" and "Loss Rate".
    """
    metrics = {}
    metrics["Sharpe Ratio"] = compute_sharpe_ratio(
        portfolio_values, risk_free_rate, periods_per_year
    )
    metrics["Maximum Drawdown"] = compute_max_drawdown(portfolio_values)
    metrics["Annualized Return"] = compute_annualized_return(
        portfolio_values, periods_per_year
    )
    metrics["Win Rate"], metrics["Loss Rate"] = compute_win_loss_rate(portfolio_values)
    return metrics

# Backtest report: compute the metrics from the portfolio value history recorded
# by test_env during the finished test episode and print them with four decimals.
metrics = compute_backtest_metrics(
    test_env.portfolio_value_history,
    risk_free_rate=0.0,
    periods_per_year=8760,
)
print("Backtesting Metrics:")
for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {metric_value:.4f}")
