In [None]:
!hostname

In [None]:
!nvidia-smi

In [None]:
import subprocess
import sys

# Liste der benötigten Bibliotheken
required_packages = [
    "gymnasium", "numpy", "pandas", "joblib", "scikit-learn", "matplotlib",
    "stable-baselines3", "torch", "torchvision", "torchaudio"
]

# Funktion zum Installieren fehlender Bibliotheken
def install_packages(packages):
    for package in packages:
        try:
            __import__(package)
        except ImportError:
            print(f"📦 Installiere {package} ...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Installiere fehlende Pakete
install_packages(required_packages)

In [None]:
import os
print(os.getcwd())  # Gibt den aktuellen Arbeitsordner aus

In [1]:
%run /home/dhbw/environment.ipynb

Notebook ausgeführt


In [2]:
import numpy as np
import pandas as pd
import joblib
import random
from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.callbacks import CheckpointCallback
from sklearn.preprocessing import StandardScaler
from stable_baselines3.common.vec_env import DummyVecEnv
import torch

In [3]:
seed = 42
SEED  = seed % (2**32 - 1)
print(f"SEED: {SEED}")

SEED: 42


In [4]:
# -------------------------------
# CSV Datem einlesen
# -------------------------------
train_data = pd.read_csv("/home/dhbw/2023-2018_stand_data.csv")
train_data.drop('datetime', axis=1, inplace=True)

test_data = pd.read_csv("/home/dhbw/2025-2024_stand_data.csv")
test_data.drop('datetime', axis=1, inplace=True)

if(train_data is not None and test_data is not None):
    print("Daten erfolgreich eingelesen")

Daten erfolgreich eingelesen


In [5]:
from stable_baselines3.common.vec_env import SubprocVecEnv

def make_env():
    return TradingEnv(
        data=train_data,
        initial_cash=10_000,
        window_size=336,
        scaler_path="/home/dhbw/scaler.pkl",
        default_seed=SEED
    )

n_envs = 8  # Mehr parallele Umgebungen (8, 16 oder sogar 32 testen!)
env = SubprocVecEnv([make_env for _ in range(n_envs)])

In [6]:
# Test-Umgebung für Evaluation (ohne SubprocVecEnv, da wir nur eine Instanz brauchen)
test_env = TradingEnv(
    data=test_data,  # Oder test_data, falls du separate Test-Daten hast
    initial_cash=10_000,
    window_size=336,
    scaler_path="/home/dhbw/scaler.pkl",
    default_seed=SEED
)

Seed in the environment: 42


In [None]:
import torch

# Prüfen, ob GPU Tensoren richtig verarbeitet
tensor = torch.rand(1000, 1000).to("cuda")
for i in range(100000):
    tensor = tensor @ tensor  # Matrix-Multiplikation, sollte die GPU stark belasten

print("Fertig!")

In [7]:
from stable_baselines3.common.vec_env import VecNormalize

env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0)
env.training = True  # Sicherstellen, dass Normalisierung aktiv ist
env.device = "cuda"  # WICHTIG: Umgebung auf CUDA setzen

In [8]:
from torch import nn  # Für die Netzwerkarchitektur

# Definiere das neuronale Netz
policy_kwargs = dict(
    net_arch=[dict(pi=[128, 128], vf=[128, 128])],  # Zwei Layer mit 128 Neuronen
    activation_fn=nn.ReLU,  # Verwende ReLU als Aktivierungsfunktion
)

# Erstelle den PPO-Agenten mit verbesserten Einstellungen
model = PPO(
    "MlpPolicy",
    env,
    learning_rate=0.0003,  # Lernrate
    gamma=0.99,  # Discount-Faktor
    clip_range=0.2,  # PPO-Clip-Parameter
    ent_coef=0.01,  # Entropie-Koeffizient
    n_steps=32768  ,  # WICHTIG: Mehr Schritte pro Update → GPU-Auslastung steigt
    batch_size=16384  ,  # WICHTIG: Große Batch-Größe → GPU rechnet effizienter
    policy_kwargs=policy_kwargs,
    verbose=1,
    seed=SEED,
    device="cuda",  # Nutzt die GPU!
    #tensorboard_log="./tensorboard_log/"  # Optional: Logging für TensorBoard
)


Using cuda device




In [9]:
print(f"Env läuft auf: {env.device}")  # Sollte CUDA anzeigen
print("Modell läuft auf:", model.device)
print("Model Policy:", model.policy.device)

Env läuft auf: cuda
Modell läuft auf: cuda
Model Policy: cuda:0


In [10]:
print(type(model.policy))
print(hasattr(model.policy, 'to'))

<class 'stable_baselines3.common.policies.ActorCriticPolicy'>
True


In [11]:
model.policy.to("cuda")
print("Policy erfolgreich auf GPU gesetzt:", next(model.policy.parameters()).device)

Policy erfolgreich auf GPU gesetzt: cuda:0


In [12]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt

from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy

# Optional: Checkpoint Callback, falls du Zwischenspeicherungen möchtest
# checkpoint_callback = CheckpointCallback(save_freq=100, save_path='./logs/', name_prefix='ppo_trading')

# -------------------------------
# Trainings- und Test-Performance (Loss) evaluieren und plotten
# -------------------------------
# Wir unterteilen das Training in mehrere Intervalle.
eval_interval = 1000          # Trainingsschritte pro Intervall
total_timesteps = 10000       # Gesamtzahl der Trainingsschritte
n_iterations = total_timesteps // eval_interval

# Listen für Plot-Daten
train_loss_list = []  # Wir definieren Loss als negativen Reward (damit "kleiner" besser ist)
test_loss_list = []
timesteps_list = []

In [13]:
import time

# Trainingsschleife in Intervallen
for i in range(1, n_iterations + 1):
    print(f"\n=== Trainingsiteration {i} von {n_iterations} ===")
    print("Modell läuft auf:", model.device)

    # Debug-Print, um den Typ von model.policy zu prüfen
    print(f"Typ von model.policy: {type(model.policy)}")

    # Modell auf GPU setzen, ohne model.policy zu ersetzen
    model.policy.to("cuda")
    # Nur die forward()-Methode kompilieren, nicht das gesamte Policy-Objekt überschreiben
    model.policy.forward = torch.compile(model.policy.forward)

    # Prüfen, ob das Modell wirklich auf CUDA ist (Policy-Parameter verwenden)
    print("Modell auf Gerät:", next(model.policy.parameters()).device)

    # Teste, ob Stable-Baselines3 wirklich Tensoren auf der GPU erstellt
    test_tensor = torch.randn(10, 10).to("cuda")
    print("GPU-Test-Tensor erstellt:", test_tensor.device)

    # Training um 'eval_interval' Timesteps
    print("Training beginnt.")
    start_time = time.time()
    model.learn(total_timesteps=eval_interval, reset_num_timesteps=False)
    end_time = time.time()
    duration = end_time - start_time
    duration_minutes = duration / 60.0
    print(f"Training abgeschlossen. {i * eval_interval} Timesteps haben {duration:.2f} Sekunden ({duration_minutes:.2f} Minuten) gedauert.")

    # Evaluation auf dem Trainings-Environment (mittlere Reward über 5 Episoden)
    with torch.no_grad():
        # Stelle sicher, dass die Policy auf GPU ist
        model.policy.to("cuda")
        mean_train_reward, _ = evaluate_policy(model, env, n_eval_episodes=5, deterministic=True, render=False)

    # Evaluation auf dem Test-Environment
    with torch.no_grad():
        model.policy.to("cuda")
        mean_test_reward, _ = evaluate_policy(model, test_env, n_eval_episodes=5, deterministic=True, render=False)
    
    # Um den "Loss" zu erhalten, verwenden wir den negativen Reward.
    train_loss = -mean_train_reward
    test_loss = -mean_test_reward
    
    train_loss_list.append(train_loss)
    test_loss_list.append(test_loss)
    timesteps_list.append(i * eval_interval)
    
    print(f"Timesteps: {i * eval_interval} | Train Reward: {mean_train_reward:.2f} (Loss: {train_loss:.2f}) | Test Reward: {mean_test_reward:.2f} (Loss: {test_loss:.2f})")


=== Trainingsiteration 1 von 10 ===
Modell läuft auf: cuda
Typ von model.policy: <class 'stable_baselines3.common.policies.ActorCriticPolicy'>
Modell auf Gerät: cuda:0
GPU-Test-Tensor erstellt: cuda:0
Training beginnt.
-------------------------------
| time/              |        |
|    fps             | 1454   |
|    iterations      | 1      |
|    time_elapsed    | 180    |
|    total_timesteps | 262144 |
-------------------------------
Training abgeschlossen. 1000 Timesteps haben 201.35 Sekunden (3.36 Minuten) gedauert.




Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Timesteps: 1000 | Train Reward: 88.88 (Loss: -88.88) | Test Reward: -3.21 (Loss: 3.21)

=== Trainingsiteration 2 von 10 ===
Modell läuft auf: cuda
Typ von model.policy: <class 'stable_baselines3.common.policies.ActorCriticPolicy'>
Modell auf Gerät: cuda:0
GPU-Test-Tensor erstellt: cuda:0
Training beginnt.
-------------------------------
| time/              |        |
|    fps             | 1476   |
|    iterations      | 1      |
|    time_elapsed    | 177    |
|    total_timesteps | 524288 |
-------------------------------
Training abgeschlossen. 2000 Timesteps haben 198.71 Sekunden (3.31 Minuten) gedauert.
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Timesteps: 2000 | Train Reward: 265.01 (Loss: -

Process ForkServerProcess-3:
Process ForkServerProcess-2:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/dhbw/jupyter-env/lib/python3.12/site-packages/stable_baselines3/common/vec_env/subproc_vec_env.py", line 33, in _worker
    cmd, data = remote.recv()
                ^^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/connection.py", line 250, in recv
    buf = self._recv_bytes()
          ^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/connection.py", line 430, in _recv_bytes
    buf = self._recv(4)
          ^^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/connection.py", line 395, in _recv
    chunk = read(handle, remaining)
            ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt
  File "/usr/lib/python3

Seed in the environment: 42
Seed in the environment: 43
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 44
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 46
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environment: 42
Seed in the environm

KeyboardInterrupt: 

In [None]:
# -------------------------------
# Plot: Loss-Kurven (grün: Training, rot: Test)
# -------------------------------
plt.figure(figsize=(10, 6))
plt.plot(timesteps_list, train_loss_list, color='green', label='Train Loss')
plt.plot(timesteps_list, test_loss_list, color='red', label='Test Loss')
plt.xlabel("Timesteps")
plt.ylabel("Loss (negativer Reward)")
plt.title("Train vs. Test Loss Kurven")
plt.legend()
plt.grid()
plt.show()

# -------------------------------
# Trainiertes Modell speichern
# -------------------------------
model.save("ppo_trading_model")

# ==========================================

In [None]:
# -------------------------------
# TradingEnv erstellen (bitte sicherstellen, dass TradingEnv importiert oder im gleichen Skript definiert ist)
# -------------------------------
env = TradingEnv(
    data=train_data,
    initial_cash=10_000,
    window_size=336,
    scaler_path="/home/dhbw/scaler.pkl",
    default_seed=SEED
)

if(env is not None):
    print("Environment created successfully")

In [None]:
# -------------------------------
# PPO-Agenten initialisieren
# -------------------------------
model = PPO(
    "MlpPolicy", 
    env, 
    verbose=1, 
    seed=SEED, 
    device="cuda",
#   tensorboard_log="./tensorboard_log/"
)
print(model.device)  # Sollte "cuda:0" ausgeben, wenn es auf der GPU läuft

# Optional: Checkpoint Callback um den Trainingsfortschritt zwischendurch zu speichern
checkpoint_callback = CheckpointCallback(save_freq=100, save_path='./logs/', name_prefix='ppo_trading')

# -------------------------------
# Training
# -------------------------------
model.learn(
    total_timesteps=100, 
    #callback=checkpoint_callback, 
    log_interval=1
)

# Speichere das trainierte Modell
model.save("ppo_trading_model")

# Backtesting

## Trainigsdaten

In [None]:
# -------------------------------
# Testlauf: Den trainierten Agenten in einer Episode ausführen
# -------------------------------
training_env = env

obs, info = training_env.reset(seed=SEED)
done = False

# Liste der actionen
action_list = []

#while not done:
for i in range(1000):
    # Bestimme die Aktion (deterministisch)
    action, _states = model.predict(obs, deterministic=True)
    action = int(action)  # oder: action = action.item()
    obs, reward, done, truncated, info = training_env.step(action)
    action_list.append(action)

# Hier wird der Zustand gerendert (z.B. als Plot). Du kannst den Render-Modus anpassen.
training_env.render(mode='human')
print(action_list)

## Testdaten

In [None]:
# -------------------------------
# Testlauf: Den trainierten Agenten in einer Episode ausführen
# -------------------------------
test_env = TradingEnv(
    data=test_data,
    initial_cash=10_000,
    window_size=336,
    scaler_path="../../Transform_data/scaler.pkl",
    default_seed=SEED
)

obs, info = test_env.reset(seed=SEED)
done = False

# Liste der actionen
action_list = []

#while not done:
for i in range(100):
    # Bestimme die Aktion (deterministisch)
    action, _states = model.predict(obs, deterministic=True)
    action = int(action)  # oder: action = action.item()
    obs, reward, done, truncated, info = test_env.step(action)
    action_list.append(action)

# Hier wird der Zustand gerendert (z.B. als Plot). Du kannst den Render-Modus anpassen.
test_env.render(mode='human')
print(action_list)

# ======

In [None]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt

from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
#from stable_baselines3.common.callbacks import CheckpointCallback  # Optional, falls benötigt
#from stable_baselines3.common.vec_env import DummyVecEnv


# -------------------------------
# TradingEnv erstellen
# (Stelle sicher, dass TradingEnv bereits importiert oder definiert ist)
# -------------------------------
env = TradingEnv(
    data=train_data,
    initial_cash=10_000,
    window_size=336,
    scaler_path="/home/dhbw/scaler.pkl",
    default_seed=SEED
)

test_env = TradingEnv(
    data=test_data,
    initial_cash=10_000,
    window_size=336,
    scaler_path="/home/dhbw/scaler.pkl",
    default_seed=SEED
)

print("Environments erstellt")

# -------------------------------
# PPO-Agent initialisieren
# -------------------------------
model2 = PPO(
    "MlpPolicy",
    env,
    verbose=1,
    seed=SEED,
    device="cuda",
    #tensorboard_log="./tensorboard_log/"  # Optional: Logging für TensorBoard
)




In [None]:
import torch
print("CUDA verfügbar:", torch.cuda.is_available())
print("Aktuelles Device:", torch.cuda.current_device())
print("Device-Name:", torch.cuda.get_device_name(0))
print("Modell läuft auf:", model.device)

In [None]:
import time

env = make_env()
obs = env.reset()

start_time = time.time()
for _ in range(1000):
    action = env.action_space.sample()
    obs, reward, done, info, _ = env.step(action)
    if done:
        obs = env.reset()
end_time = time.time()

print(f"1000 Schritte dauerten: {end_time - start_time:.2f} Sekunden")