In [260]:
# !pip install gymnasium
# !pip install "gymnasium[atari, accept-rom-license]"
# !apt-get install -y swig
# !pip install gymnasium[box2d]

In [261]:
# ===========================
# IMPORTS & GLOBAL SETUP
# ===========================
import os
import time
import json
import csv
import random
import copy
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import gymnasium as gym
from gymnasium.wrappers import RecordVideo
import multiprocessing as mp
from collections import deque
from datetime import datetime

import torch.nn.functional as F
import torch.autograd as autograd
from torch.autograd import Variable
from collections import deque, namedtuple

# Global seed: stored on each DQNAgent and written into every checkpoint's meta.json.
SEED = 42

In [262]:
# Build the environment once to read the observation/action space dimensions.
env = gym.make('LunarLander-v2')
state_shape = env.observation_space.shape
state_size = state_shape[0]
number_actions = env.action_space.n

# Report the dimensions the network will be built for.
for label, value in (
    ('State shape: ', state_shape),
    ('State size: ', state_size),
    ('Number of actions: ', number_actions),
):
    print(label, value)

State shape:  (8,)
State size:  8
Number of actions:  4


In [263]:
import warnings
import logging

# -----------------------
# SUPPRESS WARNINGS
# -----------------------
for ignored_category in (UserWarning, DeprecationWarning):
    warnings.filterwarnings("ignore", category=ignored_category)

# Only let errors through from the Gym / MoviePy loggers.
for noisy_logger in ("gymnasium", "moviepy"):
    logging.getLogger(noisy_logger).setLevel(logging.ERROR)

# SDL / Pygame headless fix (avoids the XDG_RUNTIME_DIR message).
os.environ.update({
    "SDL_AUDIODRIVER": "dummy",
    "SDL_VIDEODRIVER": "dummy",
})

In [264]:
class QNetwork(nn.Module):
    """
    Configurable MLP used as the Q-function approximator for DQN.

    One ``nn.Linear`` is created per entry of ``layer_sizes``, followed by a
    final ``nn.Linear`` that maps to ``action_dim`` Q-values. A ReLU follows
    every hidden layer except the last one.

    NOTE: with the default ``activate_last_hidden=False`` the last hidden
    layer feeds the output layer without a non-linearity, so those two
    layers compose into a single linear map. This default is kept so that
    the module layout (and therefore existing checkpoints) stays unchanged;
    pass ``activate_last_hidden=True`` to insert the missing ReLU.

    Args:
        state_dim: Size of the observation vector (input features).
        action_dim: Number of discrete actions (output features).
        layer_sizes: Iterable of hidden-layer widths. May be empty, in which
            case the network is a single linear layer.
        activate_last_hidden: If True, also apply ReLU after the last hidden
            layer. Changes the indices of the modules inside ``self.model``.
    """

    def __init__(self, state_dim, action_dim, layer_sizes=(64, 8, 64, 64),
                 activate_last_hidden=False):
        super().__init__()
        # Re-seeds torch's *global* RNG on every construction, so any two
        # networks with the same layout start from identical weights.
        torch.manual_seed(42)

        # Copy into a list: avoids sharing a mutable default between
        # instances and accepts any iterable (list, tuple, generator).
        hidden_sizes = list(layer_sizes)
        last_hidden_index = len(hidden_sizes) - 1

        layers = []
        input_size = state_dim
        for i, size in enumerate(hidden_sizes):
            layers.append(nn.Linear(input_size, size))
            if i != last_hidden_index or activate_last_hidden:
                layers.append(nn.ReLU())
            input_size = size

        # Output layer: raw Q-values, no activation.
        layers.append(nn.Linear(input_size, action_dim))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the Q-values for the batch of states ``x``."""
        return self.model(x)

In [265]:
class ReplayBuffer:
    """
    Fixed-capacity store of past transitions
    ``(state, action, reward, next_state, done)``.

    Why replay?
    - sampling at random breaks the temporal correlation between steps
    - old experiences can be learned from more than once

    ``memory`` is a plain list used as a ring buffer: once ``capacity``
    entries are stored, each new experience overwrites the oldest one in
    O(1). The list is therefore not in chronological order after the first
    wrap-around; sampling is uniform, so this does not matter for training.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        # Slot that holds the oldest experience once the buffer is full.
        self._write_index = 0
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def __len__(self):
        """Number of experiences currently stored."""
        return len(self.memory)

    def add(self, experience):
        """
        Store one experience; when the buffer is full the oldest one is
        overwritten.
        """
        if self.capacity <= 0:
            # A buffer without capacity never retains anything.
            return
        if len(self.memory) < self.capacity:
            self.memory.append(experience)
        else:
            self.memory[self._write_index] = experience
            self._write_index = (self._write_index + 1) % len(self.memory)

    def _column(self, batch, index, dtype):
        """Stack field ``index`` of every experience into a 2-D tensor on ``self.device``."""
        stacked = np.vstack([experience[index] for experience in batch])
        return torch.tensor(stacked, dtype=dtype).to(self.device)

    def sample(self, batch_size):
        """
        Draw ``batch_size`` distinct experiences uniformly at random.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of
            tensors on ``self.device``; ``actions`` is ``torch.long``, all
            others are ``torch.float32``.

        Raises:
            ValueError: if ``batch_size`` exceeds the number of stored
                experiences (raised by ``random.sample``).
        """
        batch = random.sample(self.memory, batch_size)

        states      = self._column(batch, 0, torch.float32)
        actions     = self._column(batch, 1, torch.long)
        rewards     = self._column(batch, 2, torch.float32)
        next_states = self._column(batch, 3, torch.float32)
        dones       = self._column(batch, 4, torch.float32)

        return states, actions, rewards, next_states, dones


In [266]:
class DQNAgent:
    """
    DQN agent with an online and a target ``QNetwork`` of configurable
    architecture, an Adam optimizer and a uniform replay buffer.

    Args:
        state_dim: Size of the observation vector.
        action_dim: Number of discrete actions.
        learning_rate: Adam learning rate for the online network.
        buffer_size: Capacity of the replay buffer.
        batch_size: Number of transitions sampled per learning update.
        gamma: Discount factor.
        tau: Soft-update interpolation factor for the target network.
        layer_info: Hidden-layer widths; defaults to ``[64, 64, 64]``.
        update_every: Run one learning update every this many calls to
            ``step`` (default 4).

    Raises:
        ValueError: if ``update_every`` is smaller than 1.
    """

    def __init__(
        self,
        state_dim,
        action_dim,
        learning_rate,
        buffer_size,
        batch_size,
        gamma,
        tau,
        layer_info=None,
        update_every=4,
    ):
        if update_every < 1:
            raise ValueError(f"update_every must be >= 1, got {update_every}")

        self.state_dim = state_dim
        self.action_dim = action_dim
        self.batch_size = batch_size
        self.gamma = gamma          # discount factor
        self.tau = tau              # soft-update factor
        self.update_every = update_every
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.seed = SEED

        # Default architecture when none is given.
        if layer_info is None:
            layer_info = [64, 64, 64]

        # Online network (the one being optimised).
        self.q_network = QNetwork(state_dim, action_dim, layer_info).to(self.device)

        # Target network, synchronised with the online network at start.
        self.target_q_network = QNetwork(state_dim, action_dim, layer_info).to(self.device)
        self.target_q_network.load_state_dict(self.q_network.state_dict())

        self.optimizer = torch.optim.Adam(self.q_network.parameters(), lr=learning_rate)

        self.replay_buffer = ReplayBuffer(buffer_size)

        self.step_counter = 0

    def select_action(self, state, epsilon):
        """
        Epsilon-greedy action selection.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest Q-value of the online network.
        """
        # Decide first: a random action does not need a forward pass.
        if random.random() <= epsilon:
            return random.randrange(self.action_dim)

        state_t = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(self.device)

        self.q_network.eval()
        with torch.no_grad():
            q_values = self.q_network(state_t)
        self.q_network.train()

        return q_values.argmax(dim=1).item()

    def step(self, state, action, reward, next_state, done):
        """
        Store one transition and, every ``update_every`` calls, run a
        learning update if the buffer holds at least ``batch_size`` items.
        """
        self.replay_buffer.add((state, action, reward, next_state, done))
        self.step_counter += 1

        # Learning on every single step is avoided for stability.
        if self.step_counter % self.update_every != 0:
            return
        if len(self.replay_buffer.memory) >= self.batch_size:
            batch = self.replay_buffer.sample(self.batch_size)
            self.learn(batch)

    def learn(self, experiences):
        """
        One gradient step on the MSE between the online Q-values of the
        taken actions and the bootstrapped targets
        ``reward + gamma * max_a Q_target(next_state, a) * (1 - done)``,
        followed by a soft update of the target network.
        """
        states, actions, rewards, next_states, dones = experiences

        # Targets come from the target network and carry no gradient.
        with torch.no_grad():
            next_q_values = self.target_q_network(next_states).max(dim=1, keepdim=True)[0]
            q_targets = rewards + self.gamma * next_q_values * (1 - dones)

        q_expected = self.q_network(states).gather(1, actions)

        loss = F.mse_loss(q_expected, q_targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update()

    def soft_update(self):
        """Move each target parameter towards its online counterpart: ``tau * online + (1 - tau) * target``."""
        with torch.no_grad():
            for target_param, local_param in zip(
                self.target_q_network.parameters(),
                self.q_network.parameters()
            ):
                target_param.copy_(
                    self.tau * local_param +
                    (1.0 - self.tau) * target_param
                )


In [274]:
def get_layer_info(agent_or_model):
    """
    Return the widths of the hidden Linear layers, e.g. ``[64, 64]``.

    Args:
        agent_or_model: Either an object exposing a ``q_network`` attribute
            (its network is inspected) or the network module itself.

    Returns:
        List of ``out_features`` of every ``nn.Linear`` found by
        ``modules()``, in traversal order, excluding the last one (the
        output layer). Empty if there are fewer than two Linear layers.
    """
    model = getattr(agent_or_model, "q_network", agent_or_model)

    linear_widths = [
        layer.out_features
        for layer in model.modules()
        if isinstance(layer, nn.Linear)
    ]
    # Drop the output layer by position, not by width: a hidden layer may
    # legitimately be as wide as the action space.
    return linear_widths[:-1]

# -------------------------------
# save_model_bundle / load_model_bundle: persist and rebuild the Linear+ReLU model
# -------------------------------
def save_model_bundle(model, save_dir, state_dim, action_dim, fitness=None, generation=None):
    """
    Write a checkpoint directory containing the weights and a metadata file.

    Creates ``save_dir`` if needed, then writes:
    - ``model.pth``: ``model.state_dict()``
    - ``meta.json``: dimensions, hidden-layer widths, fitness, generation,
      the global ``SEED`` and a timestamp

    Args:
        model: Network to save; its hidden-layer widths are read with
            ``get_layer_info`` so the metadata always matches the weights.
        save_dir: Target directory.
        state_dim: Input size, stored in the metadata.
        action_dim: Output size, stored in the metadata.
        fitness: Optional score stored as float.
        generation: Optional counter stored as int.
    """
    os.makedirs(save_dir, exist_ok=True)

    # 1) weights
    torch.save(model.state_dict(), os.path.join(save_dir, "model.pth"))

    # 2) metadata — the architecture is derived from the model being saved,
    #    not from a notebook-level variable that may describe another network.
    meta = {
        "state_dim": int(state_dim),
        "action_dim": int(action_dim),
        "layer_info": [int(size) for size in get_layer_info(model)],
        "fitness": float(fitness) if fitness is not None else None,
        "generation": int(generation) if generation is not None else None,
        "seed": int(SEED) if SEED is not None else None,
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    }

    with open(os.path.join(save_dir, "meta.json"), "w") as f:
        json.dump(meta, f, indent=4)
    print(f"‚úì Model saved to {save_dir}")

def load_model_bundle(model_dir, device="cpu"):
    """
    Rebuild a ``QNetwork`` from a checkpoint directory.

    Reads ``meta.json`` for ``state_dim``, ``action_dim`` and ``layer_info``,
    constructs the network, then copies the tensors from ``model.pth`` into
    it by position (i-th saved tensor -> i-th model parameter), so
    checkpoints whose key names differ can still be loaded.

    Args:
        model_dir: Directory containing ``meta.json`` and ``model.pth``.
        device: Device the model and the loaded tensors are mapped to.

    Returns:
        Tuple ``(model, meta)``.

    Raises:
        RuntimeError: if the checkpoint and the rebuilt model differ in the
            number of tensors or in the shape of any tensor.
    """
    with open(os.path.join(model_dir, "meta.json"), "r") as f:
        meta = json.load(f)
    state_dim = meta["state_dim"]
    action_dim = meta["action_dim"]
    layer_info = meta["layer_info"]

    model = QNetwork(state_dim, action_dim, layer_info).to(device)

    # NOTE(security): torch.load unpickles the file; only load checkpoints
    # from trusted sources (or pass weights_only=True where supported).
    state_dict_saved = torch.load(os.path.join(model_dir, "model.pth"), map_location=device)
    state_dict_model = model.state_dict()

    # zip() would silently stop at the shorter dict and leave the remaining
    # parameters at their initial values, so check the counts first.
    if len(state_dict_saved) != len(state_dict_model):
        raise RuntimeError(
            f"Checkpoint in '{model_dir}' has {len(state_dict_saved)} tensors, "
            f"but the rebuilt model expects {len(state_dict_model)}"
        )

    for k_model, k_saved in zip(list(state_dict_model.keys()), state_dict_saved.keys()):
        saved_tensor = state_dict_saved[k_saved]
        if tuple(saved_tensor.shape) != tuple(state_dict_model[k_model].shape):
            raise RuntimeError(
                f"Shape mismatch for '{k_model}' (checkpoint key '{k_saved}'): "
                f"expected {tuple(state_dict_model[k_model].shape)}, "
                f"got {tuple(saved_tensor.shape)}"
            )
        state_dict_model[k_model] = saved_tensor
    model.load_state_dict(state_dict_model)

    return model, meta

def append_generation_log(
    csv_path,
    env_name,
    generation,
    hidden_sizes,
    fitness,
    episodes_per_individual,
):
    """
    Append one ``;``-separated row to the CSV log at ``csv_path``.

    Columns written, in order: environment name, generation, the string
    form of ``hidden_sizes``, ``fitness`` with two decimals, and
    ``generation * episodes_per_individual``.
    """
    with open(csv_path, mode="a", newline="", encoding="utf-8") as log_file:
        row = [
            env_name,
            generation,
            str(hidden_sizes),
            f"{fitness:.2f}",
            generation * episodes_per_individual,
        ]
        csv.writer(log_file, delimiter=";").writerow(row)


In [268]:
# DQN hyperparameters
learning_rate = 0.0005   # Adam step size
buffer_size = 100_000    # replay buffer capacity
batch_size = 100         # transitions per learning update
gamma = 0.99             # discount factor
tau = 0.001              # soft-update factor for the target network

In [269]:
# Hidden-layer widths of the Q-network.
layer_info = [64, 8, 64, 64]

# Build the agent from the dimensions probed from the environment and the
# hyperparameters defined in the cell above, so that changing either
# actually changes the agent (previously every value was repeated here as
# a literal).
agent = DQNAgent(
    state_dim=int(state_size),
    action_dim=int(number_actions),
    learning_rate=learning_rate,
    buffer_size=buffer_size,
    batch_size=batch_size,
    gamma=gamma,
    tau=tau,
    layer_info=layer_info
)

In [270]:
# Sanity check: hidden-layer widths reported for the freshly built agent.
get_layer_info(agent)

[64, 8, 64, 64]

In [271]:
# Display the module structure of the agent's online Q-network.
agent.q_network

QNetwork(
  (model): Sequential(
    (0): Linear(in_features=8, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=8, bias=True)
    (3): ReLU()
    (4): Linear(in_features=8, out_features=64, bias=True)
    (5): ReLU()
    (6): Linear(in_features=64, out_features=64, bias=True)
    (7): Linear(in_features=64, out_features=4, bias=True)
  )
)

In [279]:
# -----------------------------
# Training setup
# -----------------------------
# Episode budget and per-episode step cap.
num_episodes, max_steps_per_episode = 2000, 1000

# Epsilon-greedy schedule: multiplicative decay from start down to end.
epsilon_start, epsilon_end, epsilon_decay = 1.0, 0.01, 0.995
epsilon = epsilon_start

# Rolling window over the most recent 100 episode returns.
scores_window = deque(maxlen=100)

env_name = "LunarLander-v2"
env = gym.make(env_name)

# Mean window return at which training stops.
goal = 200.0

In [280]:
# -----------------------------
# Reproducibility
# -----------------------------
# Epsilon-greedy exploration and replay sampling both draw from Python's
# `random` module, so seed it (and NumPy) before training starts.
random.seed(SEED)
np.random.seed(SEED)

# -----------------------------
# Run directory + log initialisation
# -----------------------------
# NOTE(review): init_training_run / init_training_log are not defined in any
# visible cell of this notebook; they must already exist in the kernel for
# this cell to run — confirm and restore their definitions.
run_dir = init_training_run(env_name)
log_csv = init_training_log(run_dir)

# -----------------------------
# Training loop
# -----------------------------
try:
    for episode in range(1, num_episodes + 1):
        reset_out = env.reset(seed=episode)
        state = reset_out[0] if isinstance(reset_out, tuple) else reset_out
        episode_reward = 0

        for t in range(max_steps_per_episode):
            # Choose an action (epsilon-greedy).
            action = agent.select_action(state, epsilon)

            # Advance the environment by one step.
            step_out = env.step(action)
            if len(step_out) == 5:
                next_state, reward, terminated, truncated, _ = step_out
            else:
                # Legacy 4-tuple API: termination and truncation are merged.
                next_state, reward, terminated, _ = step_out
                truncated = False
            done = terminated or truncated

            # Store the transition and possibly learn. Only a true terminal
            # state may cut off bootstrapping; an episode that merely hit the
            # time limit still has a meaningful next-state value.
            agent.step(state, action, reward, next_state, terminated)
            state = next_state
            episode_reward += reward

            if done:
                break

        # -----------------------------
        # Statistics & epsilon decay
        # -----------------------------
        scores_window.append(episode_reward)
        epsilon = max(epsilon_end, epsilon_decay * epsilon)

        # The rolling mean is used several times below; compute it once.
        avg_score = np.mean(scores_window)
        solved = avg_score >= goal

        # Live output (overwrites the same console line).
        print(
            f"\rEpisode {episode}\t"
            f"Average Score: {avg_score:.2f}",
            end=""
        )

        if episode % 100 == 0:
            print(
                f"\rEpisode {episode}\t"
                f"Average Score: {avg_score:.2f}"
            )

        # -----------------------------
        # Save checkpoint + update log
        # -----------------------------
        if episode % 100 == 0 or solved:
            save_dir = os.path.join(run_dir, f"checkpoint_episode_{episode}")
            save_model_bundle(
                model=agent.q_network,
                save_dir=save_dir,
                state_dim=env.observation_space.shape[0],
                action_dim=env.action_space.n,
                fitness=avg_score,
                generation=episode // 100
            )

            # Hidden-layer widths as a list of ints, e.g. [64, 8, 64, 64].
            hidden_layer_info = get_layer_info(agent)
            # NOTE(review): append_generation_log writes
            # generation * episodes_per_individual as its last column; with
            # these arguments that is (episode // 100) * episode, which is
            # probably not the intended episode count — confirm.
            append_generation_log(
                csv_path=log_csv,
                env_name=env_name,
                generation=episode // 100,
                hidden_sizes=hidden_layer_info,
                fitness=avg_score,
                episodes_per_individual=episode
            )

        # Stop once the environment counts as solved.
        if solved:
            print(
                f"\nüéâ Environment solved in {episode - 100} episodes!"
                f"\tAverage Score: {avg_score:.2f}"
            )
            break
finally:
    # Release the environment even if training is interrupted.
    env.close()

üìÅ Neuer Trainingslauf: training_history/LunarLander-v2_2026-01-05_19-41-35
üìä Training-Log erstellt: training_history/LunarLander-v2_2026-01-05_19-41-35/training_log.csv
Episode 100	Average Score: -122.90
‚úì Model saved to training_history/LunarLander-v2_2026-01-05_19-41-35/checkpoint_episode_100
Episode 200	Average Score: -49.726
‚úì Model saved to training_history/LunarLander-v2_2026-01-05_19-41-35/checkpoint_episode_200
Episode 300	Average Score: -48.41
‚úì Model saved to training_history/LunarLander-v2_2026-01-05_19-41-35/checkpoint_episode_300
Episode 400	Average Score: 51.422
‚úì Model saved to training_history/LunarLander-v2_2026-01-05_19-41-35/checkpoint_episode_400
Episode 500	Average Score: 124.06
‚úì Model saved to training_history/LunarLander-v2_2026-01-05_19-41-35/checkpoint_episode_500
Episode 600	Average Score: 144.87
‚úì Model saved to training_history/LunarLander-v2_2026-01-05_19-41-35/checkpoint_episode_600
Episode 700	Average Score: 132.07
‚úì Model saved to tr

In [281]:
def run_trained_model_and_record(
    model_dir,
    env_name="CartPole-v1",
    video_dir="videos",
    max_steps=500,
    device="cpu"
):
    """
    Play one greedy episode with a saved model and record it as a video.

    Uses the notebook-level Gymnasium import (``gym`` / ``RecordVideo``),
    i.e. the same library the model was trained with, instead of importing
    the legacy ``gym`` package locally.

    Args:
        model_dir: Checkpoint directory understood by ``load_model_bundle``.
        env_name: Environment id passed to ``gym.make``.
        video_dir: Folder the video files are written to (created if missing).
        max_steps: Upper bound on the number of environment steps.
        device: Device the model is loaded to and evaluated on.
    """
    os.makedirs(video_dir, exist_ok=True)

    # --------
    # Load model + metadata
    # --------
    model, meta = load_model_bundle(model_dir, device=device)
    model.eval()
    print(f"Lade Modell aus '{model_dir}' mit Layer-Info: {meta['layer_info']}")

    # --------
    # Prepare env + video wrapper
    # --------
    env = gym.make(env_name, render_mode="rgb_array")
    env = RecordVideo(
        env,
        video_folder=video_dir,
        episode_trigger=lambda ep: True,  # record every episode
        name_prefix="episode"
    )

    total_reward = 0
    # Counted explicitly so the summary also works when max_steps is 0.
    steps_taken = 0

    # --------
    # Run the episode
    # --------
    try:
        reset_out = env.reset(seed=42)
        state = reset_out[0] if isinstance(reset_out, tuple) else reset_out

        for _ in range(max_steps):
            state_t = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)

            with torch.no_grad():
                q_values = model(state_t)
                action = torch.argmax(q_values, dim=-1).item()

            step_out = env.step(action)
            if len(step_out) == 5:
                state, reward, terminated, truncated, _ = step_out
                done = terminated or truncated
            else:
                state, reward, done, _ = step_out

            steps_taken += 1
            total_reward += reward
            if done:
                break
    finally:
        # Close the environment even if the episode fails part-way.
        env.close()

    print(f"Episode L√§nge: {steps_taken}")
    print(f"Return: {total_reward}")
    print(f"Video gespeichert in: {video_dir}")


In [282]:
# Checkpoint to replay (hard-coded path of a specific earlier training run):
model_dir = "training_history/LunarLander-v2_2026-01-05_19-41-35/checkpoint_episode_902"

# Run one episode with the loaded model and save the video
run_trained_model_and_record(
    model_dir=model_dir,
    env_name=env_name,                 # from the training setup cell
    video_dir="videos",                # output folder for the video
    max_steps=max_steps_per_episode,   # from the training setup cell
    device="cpu"                       # run on CPU
)


Lade Modell aus 'training_history/LunarLander-v2_2026-01-05_19-41-35/checkpoint_episode_902' mit Layer-Info: [64, 8, 64, 64]
Episode L√§nge: 404
Return: 229.35854514061919
Video gespeichert in: videos


In [None]:
# !rm -rf checkpoints/
# !rm -rf training_history/
# !rm -rf videos/

In [None]:
# Reload the checkpoint at model_dir (set in the playback cell) onto the CPU.
model, meta = load_model_bundle(model_dir, device="cpu")

In [None]:
# Display the agent's online Q-network again for comparison.
agent.q_network

In [None]:
# Fresh, untrained network with the same layer layout as the agent's network.
model1 = QNetwork(8, 4, [64, 8, 64, 64]).to("cpu")

In [None]:
# Display the structure of the freshly built network.
model1

In [None]:
# Load the weights of a checkpoint directly into model1 via load_state_dict.
# NOTE(review): this hard-coded path points to a different run directory
# than model_dir above — confirm it still exists.
state_dict_saved = torch.load(os.path.join("training_history/LunarLander-v2_2026-01-05_18-58-20/checkpoint_episode_168", "model.pth"), map_location="cpu")
model1.load_state_dict(state_dict_saved)