# Reinforcement Learning with Imitation Learning and Self-Play

This notebook walks through the complete workflow for training a Lunar Lander agent, covering:
1. Expert DQN training
2. Expert trajectory generation
3. Behavioral Cloning
4. DAgger (Dataset Aggregation)
5. Self-Play enhancement
6. Comprehensive evaluation

We'll be using the LunarLander-v3 environment from Gymnasium.

## 1. Setup and Installation

In [None]:
import zipfile
import os

# Uploaded archive and the directory its contents are unpacked into.
zip_path = "/content/content.zip"
extract_path = "/"

# ZipFile opens in read mode by default; the context manager closes the
# archive once extraction has finished.
with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(path=extract_path)

In [None]:
!apt-get install -y swig

!git clone https://github.com/openai/box2d-py
%cd box2d-py
!pip install -e .

%cd ..
!pip install gymnasium[box2d] --no-deps

In [None]:
!pip install torch matplotlib numpy tqdm

In [None]:
import os
import torch
import numpy as np
import gymnasium as gym
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
import pickle
from collections import namedtuple, deque
from itertools import count
from tqdm.notebook import tqdm

# Run on the GPU when one is visible to PyTorch, otherwise stay on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

# Directory that receives every saved checkpoint; no error if it already exists.
os.makedirs("models", exist_ok=True)

## 2. Define DQN Architecture

First, let's define the neural network architecture for our DQN agent.

In [None]:
class DQN(nn.Module):
    """Fully connected network mapping an observation to one score per action.

    Two hidden layers of 128 units with ReLU activations are followed by a
    linear output layer of size ``n_actions``.
    """

    def __init__(self, n_observations, n_actions):
        """Create the three linear layers.

        Args:
            n_observations: Size of the observation vector fed to the network.
            n_actions: Number of outputs (one per discrete action).
        """
        super().__init__()
        hidden_units = 128
        # Attribute names are part of the checkpoint format (state_dict keys).
        self.layer1 = nn.Linear(n_observations, hidden_units)
        self.layer2 = nn.Linear(hidden_units, hidden_units)
        self.layer3 = nn.Linear(hidden_units, n_actions)

    def forward(self, x):
        """Return the un-normalised per-action outputs for the batch ``x``."""
        hidden = torch.relu(self.layer1(x))
        hidden = torch.relu(self.layer2(hidden))
        return self.layer3(hidden)

## 3. DQN Expert Training

Here we'll train our expert DQN model on the LunarLander environment.

In [None]:
# Hyper-parameters for training the expert DQN
num_episodes = 800  # number of training episodes
batch_size = 128  # transitions drawn from the replay memory per optimisation step
GAMMA = 0.99  # discount factor applied to the bootstrapped next-state value
LR = 1e-4  # learning rate handed to the optimizer
TAU = 0.005  # interpolation factor of the soft target-network update

EPSILON = 1.0  # start with full exploration
EPSILON_MIN = 0.01  # minimum value
EPSILON_DECAY = 0.995  # decay factor per episode

# Per-episode statistics, appended to by the training loop
reward_list = []
episode_durations = []

# transition namedtuple to store the trajectories
Transition = namedtuple("Transition", ["state", "action", "next_state", "reward", "done"])

# Replay Memory Buffer
class ReplayMemory:
    """Fixed-capacity FIFO store of ``Transition`` tuples.

    Once ``capacity`` transitions are held, pushing a new one silently drops
    the oldest (``deque`` with ``maxlen``).
    """

    def __init__(self, capacity):
        """Create an empty buffer holding at most ``capacity`` transitions."""
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        """Append one transition; ``args`` are the ``Transition`` fields in order."""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        """Return a list of stored transitions.

        When more than ``batch_size`` transitions are stored, ``batch_size``
        of them are drawn uniformly without replacement. Otherwise every
        stored transition is returned, oldest first.

        The result is always a new list, never the internal deque, so callers
        get one consistent type and cannot mutate the buffer through it.
        """
        if batch_size < len(self.memory):
            return random.sample(self.memory, batch_size)
        return list(self.memory)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.memory)

In [None]:
# initialize the environment
env = gym.make("LunarLander-v3")

# Network input/output sizes are read from the environment's spaces.
n_observations = env.observation_space.shape[0]
n_actions = env.action_space.n

# The target network starts out as an exact copy of the policy network.
policy_net = DQN(n_observations, n_actions).to(device)
target_net = DQN(n_observations, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())

# Only the policy network is optimised; the loss is the Huber (smooth L1) loss.
optimizer = optim.AdamW(policy_net.parameters(), lr=LR)
criterion = nn.SmoothL1Loss()

replay_memory = ReplayMemory(capacity=10000)


def select_action(state, epsilon):
    """Choose an action epsilon-greedily.

    Args:
        state: Batched observation tensor passed to ``policy_net``.
        epsilon: Probability of taking a uniformly sampled action.

    Returns:
        A ``(1, 1)`` long tensor holding the chosen action index.
    """
    explore = random.random() < epsilon
    if not explore:
        # Exploit: action with the highest predicted value.
        with torch.no_grad():
            greedy_action = policy_net(state).max(1).indices
        return greedy_action.view(1, 1)

    # Explore: sample from the environment's action space.
    sampled_action = env.action_space.sample()
    return torch.tensor([[sampled_action]], dtype=torch.long, device=device)

In [None]:
# training the expert DQN model
#
# Each environment step: act epsilon-greedily, store the transition, run one
# optimisation step on a sampled mini-batch (once enough transitions exist),
# then soft-update the target network. Epsilon decays once per episode.
print("Training Expert DQN...")

for episode in tqdm(range(num_episodes), desc="Training Episodes"):
    state, info = env.reset()
    state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
    total_reward = 0

    for t in count():
        action = select_action(state, EPSILON)
        next_state, reward, terminated, truncated, info = env.step(action.item())

        # The episode loop stops on either signal ...
        done = terminated or truncated
        # Pin the dtype so the reward batch matches the float32 network output
        # regardless of the scalar type the environment returns.
        reward = torch.tensor([reward], dtype=torch.float32, device=device)
        next_state = torch.tensor(
            next_state, dtype=torch.float32, device=device
        ).unsqueeze(0)
        # ... but only a true termination is stored in the `done` field. A
        # truncated episode did not reach a terminal state, so its last
        # transition must still bootstrap from the next state's value.
        replay_memory.push(state, action, next_state, reward, terminated)

        state = next_state
        total_reward += reward.item()

        if len(replay_memory) >= batch_size:
            transitions = replay_memory.sample(batch_size)
            states, actions, next_states, rewards, dones = zip(*transitions)

            states_batch = torch.cat(states)
            next_states_batch = torch.cat(next_states)
            actions_batch = torch.cat(actions)
            rewards = torch.cat(rewards)
            dones = torch.tensor(dones, dtype=torch.bool, device=device)

            # TD target: r + GAMMA * max_a' Q_target(s', a'), with the
            # bootstrap term zeroed for terminal transitions. No gradient
            # flows through the target network.
            with torch.no_grad():
                next_q = target_net(next_states_batch).max(1)[0]
            q_target = GAMMA * next_q * ~dones + rewards
            q_policy = policy_net(states_batch).gather(1, actions_batch)

            # calculating the Huber loss
            loss = criterion(q_policy, q_target.unsqueeze(1))

            # optimize the model
            optimizer.zero_grad()
            loss.backward()

            # in-place gradient clipping to stabilize training
            torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)

            optimizer.step()

        # Soft update of the target network:
        # target <- TAU * policy + (1 - TAU) * target
        for target_param, main_param in zip(
            target_net.parameters(), policy_net.parameters()
        ):
            target_param.data.copy_(
                TAU * main_param.data + (1 - TAU) * target_param.data
            )

        if done:
            episode_durations.append(t + 1)
            reward_list.append(total_reward)
            break

    # Decay epsilon
    EPSILON = max(EPSILON_MIN, EPSILON * EPSILON_DECAY)

    if episode % 10 == 0:
        avg_reward = np.mean(reward_list[-10:]) if reward_list else 0
        print(f"Episode {episode}/{num_episodes}, Avg Reward: {avg_reward:.2f}, Epsilon: {EPSILON:.2f}")

torch.save(policy_net.state_dict(), "models/dqn_lunar_lander.pth")
print("Expert DQN model saved successfully!")

1. Reward Trend per Episode

In [None]:
# Raw (unsmoothed) total reward of every training episode.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(reward_list, label="Episode Reward")
ax.set_xlabel("Episode")
ax.set_ylabel("Reward")
ax.set_title("DQN: Episode Reward over Time")
ax.grid(True)
ax.legend()
plt.show()

2. Smoothed Reward Curve (Moving Average)

In [None]:
def moving_average(data, window_size=10):
    """Return the simple moving average of ``data``.

    Each output value is the mean of ``window_size`` consecutive inputs. The
    convolution runs in ``'valid'`` mode, so only fully overlapping windows
    are kept.
    """
    # Uniform weights that sum to one.
    weights = np.repeat(1.0, window_size) / window_size
    return np.convolve(data, weights, mode='valid')

# Training rewards smoothed with the default 10-episode moving average.
smoothed_rewards = moving_average(reward_list)

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(smoothed_rewards, label="Smoothed Reward (10-episode MA)", color="orange")
ax.set_xlabel("Episode")
ax.set_ylabel("Reward")
ax.set_title("DQN: Smoothed Episode Rewards")
ax.grid(True)
ax.legend()
plt.show()

3. Epsilon Decay Over Episodes

In [None]:
# Rebuild the exploration schedule that was used during training.
# The training loop decays the global EPSILON in place, so by the time this
# cell runs EPSILON holds the final value. The curve therefore has to start
# from the value EPSILON was initialised with before training, not from the
# current global.
initial_epsilon = 1.0
epsilons = [
    max(EPSILON_MIN, initial_epsilon * (EPSILON_DECAY ** i))
    for i in range(num_episodes)
]

plt.figure(figsize=(10, 5))
plt.plot(epsilons, label="Epsilon Value")
plt.xlabel("Episode")
plt.ylabel("Epsilon")
plt.title("Epsilon Decay Over Time")
plt.grid(True)
plt.legend()
plt.show()

## 4. Generate Expert Trajectories

Let's generate and save expert trajectories using our trained DQN model.

In [None]:
# Roll out the trained expert greedily, recording videos of the first episodes
# and saving every (observation, action, reward) step to disk.
video_folder = "./dqn_expert_videos/"
os.makedirs(video_folder, exist_ok=True)

# Reload the expert from its checkpoint and switch it to inference mode.
expert_policy = DQN(n_observations, n_actions).to(device)
expert_weights = torch.load("models/dqn_lunar_lander.pth", map_location=device)
expert_policy.load_state_dict(expert_weights)
expert_policy.eval()

# Environment wrapped for video recording (needs rgb_array rendering).
env = gym.wrappers.RecordVideo(
    gym.make("LunarLander-v3", render_mode="rgb_array"),
    video_folder,
    episode_trigger=lambda ep: ep < 2,  # only the first 2 episodes are saved
    name_prefix="dqn_expert"
)

# One dict per episode with parallel "obs" / "actions" / "rewards" lists.
expert_trajectories = []
n_episodes = 30

print("Generating expert trajectories...")
for episode in range(n_episodes):
    state, _ = env.reset()
    trajectory = {"obs": [], "actions": [], "rewards": []}
    total_reward = 0
    done = False

    while not done:
        state_tensor = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
        with torch.no_grad():
            action = expert_policy(state_tensor).argmax(dim=1).item()

        next_state, reward, terminated, truncated, _ = env.step(action)

        # Record the observation the action was chosen from, not the successor.
        trajectory["obs"].append(state)
        trajectory["actions"].append(action)
        trajectory["rewards"].append(reward)

        total_reward += reward
        done = terminated or truncated
        state = next_state

    expert_trajectories.append(trajectory)
    print(f"Episode {episode + 1}/{n_episodes} | Total reward: {total_reward:.2f}")

env.close()
print(f"Saved videos to: {video_folder}")

with open('expert_trajectories.pkl', 'wb') as f:
    pickle.dump(expert_trajectories, f)
print("Expert trajectories saved!")

1. Total Reward per Expert Episode

In [None]:
# Total reward of each recorded expert episode.
expert_rewards = [sum(episode["rewards"]) for episode in expert_trajectories]
episode_numbers = range(1, len(expert_rewards) + 1)

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(episode_numbers, expert_rewards, color='skyblue')
ax.set_xlabel("Episode")
ax.set_ylabel("Total Reward")
ax.set_title("Total Reward per Expert Episode")
ax.set_xticks(episode_numbers)
ax.grid(axis='y')
plt.show()

2. Action Distribution Over Expert Trajectories

In [None]:
# Flatten every expert action into one list, then count occurrences per action.
all_actions = []
for traj in expert_trajectories:
    all_actions.extend(traj["actions"])
action_counts = [all_actions.count(a) for a in range(n_actions)]

action_ids = range(n_actions)
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(action_ids, action_counts, color='lightgreen')
ax.set_xlabel("Action")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of Actions Taken by Expert")
ax.set_xticks(action_ids)
ax.grid(axis='y')
plt.show()

3. Cumulative Reward Curve for All Episodes

In [None]:
# One curve per expert episode: running sum of the per-step rewards.
fig, ax = plt.subplots(figsize=(10, 6))
for episode_number, traj in enumerate(expert_trajectories, start=1):
    ax.plot(np.cumsum(traj["rewards"]), label=f"Episode {episode_number}")

ax.set_xlabel("Timestep")
ax.set_ylabel("Cumulative Reward")
ax.set_title("Cumulative Reward over Time per Expert Episode")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

4. Reward Density Distribution

In [None]:
# Histogram of every single-step reward across all expert episodes.
all_rewards = []
for traj in expert_trajectories:
    all_rewards.extend(traj["rewards"])

fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(all_rewards, bins=30, color="coral", edgecolor="black", alpha=0.75)
ax.set_xlabel("Reward")
ax.set_ylabel("Frequency")
ax.set_title("Reward Distribution Across All Expert Steps")
ax.grid(True)
fig.tight_layout()
plt.show()

## 5. Behavioral Cloning

Train a student model to mimic the expert's behavior.

In [None]:
# Behavioral cloning: fit a fresh network to the expert's (observation, action)
# pairs with a cross-entropy classification loss.
with open('expert_trajectories.pkl', 'rb') as f:
    expert_trajectories = pickle.load(f)

# Mean loss of every epoch, consumed by the loss-curve plot.
loss_history = []

# The environment is only needed here for the observation/action dimensions.
env = gym.make("LunarLander-v3")
n_observations = env.observation_space.shape[0]
n_actions = env.action_space.n

# Student: same architecture as the expert DQN, freshly initialised.
student_model = DQN(n_observations, n_actions).to(device)
optimizer = optim.Adam(student_model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Flatten all trajectories into one supervised dataset.
states = [obs for traj in expert_trajectories for obs in traj['obs']]
actions = [act for traj in expert_trajectories for act in traj['actions']]

states = torch.tensor(np.array(states), dtype=torch.float32).to(device)
actions = torch.tensor(actions, dtype=torch.long).to(device)

epochs = 200
batch_size = 64
dataset_size = len(states)

print("Training student model with Behavioral Cloning...")
for epoch in tqdm(range(epochs), desc="BC Training"):
    # A new random visiting order for every epoch.
    indices = torch.randperm(dataset_size)
    running_loss = 0.0
    n_batches = 0

    for offset in range(0, dataset_size, batch_size):
        batch_indices = indices[offset:offset + batch_size]

        # The network outputs are treated as logits over the actions.
        logits = student_model(states[batch_indices])
        loss = criterion(logits, actions[batch_indices])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        n_batches += 1

    avg_loss = running_loss / n_batches
    loss_history.append(avg_loss)

    if epoch % 10 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

torch.save(student_model.state_dict(), "models/imitation_model.pth")
print("Behavioral Cloning model saved successfully!")

1. Loss Curve During Behavioral Cloning

In [None]:
# Mean cross-entropy loss of each behavioral-cloning epoch.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(loss_history, color='royalblue', marker='o')
ax.set_xlabel("Epoch")
ax.set_ylabel("Average Cross-Entropy Loss")
ax.set_title("Student Model Loss Curve (Behavioral Cloning)")
ax.grid(True)
fig.tight_layout()
plt.show()

2. Student vs Expert Action Match (Accuracy Estimate)

In [None]:
# Compare the student's greedy prediction with the expert label on every
# sample of the training set.
with torch.no_grad():
    student_logits = student_model(states)
student_preds = student_logits.argmax(dim=1)

total = len(actions)
correct = (student_preds == actions).sum().item()
incorrect = total - correct
accuracy = correct / total * 100

fig, ax = plt.subplots(figsize=(7, 5))

bar_labels = ["Correct", "Incorrect"]
bar_values = [correct, incorrect]
bars = ax.bar(bar_labels, bar_values, color=["seagreen", "indianred"], edgecolor='black')

# Write each count just above its bar.
label_offset = total * 0.01
for bar in bars:
    height = bar.get_height()
    x_center = bar.get_x() + bar.get_width() / 2
    ax.text(x_center, height + label_offset, f"{height}", ha='center', va='bottom', fontsize=12, fontweight='bold')

ax.set_title(f"Student Model Action Matching\nAccuracy: {accuracy:.2f}%", fontsize=14, weight='bold')
ax.set_ylabel("Number of Actions", fontsize=12)
ax.tick_params(axis='both', labelsize=11)
ax.set_ylim(0, total * 1.1)  # headroom so the labels stay inside the axes
ax.grid(axis='y', linestyle='--', alpha=0.6)

plt.tight_layout()
plt.show()

3. Action Distribution: Expert vs Student

In [None]:
# Side-by-side frequency of each action in the expert labels and in the
# student's predictions.
# Move each tensor to the CPU and convert it to a list once, instead of
# repeating the transfer and conversion for every action index.
expert_action_list = actions.cpu().tolist()
student_action_list = student_preds.cpu().tolist()

# Expert actions
expert_action_counts = [expert_action_list.count(i) for i in range(n_actions)]

# Student actions
student_action_counts = [student_action_list.count(i) for i in range(n_actions)]

x = range(n_actions)
bar_width = 0.35

plt.figure(figsize=(8, 5))
plt.bar(x, expert_action_counts, width=bar_width, label='Expert', color='skyblue')
# Student bars are shifted right by one bar width so the pairs sit side by side.
plt.bar([i + bar_width for i in x], student_action_counts, width=bar_width, label='Student', color='orange')
plt.xlabel("Action")
plt.ylabel("Frequency")
plt.title("Action Distribution: Expert vs Student")
# Centre each tick between the two bars of a pair.
plt.xticks([i + bar_width/2 for i in x], x)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

## 6. DAgger (Dataset Aggregation)

Use DAgger to improve the student model by aggregating additional expert-labeled data.

In [None]:
# DAgger setup: environment, frozen expert, student warm-started from
# behavioral cloning, and the aggregated dataset.
env = gym.make("LunarLander-v3")
n_observations = env.observation_space.shape[0]
n_actions = env.action_space.n

# Expert
expert_model = DQN(n_observations, n_actions).to(device)
expert_model.load_state_dict(torch.load("models/dqn_lunar_lander.pth", map_location=device))
expert_model.eval()

# Student (initially from behavioral cloning)
student_model = DQN(n_observations, n_actions).to(device)
student_model.load_state_dict(torch.load("models/imitation_model.pth", map_location=device))

# Aggregated dataset: {'states': [...], 'actions': [...]}.
# Resume from a previously saved dataset when one exists. Otherwise (e.g. on
# a fresh run) start from the expert demonstrations, so the cell does not
# fail with FileNotFoundError before DAgger has ever written the file.
try:
    with open('dagger_dataset.pkl', 'rb') as f:
        dataset = pickle.load(f)
    print("Loaded DAgger dataset")
except FileNotFoundError:
    with open('expert_trajectories.pkl', 'rb') as f:
        seed_trajectories = pickle.load(f)
    dataset = {
        'states': [obs for traj in seed_trajectories for obs in traj['obs']],
        'actions': [act for traj in seed_trajectories for act in traj['actions']],
    }
    print("No saved DAgger dataset found; initialised it from the expert trajectories")

# DAgger parameters
n_dagger_iterations = 5  # rounds of collect-then-train
n_episodes_per_iteration = 10  # student rollouts collected per round
epochs_per_iteration = 20  # supervised epochs per round
batch_size = 64

optimizer = optim.Adam(student_model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [None]:
# === Tracking lists ===
losses_per_iteration = []  # one list of per-epoch mean losses per DAgger round
dataset_sizes = []  # aggregated dataset size at the start of each round's training

# DAgger: the student drives the environment, the expert labels every visited
# state, and the student is retrained on the growing dataset.
print("Starting DAgger training...")
for dagger_iter in range(n_dagger_iterations):
    print(f"\nDAgger Iteration {dagger_iter+1}/{n_dagger_iterations}")

    # --- Data collection under the student's own state distribution ---
    for episode in tqdm(range(n_episodes_per_iteration), desc="Collecting data"):
        state, _ = env.reset()
        done = False

        while not done:
            state_tensor = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)

            with torch.no_grad():
                # Action the student would take, and the expert's label for the same state.
                student_action = student_model(state_tensor).argmax(dim=1).item()
                expert_action = expert_model(state_tensor).argmax(dim=1).item()

            # Store the visited state with the expert's label.
            dataset['states'].append(state)
            dataset['actions'].append(expert_action)

            # The environment advances with the student's action.
            state, _, terminated, truncated, _ = env.step(student_action)
            done = terminated or truncated

    # --- Supervised training on everything aggregated so far ---
    states = torch.tensor(np.array(dataset['states']), dtype=torch.float32).to(device)
    actions = torch.tensor(dataset['actions'], dtype=torch.long).to(device)

    dataset_size = len(states)
    print(f"Training on aggregated dataset of size {dataset_size}...")
    dataset_sizes.append(dataset_size)

    avg_epoch_losses = []

    for epoch in tqdm(range(epochs_per_iteration), desc="Training"):
        # A new random visiting order for every epoch.
        indices = torch.randperm(dataset_size)
        running_loss = 0.0
        n_batches = 0

        for offset in range(0, dataset_size, batch_size):
            batch_indices = indices[offset:offset + batch_size]

            logits = student_model(states[batch_indices])
            loss = criterion(logits, actions[batch_indices])

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            n_batches += 1

        avg_loss = running_loss / n_batches
        avg_epoch_losses.append(avg_loss)

        if epoch % 5 == 0:
            print(f"  Epoch {epoch+1}/{epochs_per_iteration}, Loss: {avg_loss:.4f}")

    losses_per_iteration.append(avg_epoch_losses)

    # Checkpoint the student after every round.
    torch.save(student_model.state_dict(), f"models/dagger_model_iter{dagger_iter+1}.pth")

    # Persist the aggregated dataset after every round.
    with open('dagger_dataset.pkl', 'wb') as f:
        pickle.dump(dataset, f)

print("DAgger training complete!")

Plot 1: Loss curves

In [None]:
# Per-epoch student loss, one curve per DAgger iteration, on a log-scaled y-axis.
fig, ax = plt.subplots(figsize=(10, 6))
for iteration_number, losses in enumerate(losses_per_iteration, start=1):
    ax.plot(losses, label=f"DAgger Iter {iteration_number}")
ax.set_title("Student Log Loss per Epoch During DAgger Iterations", fontsize=14, weight='bold')
ax.set_xlabel("Epoch", fontsize=12)
ax.set_ylabel("Cross-Entropy Loss (Log Scale)", fontsize=12)
ax.set_yscale("log")
ax.legend()
# Grid lines on both major and minor ticks.
ax.grid(True, linestyle='--', alpha=0.5, which='both')
fig.tight_layout()
plt.show()

Plot 2: Dataset growth

In [None]:
# Size of the aggregated dataset after each DAgger iteration.
iteration_numbers = range(1, n_dagger_iterations + 1)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(iteration_numbers, dataset_sizes, marker='o', color='teal', linewidth=2)
# Annotate every point with its sample count.
for iteration, size in enumerate(dataset_sizes, start=1):
    ax.text(iteration, size, str(size), ha='center', va='bottom', fontsize=10)

ax.set_title("Aggregated Dataset Size After Each DAgger Iteration", fontsize=14, weight='bold')
ax.set_xlabel("DAgger Iteration", fontsize=12)
ax.set_ylabel("Total Samples", fontsize=12)
ax.grid(True, linestyle='--', alpha=0.5)
ax.set_xticks(iteration_numbers)
fig.tight_layout()
plt.show()

## 7. Self-Play Enhancement

Further improve the model through self-play reinforcement learning.

In [None]:
# Parameters for the self-play stage
# NOTE(review): the code that consumes these constants lies outside this cell;
# the descriptions below follow the names and the pre-existing comments and
# should be confirmed against the self-play training loop.
# GAMMA, TAU and LR rebind the globals already set for expert DQN training
# (same values).
MEMORY_SIZE = 100000  # presumably the replay-memory capacity -- TODO confirm
BATCH_SIZE = 128  # presumably the mini-batch size -- TODO confirm
GAMMA = 0.99
TAU = 0.005
LR = 0.0001
N_ITERATIONS = 50  # presumably the number of self-play iterations -- TODO confirm
EPISODES_PER_ITER = 20  # presumably episodes played per iteration -- TODO confirm
ALPHA = 0.7  # Weight for imitation loss
BETA = 0.2   # Weight for policy divergence
GAMMA_REWARD = 0.1  # Weight for reward optimization

# Container for average loss metrics per iteration.
# Note: this rebinds `loss_history`, discarding the behavioral-cloning loss
# list that used the same name.
loss_history = []
all_opps = set()  # presumably the opponents encountered so far -- TODO confirm

def _play_greedy_episode(policy, env, seed):
    """Play one greedy episode of ``policy`` from ``env.reset(seed=seed)``.

    Returns the sum of the rewards collected until the episode terminates or
    is truncated.
    """
    # Build inputs on the policy's own device rather than relying on a
    # module-level global; a parameter-less policy uses the default device.
    first_param = next(policy.parameters(), None)
    policy_device = first_param.device if first_param is not None else None

    state, _ = env.reset(seed=seed)
    total_reward = 0
    done = False
    while not done:
        state_tensor = torch.tensor(state, dtype=torch.float32, device=policy_device).unsqueeze(0)
        with torch.no_grad():
            action = policy(state_tensor).argmax(dim=1).item()
        state, reward, terminated, truncated, _ = env.step(action)
        total_reward += reward
        done = terminated or truncated
    return total_reward


def evaluate_against_opponent(model, opponent_model, env, episodes=5):
    """Compare ``model`` with ``opponent_model`` on identically seeded episodes.

    The environment is single-agent, so "competition" means: for each episode
    both policies play greedily from a reset with the same seed, and the one
    with the higher total reward wins that episode.

    Args:
        model: Policy under evaluation.
        opponent_model: Policy it is compared against.
        env: Environment exposing ``reset(seed=...)`` and ``step(action)``.
        episodes: Number of seeded scenarios to play.

    Returns:
        Dict with ``"model_avg"`` and ``"opponent_avg"`` (mean total rewards)
        and ``"win_rate"`` (fraction of episodes where the model scored
        strictly higher).

    Side effects:
        Both networks are switched to eval mode. On return ``model`` is put
        into train mode; ``opponent_model`` is left in eval mode.
    """
    model.eval()
    opponent_model.eval()

    total_rewards_model = []
    total_rewards_opponent = []

    for _ in range(episodes):
        # One seed per scenario, shared by both players.
        seed = random.randint(0, 10000)
        total_rewards_model.append(_play_greedy_episode(model, env, seed))
        total_rewards_opponent.append(_play_greedy_episode(opponent_model, env, seed))

    model.train()
    wins = sum(r_m > r_o for r_m, r_o in zip(total_rewards_model, total_rewards_opponent))
    win_rate = wins / episodes

    return {
        "model_avg": np.mean(total_rewards_model),
        "opponent_avg": np.mean(total_rewards_opponent),
        "win_rate": win_rate
    }


def calculate_competitive_reward(state, next_state, reward, model_action, opponent_action, opponent_type, done):
    """Return the environment reward augmented with shaping bonuses.

    Args:
        state: Observation the action was taken from; index 5 is read as the
            angular velocity.
        next_state: Unused; kept for interface compatibility.
        reward: Reward returned by the environment for this step.
        model_action: Action index chosen by the learning model.
        opponent_action: Action index chosen by the opponent.
        opponent_type: The alignment bonus is applied only for ``"expert"``.
        done: Whether this step ended the episode.

    Returns:
        ``reward`` plus, where applicable:
        * expert alignment: +1.0 when the actions match, otherwise a penalty
          of ``-0.1 * |Q_expert - Q_model|`` (the gap is capped at 5.0, so the
          penalty is never below -0.5), using the global ``expert_model``;
        * stability (non-final steps): up to +0.2, falling linearly to 0 as
          the absolute angular velocity reaches 0.5;
        * landing (final step with ``reward > 100``): +5.0.
    """
    # Base environmental reward
    competitive_reward = reward

    if opponent_type == "expert":
        if model_action == opponent_action:
            # Matching the expert needs no Q-values, so the forward pass is skipped.
            alignment_bonus = 1.0
        else:
            with torch.no_grad():
                state_tensor = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
                q_values = expert_model(state_tensor)[0]
                model_q = q_values[model_action].item()
                expert_q = q_values[opponent_action].item()
            q_diff = expert_q - model_q
            alignment_bonus = max(-0.5, -0.1 * min(5.0, abs(q_diff)))
        competitive_reward += alignment_bonus

    if not done:
        # Stability bonus for keeping the rotation rate low.
        angular_velocity = abs(state[5])
        stability_bonus = 0.2 * (1.0 - min(1.0, angular_velocity/0.5))
        competitive_reward += stability_bonus
    elif reward > 100:
        # Landing bonus for a large final-step reward.
        competitive_reward += 5.0

    return competitive_reward


def select_strategic_opponents(model_versions, best_model, n=3):
    """Pick up to ``n`` opponents for ``best_model`` to train against.

    When there are at most ``n`` candidates, ``model_versions`` is returned
    unchanged. Otherwise the entry named ``"Expert"`` (if any) is always
    kept, and the remaining slots go to the candidates whose win rate against
    ``best_model`` in a two-episode ``quick_evaluate_against`` run is closest
    to 0.5.

    Args:
        model_versions: List of dicts with at least ``"name"`` and ``"model"``.
        best_model: Model the candidates are evaluated against.
        n: Maximum number of opponents to return.
    """
    if len(model_versions) <= n:
        return model_versions

    # The expert, when present, always takes the first slot.
    expert = next((m for m in model_versions if m["name"] == "Expert"), None)
    selected = [expert] if expert else []

    scored_candidates = []
    for candidate in model_versions:
        if candidate in selected:
            continue

        # Quick probe of how the best model fares against this candidate.
        outcome = quick_evaluate_against(best_model, candidate["model"], episodes=2)

        # 1.0 for an even match (win rate 0.5), 0.0 for a one-sided one.
        challenge_score = 1.0 - abs(outcome["win_rate"] - 0.5) * 2.0
        scored_candidates.append((candidate, challenge_score))

    # Highest score first; ties keep their original order.
    ranked = sorted(scored_candidates, key=lambda pair: pair[1], reverse=True)
    open_slots = n - len(selected)
    selected.extend(candidate for candidate, _ in ranked[:open_slots])

    return selected


def evaluate_policy(model, env, episodes=5):
    """Evaluate a policy greedily (no exploration).

    Args:
        model: Network whose arg-max output is used as the action.
        env: Environment exposing ``reset()`` and ``step(action)``.
        episodes: Number of episodes to play.

    Returns:
        Mean total reward over the played episodes.

    Side effects:
        ``model`` is switched to eval mode during the rollouts and put into
        train mode before returning.
    """
    model.eval()

    # Build inputs on the model's own device rather than relying on a
    # module-level global; a parameter-less model uses the default device.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    total_rewards = []
    for _ in range(episodes):
        state, _ = env.reset()
        total_reward = 0
        done = False
        while not done:
            state_tensor = torch.tensor(state, dtype=torch.float32, device=model_device).unsqueeze(0)
            with torch.no_grad():
                action = model(state_tensor).argmax(dim=1).item()
            next_state, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            done = terminated or truncated
            state = next_state
        total_rewards.append(total_reward)
    model.train()
    return np.mean(total_rewards)
def compute_enhanced_composite_loss(state_action_values, expected_values,
                                    logits, expert_actions, prev_logits,
                                    states, next_states, actions,
                                    alpha=0.7, beta=0.2,
                                    batch_size=128):
    """Combine RL, imitation, policy-regularisation, safety and entropy terms.

    Args:
        state_action_values: Q-values of the actions actually taken.
        expected_values: TD targets, same shape as ``state_action_values``.
        logits: Full per-action network output for the batch (one row per sample).
        expert_actions: Action indices for the expert samples, which must occupy
            the LAST ``len(expert_actions)`` rows of the batch; ``None`` when the
            batch holds no expert samples.
        prev_logits: Output of the previous policy on the same batch, or ``None``
            to skip the KL term.
        states: Unused; kept for call compatibility.
        next_states: Only checked against ``None``; ``None`` disables the safety term.
        actions: Action indices taken for every row of the batch.
        alpha: Weight of the imitation (cross-entropy) term.
        beta: Weight of the KL-divergence term.
        batch_size: Unused; kept for call compatibility. The expert slice is now
            derived from ``expert_actions`` so it cannot disagree with the real
            batch size.

    Returns:
        ``(combined_loss, parts)`` where ``parts`` maps ``'rl'``, ``'imitation'``,
        ``'divergence'``, ``'safety'`` and ``'entropy'`` to Python floats.
    """
    # Scalar zero on the same device/dtype as the network output.
    zero = logits.new_zeros(())

    # 1. RL loss (TD error)
    rl_loss = F.smooth_l1_loss(state_action_values, expected_values)

    # 2. Imitation loss on the trailing expert rows of the batch
    imitation_loss = zero
    if expert_actions is not None:
        expert_targets = expert_actions.reshape(-1).long()
        if expert_targets.numel() > 0:
            expert_logits = logits[logits.shape[0] - expert_targets.shape[0]:]
            imitation_loss = F.cross_entropy(expert_logits, expert_targets)

    # 3. Policy regularization: KL divergence from the previous policy
    div_loss = zero
    if prev_logits is not None:
        log_probs = F.log_softmax(logits, dim=1)
        prev_probs = F.softmax(prev_logits.detach(), dim=1)
        div_loss = F.kl_div(log_probs, prev_probs, reduction='batchmean')

    # 4. Safety loss: a TD target below -50 is used as the crash indicator;
    #    the Q-value of the action taken in such a row is pulled towards -100.
    safety_loss = zero
    if next_states is not None:
        crash_mask = expected_values.detach().reshape(-1) < -50
        if crash_mask.any():
            # gather needs int64 indices shaped (rows, 1)
            crash_actions = actions[crash_mask].long().reshape(-1, 1)
            crash_q_values = logits[crash_mask].gather(1, crash_actions)
            safety_target = torch.full_like(crash_q_values, -100.0)
            safety_loss = F.mse_loss(crash_q_values, safety_target)

    # 5. Entropy bonus for exploration when not learning from expert
    entropy_loss = zero
    if expert_actions is None:
        probs = F.softmax(logits, dim=1)
        log_probs = F.log_softmax(logits, dim=1)
        entropy = -(probs * log_probs).sum(dim=1).mean()
        # Negative because we want to maximize entropy (exploration)
        entropy_loss = -0.01 * entropy

    # Combine all losses with weights
    combined_loss = (rl_loss +
                     alpha * imitation_loss +
                     beta * div_loss +
                     0.5 * safety_loss +
                     entropy_loss)

    return combined_loss, {
        'rl': rl_loss.item(),
        'imitation': imitation_loss.item(),
        'divergence': div_loss.item(),
        'safety': safety_loss.item(),
        'entropy': entropy_loss.item()
    }

# Self-play training with multi-generational competition
print("Starting enhanced self-play training...")

# Opponent pool: the DAgger iterations plus the DQN expert.
model_versions = []
# The SAME expert instance is used for the opponent pool and for the demo
# collection below; re-creating it would make `opponent == expert_model`
# (an identity comparison for nn.Module) false for the pooled expert.
expert_model = None
try:
    # Load DAgger models
    for i in range(1, 6):  # Load all 5 DAgger iterations
        model_path = f"models/dagger_model_iter{i}.pth"
        model = DQN(n_observations, n_actions).to(device)
        model.load_state_dict(torch.load(model_path, map_location=device))
        model.eval()  # Set to evaluation mode
        model_versions.append({"model": model, "name": f"DAgger-{i}"})
    print(f"Loaded {len(model_versions)} previous model versions")

    # Add expert model as another competitor
    loaded_expert = DQN(n_observations, n_actions).to(device)
    loaded_expert.load_state_dict(torch.load("models/dqn_lunar_lander.pth", map_location=device))
    loaded_expert.eval()
    # Publish the name only after the weights loaded successfully.
    expert_model = loaded_expert
    model_versions.append({"model": expert_model, "name": "Expert"})
    print("Added expert model to competitors")
except Exception as e:
    print(f"Couldn't load all model versions: {e}")
    print("Starting with only available models")

# Current best model (student) that we'll continue to improve
best_model = DQN(n_observations, n_actions).to(device)
try:
    best_model.load_state_dict(torch.load("models/dagger_model_iter5.pth", map_location=device))
    print("Loaded latest DAgger model as starting point")
except Exception as e:  # narrow from bare `except:` so Ctrl-C still interrupts
    print(f"Could not load DAgger checkpoint: {e}")
    print("Starting with a fresh model")

# Create target model for stable learning
target_model = DQN(n_observations, n_actions).to(device)
target_model.load_state_dict(best_model.state_dict())
target_model.eval()

# Memory buffers - one standard and one for expert demonstrations
memory = ReplayMemory(MEMORY_SIZE)
expert_memory = ReplayMemory(MEMORY_SIZE // 2)  # Smaller buffer for expert demos

optimizer = optim.Adam(best_model.parameters(), lr=LR)

# Fill expert memory with some expert demonstrations
print("Collecting expert demonstrations...")
if expert_model is None:
    # Pool loading stopped before the expert was reached; load it now.
    # A missing expert checkpoint is fatal here, exactly as before.
    expert_model = DQN(n_observations, n_actions).to(device)
    expert_model.load_state_dict(torch.load("models/dqn_lunar_lander.pth", map_location=device))
    expert_model.eval()

for _ in tqdm(range(20), desc="Expert Demo Collection"):
    state, _ = env.reset()
    done = False

    while not done:
        state_tensor = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)

        with torch.no_grad():
            action = expert_model(state_tensor).argmax(dim=1).item()

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        # Store expert transition (terminal transitions carry no next state)
        expert_memory.push(
            state,
            action,
            next_state if not done else None,
            reward,
            done
        )

        state = next_state

# Performance tracker
performance_history = []
best_reward = evaluate_policy(best_model, env)
print(f"Initial performance: {best_reward:.2f}")

# Create model history folder
os.makedirs("models/competition_history", exist_ok=True)

# Head-to-head helper. select_strategic_opponents looks this name up at module
# level, so it is defined once here instead of being re-created every iteration.
def quick_evaluate_against(model, opponent_model, episodes=2):
    return evaluate_against_opponent(model, opponent_model, env, episodes)


# One entry per training iteration: the mean of every loss component over that
# iteration's optimisation steps (read by the loss plots further down).
loss_history = []

# Main training loop
for iteration in range(N_ITERATIONS):
    print(f"\nIteration {iteration+1}/{N_ITERATIONS}")

    # Use strategic opponent selection instead of random sampling
    if model_versions:
        opponents = select_strategic_opponents(model_versions, best_model, n=3)
        for opponent in opponents:
            print(f"Competing against: {opponent['name']}")
    else:
        opponents = []
        print("No opponent models available")

    # 1. Collect experience through standard self-play
    for _ in tqdm(range(EPISODES_PER_ITER), desc="Collecting experience"):
        state, _ = env.reset()
        done = False

        while not done:
            state_tensor = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)

            # Epsilon-greedy action selection (with small epsilon)
            if random.random() < 0.05:  # 5% exploration
                action = env.action_space.sample()
            else:
                with torch.no_grad():
                    action = best_model(state_tensor).argmax(dim=1).item()

            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            # Store transition
            memory.push(
                state,
                action,
                next_state if not done else None,
                reward,
                done
            )

            state = next_state

    # 2. Collect experience against opponent models
    for opponent_info in opponents:
        opponent = opponent_info["model"]
        # Identify the expert by its pool name. Comparing module objects only
        # works if the very same instance is used everywhere.
        is_expert = opponent_info["name"] == "Expert"

        for _ in tqdm(range(max(1, EPISODES_PER_ITER // 3)), desc=f"Competing vs {opponent_info['name']}"):
            state, _ = env.reset()
            # (state, opponent_action) pairs seen during this episode
            trajectory = []
            done = False

            while not done:
                state_tensor = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)

                # Greedy actions of the current model and of the opponent
                with torch.no_grad():
                    model_action = best_model(state_tensor).argmax(dim=1).item()
                    opponent_action = opponent(state_tensor).argmax(dim=1).item()

                # Execute model's action
                next_state, reward, terminated, truncated, _ = env.step(model_action)
                done = terminated or truncated

                # Calculate "competitive reward" - bonus if model action matches expert
                # This creates a learning signal encouraging better performance
                if is_expert:
                    competitive_reward = reward + (1.0 if model_action == opponent_action else -0.1)
                else:
                    competitive_reward = reward

                # Store transition with modified reward
                memory.push(
                    state,
                    model_action,
                    next_state if not done else None,
                    competitive_reward,
                    done
                )

                # Store opponent's predicted action for this state
                trajectory.append((state, opponent_action))

                state = next_state

            # After episode terminates, store expert trajectories with higher priority
            # NOTE(review): these imitation-only entries carry a placeholder
            # reward of 0 and done=True, yet the training step below also runs
            # every sampled expert transition through the TD loss. Consider
            # masking them out of that term.
            if is_expert:
                for s, a in trajectory:
                    expert_memory.push(
                        s,
                        a,
                        None,  # Don't need next state for imitation
                        0,     # Reward doesn't matter for imitation
                        True   # Done flag doesn't matter for imitation
                    )

    # Save snapshot of current version before training further
    torch.save(best_model.state_dict(), f"models/competition_history/model_iter{iteration+1}_pre.pth")

    # 3. Train on collected experiences with multi-objective loss
    if len(memory) >= BATCH_SIZE:
        print(f"Training on {len(memory)} transitions...")

        # Frozen copy of the pre-training policy, used for KL regularization
        prev_model = DQN(n_observations, n_actions).to(device)
        prev_model.load_state_dict(best_model.state_dict())
        prev_model.eval()

        loss_stats = {'rl': [], 'imitation': [], 'divergence': [], 'safety': [], 'entropy': []}

        for step in tqdm(range(500), desc="Training"):  # Train for 500 batches
            # Every 5th step, mix in expert demonstrations (once enough exist)
            use_expert = (step % 5 == 0) and len(expert_memory) > BATCH_SIZE

            if use_expert:
                # First half regular, second half expert. The imitation loss
                # relies on the expert samples being the trailing rows.
                reg_transitions = memory.sample(BATCH_SIZE // 2)
                exp_transitions = expert_memory.sample(BATCH_SIZE // 2)
                transitions = reg_transitions + exp_transitions
            else:
                transitions = memory.sample(BATCH_SIZE)

            batch = Transition(*zip(*transitions))

            # Process batch data for training
            non_final_mask = torch.tensor(
                [s is not None for s in batch.next_state],
                device=device, dtype=torch.bool)

            state_batch = torch.tensor(np.array(batch.state), dtype=torch.float32, device=device)
            action_batch = torch.tensor(batch.action, dtype=torch.long, device=device).unsqueeze(1)
            # Explicit float32: rewards coming from numpy would otherwise
            # produce a float64 tensor and float64 TD targets.
            reward_batch = torch.tensor(batch.reward, dtype=torch.float32, device=device)
            done_batch = torch.tensor(batch.done, device=device, dtype=torch.bool)

            # Get current Q values
            q_values = best_model(state_batch)
            state_action_values = q_values.gather(1, action_batch)

            # Compute expected Q values with target network
            next_state_values = torch.zeros(BATCH_SIZE, device=device)
            non_final_next_states = None
            if non_final_mask.any():
                # Skipped for an all-terminal batch, where there is nothing to
                # feed the target network.
                non_final_next_states = torch.tensor(
                    np.array([s for s in batch.next_state if s is not None]),
                    dtype=torch.float32, device=device)
                with torch.no_grad():
                    next_state_values[non_final_mask] = target_model(non_final_next_states).max(1)[0]

            expected_state_action_values = (next_state_values * GAMMA * (~done_batch)) + reward_batch

            # Get previous policy logits
            with torch.no_grad():
                prev_logits = prev_model(state_batch)

            # For expert demos, get expert actions for imitation loss
            expert_actions = None
            if use_expert:
                # Only use actions from expert transitions (second half of batch)
                expert_actions = action_batch[BATCH_SIZE // 2:].squeeze(1)

            # Compute the enhanced composite loss
            loss, metrics = compute_enhanced_composite_loss(
                state_action_values,
                expected_state_action_values.unsqueeze(1),
                q_values,
                expert_actions,
                prev_logits,
                state_batch,
                non_final_next_states,
                action_batch,
                alpha=ALPHA,
                beta=BETA,
                batch_size=BATCH_SIZE  # keep the loss's expert split in sync with this batch
            )

            # Update loss statistics
            for k, v in metrics.items():
                loss_stats[k].append(v)

            # Optimize
            optimizer.zero_grad()
            loss.backward()
            # Clip gradients to stabilize training
            torch.nn.utils.clip_grad_norm_(best_model.parameters(), 10.0)
            optimizer.step()

            # Soft update target network
            for target_param, local_param in zip(target_model.parameters(), best_model.parameters()):
                target_param.data.copy_(TAU * local_param.data + (1.0 - TAU) * target_param.data)

        # Record this iteration's average loss components (one entry per iteration)
        iteration_means = {k: np.mean(v) for k, v in loss_stats.items()}
        loss_history.append(iteration_means)

        # Print average loss components
        print(f"  RL Loss: {iteration_means['rl']:.4f}, " +
              f"Imitation Loss: {iteration_means['imitation']:.4f}, " +
              f"Divergence: {iteration_means['divergence']:.4f}, " +
              f"Safety: {iteration_means['safety']:.4f}, " +
              f"Entropy: {iteration_means['entropy']:.4f}")

    # 4. Evaluate against each opponent and against environment
    print("Evaluating against opponents...")
    competition_results = {}
    for opponent_info in opponents:
        name = opponent_info["name"]
        opponent = opponent_info["model"]
        results = evaluate_against_opponent(best_model, opponent, env, episodes=3)
        competition_results[name] = results
        print(f"  vs {name}: Win Rate: {results['win_rate']:.2f}, " +
              f"Avg Reward: {results['model_avg']:.2f} vs {results['opponent_avg']:.2f}")

    # Standard evaluation
    current_reward = evaluate_policy(best_model, env)
    print(f"Environment performance: {current_reward:.2f} (best: {best_reward:.2f})")

    # Track performance
    performance_history.append({
        'iteration': iteration + 1,
        'reward': current_reward,
        'competition': competition_results
    })

    # Save if improved
    if current_reward > best_reward:
        best_reward = current_reward
        torch.save(best_model.state_dict(), f"models/self_play_model_iter{iteration+1}.pth")
        print(f"  Saved improved model with reward {best_reward:.2f}")

        # Add this version to the pool of competitors for future training.
        # Only improvements that land on iteration index 3, 6, 9, ... are added.
        if iteration > 0 and iteration % 3 == 0:
            new_competitor = DQN(n_observations, n_actions).to(device)
            new_competitor.load_state_dict(best_model.state_dict())
            new_competitor.eval()
            model_versions.append({
                "model": new_competitor,
                "name": f"SelfPlay-{iteration+1}"
            })
            print(f"Added model version SelfPlay-{iteration+1} to competitor pool")

# Every opponent name that appears in the recorded competition results.
# Built from scratch here so the cell does not depend on a pre-existing
# `all_opps` and does not accumulate stale names when re-run.
all_opps = {
    opp_name
    for record in performance_history
    for opp_name in record['competition']
}
opponent_names = sorted(all_opps)

# Save final model
print(f"Self-play training complete! Best reward: {best_reward:.2f}")
torch.save(best_model.state_dict(), "models/enhanced_self_play_best.pth")

# Plot performance over iterations
plt.figure(figsize=(12, 6))
rewards = [p['reward'] for p in performance_history]
plt.plot(range(1, len(rewards)+1), rewards, marker='o')
plt.xlabel('Iteration')
plt.ylabel('Average Reward')
plt.title('Self-Play Performance Over Time')
plt.savefig('self_play_performance.png')
plt.show()

In [None]:
# Split the history into parallel x/y series (both names are reused by later cells).
iterations, rewards = [], []
for record in performance_history:
    iterations.append(record['iteration'])
    rewards.append(record['reward'])

fig, ax = plt.subplots()
ax.plot(iterations, rewards, marker='o')
ax.set_xlabel('Iteration')
ax.set_ylabel('Average Reward')
ax.set_title('Self-Play Performance Over Iterations')
ax.grid(True)
plt.show()

In [None]:
# Draw the raw per-update loss curve on the current axes.
ax = plt.gca()
ax.plot(losses)
ax.set_xlabel('Updates')
ax.set_ylabel('Loss')
ax.set_title('Training Loss Over Time')
plt.show()

In [None]:
components = ['rl', 'imitation', 'divergence', 'safety', 'entropy']

# One figure per loss component, x-axis is the 1-based index into loss_history.
for comp in components:
    series = [entry[comp] for entry in loss_history]
    label = f'{comp.capitalize()} Loss'

    fig, ax = plt.subplots()
    ax.plot(range(1, len(series) + 1), series, marker='.')
    ax.set_xlabel('Iteration')
    ax.set_ylabel(label)
    ax.set_title(f'{label} Over Iterations')
    ax.grid(True)
    plt.show()

In [None]:
# Win rate against each opponent, restricted to the iterations where it was faced.
for opp in opponent_names:
    faced = [h for h in performance_history if opp in h['competition']]
    if not faced:
        continue
    iters = [h['iteration'] for h in faced]
    win_rates = [h['competition'][opp]['win_rate'] for h in faced]

    fig, ax = plt.subplots()
    ax.plot(iters, win_rates, marker='x')
    ax.set_xlabel('Iteration')
    ax.set_ylabel('Win Rate')
    ax.set_ylim(0, 1)
    ax.set_title(f'Win Rate vs {opp}')
    ax.grid(True)
    plt.show()

In [None]:
# Average reward of the model and of each opponent over the iterations they met.
for opp in opponent_names:
    matches = [
        (h['iteration'], h['competition'][opp])
        for h in performance_history
        if opp in h['competition']
    ]
    if not matches:
        continue
    iters = [it for it, _ in matches]
    model_avgs = [res['model_avg'] for _, res in matches]
    opp_avgs = [res['opponent_avg'] for _, res in matches]

    fig, ax = plt.subplots()
    ax.plot(iters, model_avgs, marker='o', label='Model')
    ax.plot(iters, opp_avgs, marker='s', label=opp)
    ax.set_xlabel('Iteration')
    ax.set_ylabel('Average Reward')
    ax.set_title(f'Avg Reward: Model vs {opp}')
    ax.legend()
    ax.grid(True)
    plt.show()

In [None]:
# Reward margin (model minus opponent) per iteration, with a zero reference line.
for opp in opponent_names:
    iters = []
    diffs = []
    for h in performance_history:
        outcome = h['competition'].get(opp) if opp in h['competition'] else None
        if opp not in h['competition']:
            continue
        iters.append(h['iteration'])
        diffs.append(outcome['model_avg'] - outcome['opponent_avg'])
    if not iters:
        continue

    fig, ax = plt.subplots()
    ax.plot(iters, diffs, marker='d')
    ax.set_xlabel('Iteration')
    ax.set_ylabel('Reward Difference')
    ax.set_title(f'Reward Difference (Model − {opp})')
    ax.axhline(0, linestyle='--')
    ax.grid(True)
    plt.show()

In [None]:
# Running maximum of the per-iteration reward.
reward_series = [h['reward'] for h in performance_history]
cummax_rewards = np.maximum.accumulate(reward_series)

fig, ax = plt.subplots()
ax.plot(iterations, cummax_rewards, marker='>')
ax.set_xlabel('Iteration')
ax.set_ylabel('Best Reward So Far')
ax.set_title('Cumulative Best Reward Over Iterations')
ax.grid(True)
plt.show()

In [None]:
window = 3
rewards = [h['reward'] for h in performance_history]

# Trailing mean over at most `window` points (shorter at the start of the series).
mov_avg = []
for end in range(1, len(rewards) + 1):
    start = max(0, end - window)
    mov_avg.append(np.mean(rewards[start:end]))

fig, ax = plt.subplots()
ax.plot(iterations, rewards, marker='o', label='Raw')
ax.plot(iterations, mov_avg, marker='*', label=f'{window}-Itr Moving Avg')
ax.set_xlabel('Iteration')
ax.set_ylabel('Average Reward')
ax.set_title('Raw vs. Moving Average Reward')
ax.legend()
ax.grid(True)
plt.show()

## 8. Final Evaluation

Compare all trained models to see which performs best.

In [None]:
# Checkpoints to compare, keyed by the label shown in the printed summary and plots
models = {
    "Expert DQN": "models/dqn_lunar_lander.pth",
    "Behavioral Cloning": "models/imitation_model.pth",
    "DAgger": "models/dagger_model_iter5.pth",
    "Enhanced Self-Play": "models/enhanced_self_play_best.pth",
}

# Fresh environment for the final comparison; network sizes are read from its spaces
env = gym.make("LunarLander-v3", render_mode="rgb_array")
n_actions = env.action_space.n
n_observations = env.observation_space.shape[0]

def evaluate_model_with_render(model_path, n_episodes=10):
    """Load a DQN checkpoint and roll it out greedily in the module-level ``env``.

    Prints the total reward of every episode. If the checkpoint cannot be
    loaded, the error is printed and an empty list is returned.

    Args:
        model_path: Path to a saved DQN state dict.
        n_episodes: Number of episodes to run.

    Returns:
        List with the total reward of each episode.
    """
    policy = DQN(n_observations, n_actions).to(device)
    try:
        policy.load_state_dict(torch.load(model_path, map_location=device))
        policy.eval()
    except Exception as e:
        print(f"Failed to load model: {model_path}, error: {e}")
        return []

    episode_returns = []
    for episode_no in range(1, n_episodes + 1):
        obs, _ = env.reset()
        episode_return = 0

        while True:
            obs_tensor = torch.tensor(obs, dtype=torch.float32, device=device).unsqueeze(0)
            with torch.no_grad():
                greedy_action = policy(obs_tensor).argmax(dim=1).item()
            obs, step_reward, terminated, truncated, _ = env.step(greedy_action)
            episode_return += step_reward
            if terminated or truncated:
                break

        episode_returns.append(episode_return)
        print(f"  Episode {episode_no}: {episode_return:.2f}")

    return episode_returns

In [None]:
# Evaluate all models
results = {}
for name, path in models.items():
    try:
        print(f"Evaluating {name}...")
        rewards = evaluate_model_with_render(path)
        results[name] = rewards
        if rewards:  # Only calculate if we got valid rewards
            print(f"  Mean reward: {np.mean(rewards):.2f} ± {np.std(rewards):.2f}")
    except Exception as e:
        print(f"Error evaluating {name}: {e}")

env.close()

In [None]:
# Plot results
plt.figure(figsize=(12, 6))
valid_models = {name: results[name] for name in results if results[name]}
plt.boxplot([valid_models[name] for name in valid_models.keys()], labels=list(valid_models.keys()))
plt.ylabel('Total Reward')
plt.title('Performance Comparison of Different Models')
plt.savefig('model_comparison.png')
plt.show()

# Print summary
print("\nSummary:")
for name in valid_models:
    rewards = valid_models[name]
    print(f"{name}: {np.mean(rewards):.2f} ± {np.std(rewards):.2f}")

In [None]:
# Only models that produced at least one episode can be summarised. A model
# whose checkpoint failed to load has an empty reward list, and np.mean / np.std
# of an empty list yield nan plus a RuntimeWarning.
model_names = [name for name, episode_rewards in results.items() if episode_rewards]
# Compute means and stds
means = [np.mean(results[name]) for name in model_names]
stds = [np.std(results[name]) for name in model_names]

# 1. Bar chart of mean ± std
plt.figure()
plt.bar(model_names, means, yerr=stds, capsize=5)
plt.ylabel('Average Reward')
plt.title('Comparison of Model Performance')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Reward distribution of each model as a box plot
fig, ax = plt.subplots()
per_model_rewards = [results[name] for name in model_names]
ax.boxplot(per_model_rewards, labels=model_names)
ax.set_ylabel('Reward')
ax.set_title('Reward Distributions Across Models')
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# One line per model: reward of each evaluation episode, episodes numbered from 1
fig, ax = plt.subplots()
for name in model_names:
    rewards = results[name]
    episode_numbers = range(1, len(rewards) + 1)
    ax.plot(episode_numbers, rewards, marker='o', label=name)
ax.set_xlabel('Episode')
ax.set_ylabel('Reward')
ax.set_title('Episode Rewards per Model')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

## 9. Conclusion

In this notebook, we've implemented a comprehensive reinforcement learning workflow including:

1. Expert DQN training on the LunarLander-v3 environment
2. Generation of expert trajectories from the trained DQN model
3. Behavioral Cloning to create an initial student model from expert demonstrations
4. DAgger (Dataset Aggregation) to improve the student model with additional expert-labeled data
5. Enhanced Self-Play reinforcement learning with multi-agent competition dynamics
   - Competition between different model versions
   - Multi-objective optimization (imitation, competition, and policy regularization)
   - Generation-based model pool for diverse opponents
6. Comprehensive evaluation of all approaches

This combined approach leverages imitation learning (learning from expert demonstrations), reinforcement learning (learning from experience), and model compression techniques to create a robust, efficient agent that can effectively solve the lunar landing task while maintaining a small model size.

Our enhanced self-play mechanism adapted concepts from multi-agent training to a single-agent environment by creating a competitive dynamic between different versions of our policy, leading to more robust performance.