In [None]:
import gym
import numpy as np
import random
from collections import deque
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

# Reproducibility: pin every RNG this notebook seeds (Python, NumPy, PyTorch)
# to one shared constant so a fresh "Run All" repeats the same draws.
SEED = 42
for _seed_rng in (random.seed, np.random.seed):
    _seed_rng(SEED)
torch.manual_seed(SEED)


In [None]:
class DQN(nn.Module):
    """Fully connected Q-network.

    Maps an observation vector of size ``input_dim`` to ``output_dim`` values
    (one per action) through two 256-unit ReLU hidden layers.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_units = 256
        # Layers are created in input -> output order and kept in an
        # nn.Sequential stored under the attribute name ``layers``.
        stack = [
            nn.Linear(input_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, output_dim),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Return the network output for ``x``."""
        q_values = self.layers(x)
        return q_values


In [None]:
class ReplayBuffer:
    """Bounded FIFO store of ``(state, action, reward, next_state, done)`` tuples."""

    def __init__(self, capacity=100_000):
        # deque(maxlen=...) drops the oldest entry once ``capacity`` is reached.
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions without replacement.

        Returns a 5-tuple of NumPy arrays
        ``(states, actions, rewards, next_states, dones)``.
        """
        picked = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one column per field.
        states, actions, rewards, next_states, dones = zip(*picked)
        columns = (states, actions, rewards, next_states, dones)
        return tuple(map(np.array, columns))

    def __len__(self):
        return len(self.buffer)


In [None]:
def train(env, n_episodes=1000, gamma=0.99, lr=1e-4, batch_size=64,
          buffer_size=100_000, epsilon_start=1.0, epsilon_end=0.01, epsilon_decay=0.995,
          target_update_freq=10):
    """Train a DQN agent on ``env`` using experience replay and a target network.

    Args:
        env: Environment exposing the 5-tuple ``step`` API
            ``(obs, reward, terminated, truncated, info)``, an
            ``observation_space`` with a 1-D ``shape`` and an ``action_space``
            with ``n``, ``sample()`` and ``seed()``.
        n_episodes: Number of training episodes to run.
        gamma: Discount factor applied to the bootstrapped next-state value.
        lr: Learning rate for the Adam optimizer.
        batch_size: Transitions per gradient step; learning starts once the
            replay buffer holds at least this many transitions.
        buffer_size: Capacity of the replay buffer.
        epsilon_start: Initial exploration rate for epsilon-greedy actions.
        epsilon_end: Lower bound on the exploration rate.
        epsilon_decay: Multiplicative decay applied to epsilon after each episode.
        target_update_freq: Copy the policy weights into the target network
            every this many episodes.

    Returns:
        Tuple ``(policy_net, rewards_per_episode)``: the trained network and
        the list of undiscounted episode returns.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_observations = env.observation_space.shape[0]
    n_actions = env.action_space.n
    policy_net = DQN(n_observations, n_actions).to(device)
    target_net = DQN(n_observations, n_actions).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.Adam(policy_net.parameters(), lr=lr)
    loss_fn = nn.MSELoss()
    replay_buffer = ReplayBuffer(capacity=buffer_size)
    epsilon = epsilon_start

    # action_space.sample() draws from the space's own RNG, which the global
    # random / numpy / torch seeds do not touch; seed it for reproducibility.
    env.action_space.seed(SEED)

    rewards_per_episode = []

    for episode in range(n_episodes):
        # Seed the environment only on the first reset. Re-seeding on every
        # reset would restart its RNG and replay the same initial state in
        # every episode.
        if episode == 0:
            state, _ = env.reset(seed=SEED)
        else:
            state, _ = env.reset()
        done = False
        total_reward = 0

        while not done:
            if random.random() < epsilon:
                action = env.action_space.sample()
            else:
                with torch.no_grad():
                    state_tensor = torch.FloatTensor(state).unsqueeze(0).to(device)
                    action = policy_net(state_tensor).argmax().item()

            next_state, reward, terminated, truncated, _ = env.step(action)
            # The episode loop stops on either signal, but only a true
            # termination is stored: a time-limit truncation is not a terminal
            # state, so its target must still bootstrap from next_state.
            done = terminated or truncated
            replay_buffer.push(state, action, reward, next_state, terminated)
            state = next_state
            total_reward += reward

            if len(replay_buffer) >= batch_size:
                states, actions, rewards_batch, next_states, dones = replay_buffer.sample(batch_size)

                states = torch.FloatTensor(states).to(device)
                actions = torch.LongTensor(actions).unsqueeze(1).to(device)
                rewards_batch = torch.FloatTensor(rewards_batch).unsqueeze(1).to(device)
                next_states = torch.FloatTensor(next_states).to(device)
                dones = torch.FloatTensor(dones).unsqueeze(1).to(device)

                # Q(s, a) for the actions that were actually taken.
                q_values = policy_net(states).gather(1, actions)

                # The TD target is a constant w.r.t. the optimized parameters;
                # building it under no_grad avoids computing and accumulating
                # gradients on the frozen target network.
                with torch.no_grad():
                    next_q_values = target_net(next_states).max(1)[0].unsqueeze(1)
                    expected_q = rewards_batch + gamma * next_q_values * (1 - dones)

                loss = loss_fn(q_values, expected_q)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        rewards_per_episode.append(total_reward)
        epsilon = max(epsilon_end, epsilon * epsilon_decay)

        if episode % target_update_freq == 0:
            target_net.load_state_dict(policy_net.state_dict())

        if episode % 10 == 0:
            mean_reward = np.mean(rewards_per_episode[-10:])
            print(f"Episode {episode}, Mean Reward (last 10): {mean_reward:.2f}, Epsilon: {epsilon:.3f}")

    return policy_net, rewards_per_episode


In [None]:
def evaluate(env, policy_net, n_episodes=20):
    """Run the greedy policy for ``n_episodes`` and report the mean return.

    Args:
        env: Environment exposing the 5-tuple ``step`` API
            ``(obs, reward, terminated, truncated, info)``.
        policy_net: Network whose arg-max output is taken as the action.
        n_episodes: Number of evaluation episodes.

    Returns:
        The mean undiscounted episode return (also printed).
    """
    # Build inputs on the device the network already lives on instead of
    # guessing from CUDA availability, so a CPU model works on a GPU machine.
    first_param = next(policy_net.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    rewards = []

    for episode in range(n_episodes):
        # Seed only the first reset. With a deterministic greedy policy,
        # re-seeding every reset would replay one identical episode
        # n_episodes times and make the reported average meaningless.
        if episode == 0:
            state, _ = env.reset(seed=SEED)
        else:
            state, _ = env.reset()
        done = False
        total_reward = 0

        while not done:
            with torch.no_grad():
                state_tensor = torch.FloatTensor(state).unsqueeze(0).to(device)
                action = torch.argmax(policy_net(state_tensor)).item()
            state, reward, terminated, truncated, _ = env.step(action)
            # Stop on either a true terminal state or a time-limit truncation.
            done = terminated or truncated
            total_reward += reward

        rewards.append(total_reward)

    avg_reward = np.mean(rewards)
    print(f"Average evaluation reward over {n_episodes} episodes: {avg_reward:.2f}")
    return avg_reward


In [None]:
# Train on LunarLander, evaluate the greedy policy, then plot the learning curve.
env = gym.make("LunarLander-v2")
try:
    policy_net, rewards = train(env, n_episodes=500)
    evaluate(env, policy_net)
finally:
    # Release the environment's resources even if training or evaluation
    # raises part-way through.
    env.close()

# Explicit figure/axes instead of the implicit pyplot state machine.
fig, ax = plt.subplots()
ax.plot(rewards)
ax.set_xlabel("Episode")
ax.set_ylabel("Reward")
ax.set_title("Training Rewards")
ax.grid()
plt.show()
