<a href="https://colab.research.google.com/github/OneFineStarstuff/State-of-the-Art/blob/main/Reinforcement_Learning_(RL)_with_Deep_Q_Networks_(DQN).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip show gym

In [None]:
pip install --upgrade gym

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gym
from collections import deque
import random

# Define the DQN network
class DQN(nn.Module):
    """Fully connected Q-network with two hidden ReLU layers of width 128.

    Args:
        input_dim: Number of input features per row.
        output_dim: Number of values produced per row (one per action).
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_width = 128
        self.fc1 = nn.Linear(input_dim, hidden_width)
        self.fc2 = nn.Linear(hidden_width, hidden_width)
        self.fc3 = nn.Linear(hidden_width, output_dim)

    def forward(self, x):
        """Return the raw (un-activated) output of the final linear layer."""
        hidden = nn.functional.relu(self.fc1(x))
        hidden = nn.functional.relu(self.fc2(hidden))
        return self.fc3(hidden)

# Hyperparameters
learning_rate = 0.001  # step size handed to the Adam optimizer below
gamma = 0.99  # discount factor applied to next-state Q-values in optimize_model
batch_size = 32  # transitions sampled from replay memory per optimization step
epsilon = 1.0  # initial probability of taking a random action in select_action
epsilon_decay = 0.995  # multiplicative factor applied to epsilon after every episode
epsilon_min = 0.01  # lower bound that epsilon is clamped to while decaying
memory_size = 10000  # maximum number of transitions retained in replay memory
target_update_frequency = 10  # number of episodes between target-network syncs

# Initialize environment and DQN models
env = gym.make('CartPole-v1')  # single environment instance used for training and testing
input_dim = env.observation_space.shape[0]  # length of one observation vector
output_dim = env.action_space.n  # number of discrete actions
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
policy_net = DQN(input_dim, output_dim).to(device)
target_net = DQN(input_dim, output_dim).to(device)
# Start the target network as an exact copy of the policy network.
target_net.load_state_dict(policy_net.state_dict())
# The target network is never passed to the optimizer; it is only evaluated.
target_net.eval()

# Only policy_net parameters are optimized.
optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)
# Replay memory: a bounded deque, so the oldest transitions are dropped once full.
memory = deque(maxlen=memory_size)

# Define action selection (epsilon-greedy policy)
def select_action(state, test_mode=False):
    """Choose an action for ``state`` with an epsilon-greedy policy.

    Args:
        state: Observation to evaluate; anything ``torch.tensor`` can convert
            to a float32 tensor.
        test_mode: When true, exploration is disabled and the greedy action
            is always returned.

    Returns:
        int: Index of the selected action.
    """
    if not test_mode and np.random.rand() < epsilon:
        return random.randrange(output_dim)  # Random action (exploration)

    # Action selection never needs gradients; disabling autograd avoids
    # building a throwaway graph on every environment step.
    with torch.no_grad():
        state_tensor = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
        q_values = policy_net(state_tensor)
    return torch.argmax(q_values).item()  # Action with max Q-value (exploitation)

# Optimize the model using experience replay
def optimize_model():
    """Run one gradient step on ``policy_net`` from a replay-memory minibatch.

    Does nothing until ``memory`` holds at least ``batch_size`` transitions.
    Otherwise samples ``batch_size`` transitions uniformly, forms the one-step
    targets ``reward + gamma * max_a Q_target(next_state, a) * (1 - done)``
    and minimizes the mean squared error between those targets and the
    policy network's Q-values for the actions that were taken.

    Returns:
        None.
    """
    if len(memory) < batch_size:
        return

    transitions = random.sample(memory, batch_size)
    states, actions, rewards, next_states, dones = zip(*transitions)

    # Stack the observations into one ndarray first: building a tensor
    # directly from a sequence of ndarrays is very slow and makes PyTorch
    # emit a warning.
    states = torch.tensor(np.array(states, dtype=np.float32), device=device)
    next_states = torch.tensor(np.array(next_states, dtype=np.float32), device=device)
    actions = torch.tensor(actions, dtype=torch.long, device=device)
    rewards = torch.tensor(rewards, dtype=torch.float32, device=device)
    dones = torch.tensor(dones, dtype=torch.float32, device=device)

    # Q-values of the actions actually taken, one per sampled transition.
    current_q_values = policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)

    # The target is a fixed regression label: computing it without autograd
    # keeps loss.backward() from propagating into target_net and piling up
    # gradients on parameters that are never optimized.
    with torch.no_grad():
        next_q_values = target_net(next_states).max(1)[0]
        expected_q_values = rewards + gamma * next_q_values * (1 - dones)

    # Compute the loss and update policy_net
    loss = nn.functional.mse_loss(current_q_values, expected_q_values)
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(policy_net.parameters(), max_norm=1.0)  # Gradient clipping
    optimizer.step()

# Training loop
num_episodes = 1000
for episode in range(num_episodes):
    # Reset exactly once per episode. Newer Gym versions return
    # (observation, info); older ones return the bare observation.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    total_reward = 0
    done = False

    while not done:
        action = select_action(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        total_reward += reward

        # Store only true termination as the "done" flag: a time-limit
        # truncation does not mean the next state has zero value, so the
        # target should still bootstrap from it.
        memory.append((state, action, reward, next_state, float(terminated)))
        state = next_state

        optimize_model()

    # Decay epsilon. No ``global`` statement here: this loop already runs at
    # module scope, and declaring ``global epsilon`` after the name has been
    # assigned in the same module is a SyntaxError.
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    # Update the target network periodically
    if episode % target_update_frequency == 0:
        target_net.load_state_dict(policy_net.state_dict())

    print(f"Episode {episode + 1}, Total Reward: {total_reward}, Epsilon: {epsilon:.4f}")

# Test the trained agent: one greedy episode with exploration disabled.
# Reset exactly once. Newer Gym versions return (observation, info);
# older ones return the bare observation.
reset_result = env.reset()
state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
total_reward = 0
done = False
while not done:
    action = select_action(state, test_mode=True)
    next_state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    total_reward += reward
    state = next_state
print(f"Test Reward: {total_reward}")