<a href="https://colab.research.google.com/github/OneFineStarstuff/State-of-the-Art/blob/main/Deep_Q_Networks_(DQN).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np

# Define the Q-network
class DQN(nn.Module):
    """Small fully connected Q-network: Linear -> ReLU -> Linear.

    Args:
        input_dim: Number of features in each input state vector.
        output_dim: Number of outputs produced per state (one per action).
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_units = 128
        # Layers are created in the same order they are applied.
        layers = [
            nn.Linear(input_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, output_dim),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for ``x``, whose last dimension is ``input_dim``."""
        q_values = self.fc(x)
        return q_values

# Experience replay buffer
class ReplayBuffer:
    """Fixed-capacity ring buffer of transitions with uniform random sampling.

    Once ``capacity`` transitions are stored, each new push overwrites the
    oldest stored one.
    """

    def __init__(self, capacity):
        self.capacity = capacity  # maximum number of transitions kept
        self.buffer = []  # stored (state, action, reward, next_state, done) tuples
        self.position = 0  # index the next push writes to

    def push(self, state, action, reward, next_state, done):
        """Store one transition, overwriting the oldest entry when full."""
        transition = (state, action, reward, next_state, done)
        if len(self.buffer) < self.capacity:
            # Still filling: the write index equals the current length,
            # so writing at that index is the same as appending.
            self.buffer.append(transition)
        else:
            self.buffer[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                transitions (raised by ``random.sample``).
        """
        stored = self.buffer
        return random.sample(stored, k=batch_size)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

# Hyperparameters

# Optimisation
learning_rate = 0.001  # step size passed to the Adam optimizer
batch_size = 64  # transitions per gradient step; training starts once the buffer holds this many
gamma = 0.99  # discount applied to the next-state Q-value in the TD target

# Replay memory
buffer_capacity = 10_000  # maximum number of transitions kept in the replay buffer

# Epsilon-greedy exploration: epsilon decays exponentially from epsilon_start
# toward epsilon_end, with epsilon_decay as the time constant in episodes.
epsilon_start = 1.0
epsilon_end = 0.01
epsilon_decay = 500

# Build the CartPole environment, the replay memory, and a Q-network sized to
# the environment's observation vector and action count, plus its optimizer.
env = gym.make("CartPole-v1")
replay_buffer = ReplayBuffer(capacity=buffer_capacity)
q_net = DQN(
    input_dim=env.observation_space.shape[0],
    output_dim=env.action_space.n,
)
optimizer = optim.Adam(q_net.parameters(), lr=learning_rate)

def select_action(state, epsilon):
    """Choose an action epsilon-greedily using the module-level ``q_net``.

    Args:
        state: Current observation, convertible to a float32 tensor.
        epsilon: Probability of taking a random action from ``env.action_space``.

    Returns:
        The sampled random action, or the index of the largest Q-value as a
        Python int.
    """
    explore = random.random() < epsilon
    if explore:
        return env.action_space.sample()

    # Greedy branch: no gradients are needed for action selection.
    state_tensor = torch.tensor(state, dtype=torch.float32)
    with torch.no_grad():
        q_values = q_net(state_tensor)
    return q_values.argmax().item()

# Training loop
#
# Each episode: act epsilon-greedily, store every transition in the replay
# buffer, and (once the buffer holds at least one batch) take one gradient
# step per environment step on a randomly sampled batch.
num_episodes = 500
for episode in range(num_episodes):
    # gym >= 0.26 returns (observation, info) from reset(); older versions
    # return the observation alone. Accept both.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    total_reward = 0
    done = False

    # Epsilon depends only on the episode index, so compute it once per
    # episode rather than on every step.
    epsilon = epsilon_end + (epsilon_start - epsilon_end) * np.exp(-1. * episode / epsilon_decay)

    while not done:
        action = select_action(state, epsilon)

        # gym >= 0.26 returns (obs, reward, terminated, truncated, info);
        # older versions return (obs, reward, done, info).
        step_result = env.step(action)
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, _ = step_result
            done = bool(terminated or truncated)
        else:
            next_state, reward, done, _ = step_result

        replay_buffer.push(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward

        if len(replay_buffer) >= batch_size:
            transitions = replay_buffer.sample(batch_size)
            # Transpose a list of transitions into per-field tuples.
            batch_state, batch_action, batch_reward, batch_next_state, batch_done = zip(*transitions)

            # Stack the per-step observation arrays into one ndarray first;
            # building a tensor directly from a sequence of ndarrays is slow.
            batch_state = torch.tensor(np.array(batch_state), dtype=torch.float32)
            batch_action = torch.tensor(batch_action, dtype=torch.long)
            batch_reward = torch.tensor(batch_reward, dtype=torch.float32)
            batch_next_state = torch.tensor(np.array(batch_next_state), dtype=torch.float32)
            batch_done = torch.tensor(batch_done, dtype=torch.float32)

            # Q(s, a) for the actions that were actually taken.
            q_values = q_net(batch_state).gather(1, batch_action.unsqueeze(1)).squeeze(1)

            # The TD target is treated as a constant: compute it without
            # tracking gradients so backward() only updates Q(s, a).
            # (1 - done) removes the bootstrap term on the final step.
            with torch.no_grad():
                next_q_values = q_net(batch_next_state).max(1)[0]
                expected_q_values = batch_reward + gamma * next_q_values * (1 - batch_done)

            loss = nn.functional.mse_loss(q_values, expected_q_values)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    print(f"Episode {episode}, Total Reward: {total_reward}")

print("Training complete.")