<a href="https://colab.research.google.com/github/OneFineStarstuff/State-of-the-Art/blob/main/DQN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import random
from collections import deque

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F  # Import torch.nn.functional as F
import torch.optim as optim

# Define the Q-Network
class QNetwork(nn.Module):
    """Fully connected network that maps a state vector to one Q-value per action.

    The network has two hidden layers of 64 units, each followed by a ReLU,
    and a linear output layer with ``action_size`` units (no activation).
    """

    def __init__(self, state_size, action_size):
        """Create the three linear layers.

        Args:
            state_size: Number of input features per state.
            action_size: Number of outputs (one Q-value per action).
        """
        super().__init__()
        hidden_units = 64
        # Layers are built input-to-output so parameter registration order
        # (and therefore seeded initialisation) is fc1, fc2, fc3.
        self.fc1 = nn.Linear(state_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, action_size)

    def forward(self, x):
        """Return the Q-values for the batch of states ``x``."""
        for hidden_layer in (self.fc1, self.fc2):
            x = F.relu(hidden_layer(x))
        # The output layer is left linear: Q-values are unbounded.
        return self.fc3(x)

# Replay Buffer for storing experience
class ReplayBuffer:
    """Fixed-capacity FIFO store of experiences with uniform random sampling.

    Once ``buffer_size`` experiences are held, adding another one discards the
    oldest. Experiences are stored as given; the buffer does not inspect them.
    """

    def __init__(self, buffer_size, batch_size):
        """Create an empty buffer.

        Args:
            buffer_size: Maximum number of experiences kept (positive int).
            batch_size: Number of experiences returned by ``sample()``.

        Raises:
            ValueError: If ``buffer_size`` is not positive. Such a buffer could
                never hold an experience, so fail here rather than on first use.
        """
        if buffer_size <= 0:
            raise ValueError(f"buffer_size must be positive, got {buffer_size}")
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        # A bounded deque drops its oldest entry in O(1) when full; the
        # equivalent list.pop(0) shifts every remaining element on each insert.
        self.buffer = deque(maxlen=buffer_size)

    def add(self, experience):
        """Append ``experience``, evicting the oldest entry if the buffer is full."""
        self.buffer.append(experience)

    def sample(self):
        """Return ``batch_size`` distinct stored experiences chosen uniformly at random.

        Raises:
            ValueError: If fewer than ``batch_size`` experiences are stored
                (raised by ``random.sample``).
        """
        return random.sample(self.buffer, self.batch_size)

    def __len__(self):
        """Return the number of experiences currently stored."""
        return len(self.buffer)

# Hyperparameters: training length and epsilon-greedy exploration schedule
num_episodes = 500
gamma = 0.99  # Discount applied to the bootstrapped next-state value
epsilon = 1.0  # Initial probability of taking a random action
epsilon_decay = 0.995  # Factor epsilon is multiplied by after each episode
min_epsilon = 0.01  # Lower bound epsilon is clamped to

# Environment: CartPole, with network dimensions read from its spaces
env = gym.make("CartPole-v1")
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Learner: Q-network trained with Adam on a mean-squared-error loss
q_network = QNetwork(state_size, action_size)
loss_fn = nn.MSELoss()
optimizer = optim.Adam(q_network.parameters(), lr=1e-3)

# Experience storage: up to 10,000 transitions, sampled 64 at a time
replay_buffer = ReplayBuffer(buffer_size=10_000, batch_size=64)

# Training loop: collect transitions with an epsilon-greedy policy and take one
# gradient step on a replayed minibatch after every non-final environment step.
for episode in range(num_episodes):
    # gym >= 0.26 returns (observation, info) from reset(); older releases
    # return the observation alone. Accept both so the script runs on either.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    state = np.reshape(state, [1, state_size])
    total_reward = 0

    for t in range(200):
        # Epsilon-greedy action selection.
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            # Pure inference: no autograd graph is needed to pick an action.
            with torch.no_grad():
                action = np.argmax(q_network(torch.Tensor(state)).numpy())

        # gym >= 0.26 returns (obs, reward, terminated, truncated, info);
        # older releases return (obs, reward, done, info).
        step_result = env.step(action)
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            next_state, reward, done, _ = step_result
            # The old API does not separate termination from truncation.
            terminated = done

        next_state = np.reshape(next_state, [1, state_size])
        # Store the termination flag as a float so it converts cleanly to a
        # tensor. Only true termination (not a time-limit cut-off) should
        # disable bootstrapping from the next state.
        replay_buffer.add((state, action, reward, next_state, float(terminated)))
        state = next_state
        total_reward += reward

        if done:
            break

        # Learn only once a full minibatch is available.
        if len(replay_buffer) >= replay_buffer.batch_size:
            batch = replay_buffer.sample()
            batch_states, batch_actions, batch_rewards, batch_next_states, batch_dones = zip(*batch)

            batch_states = torch.Tensor(np.vstack(batch_states))
            batch_actions = torch.LongTensor(batch_actions)
            batch_rewards = torch.Tensor(batch_rewards)
            batch_next_states = torch.Tensor(np.vstack(batch_next_states))
            batch_dones = torch.Tensor(batch_dones)

            # Q(s, a) for the actions that were actually taken.
            q_values = q_network(batch_states).gather(1, batch_actions.unsqueeze(1)).squeeze(1)

            # TD target r + gamma * max_a' Q(s', a'), zeroing the bootstrap
            # term for terminal transitions. The target is a constant for the
            # loss, so it is computed without tracking gradients.
            with torch.no_grad():
                next_q_values = q_network(batch_next_states).max(1)[0]
                target_q_values = batch_rewards + (gamma * next_q_values * (1 - batch_dones))

            loss = loss_fn(q_values, target_q_values)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Decay exploration once per episode, never going below min_epsilon.
    epsilon = max(min_epsilon, epsilon * epsilon_decay)
    print(f"Episode {episode+1}/{num_episodes}, Total Reward: {total_reward}")

# Evaluate the trained model: run one greedy episode (at most 200 steps),
# rendering each step. The environment is closed even if the rollout or
# rendering raises.
try:
    # gym >= 0.26 returns (observation, info) from reset(); older releases
    # return the observation alone.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    state = np.reshape(state, [1, state_size])
    for t in range(200):
        # Greedy action; inference only, so skip autograd bookkeeping.
        with torch.no_grad():
            action = np.argmax(q_network(torch.Tensor(state)).numpy())

        # gym >= 0.26 returns (obs, reward, terminated, truncated, info);
        # older releases return (obs, reward, done, info).
        step_result = env.step(action)
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            next_state, reward, done, _ = step_result

        env.render()
        state = np.reshape(next_state, [1, state_size])
        if done:
            break
finally:
    env.close()