<a href="https://colab.research.google.com/github/OneFineStarstuff/State-of-the-Art/blob/main/Deep_Q_Networks_(DQN).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
from collections import deque

# Define the neural network
class DQN(nn.Module):
    """Fully connected Q-value network.

    Maps a state vector of size ``state_dim`` through two ReLU-activated
    hidden layers of 128 units each to one output per action
    (``action_dim`` values).
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        hidden_units = 128
        # Layers are created in input -> output order; attribute names are
        # what end up as the keys of ``state_dict()``.
        self.fc1 = nn.Linear(in_features=state_dim, out_features=hidden_units)
        self.fc2 = nn.Linear(in_features=hidden_units, out_features=hidden_units)
        self.fc3 = nn.Linear(in_features=hidden_units, out_features=action_dim)

    def forward(self, x):
        """Return the network output for ``x``; the last layer has no activation."""
        hidden = nn.functional.relu(self.fc1(x))
        hidden = nn.functional.relu(self.fc2(hidden))
        return self.fc3(hidden)

# Hyperparameters
gamma = 0.99           # discount applied to the bootstrapped next-state value
epsilon_start = 1.0    # exploration rate used for the first episode
epsilon_end = 0.01     # lower bound the exploration rate is clamped to
epsilon_decay = 0.995  # multiplicative decay applied to epsilon after each episode
lr = 1e-3              # learning rate handed to Adam
batch_size = 64        # transitions drawn from replay memory per update
memory_size = 10_000   # maximum number of transitions kept in replay memory
target_update = 10     # the target network is synced every this many episodes
num_episodes = 500     # total number of training episodes

# Environment; the network's input and output sizes are read from its spaces
env = gym.make('CartPole-v1')
state_dim, action_dim = env.observation_space.shape[0], env.action_space.n

# Replay memory: bounded deque, so the oldest transitions are dropped when full
memory = deque([], memory_size)

# Online network plus a copy with identical initial weights, kept in eval mode
policy_net = DQN(state_dim, action_dim)
target_net = DQN(state_dim, action_dim)
target_net.load_state_dict(policy_net.state_dict())
target_net.train(False)

# Only the online network's parameters are given to the optimizer
optimizer = optim.Adam(policy_net.parameters(), lr=lr)

# Epsilon-greedy policy
def select_action(state, epsilon):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    With probability ``epsilon`` a uniformly random action index in
    ``range(action_dim)`` is returned; otherwise the index of the largest
    ``policy_net`` output for ``state`` is returned as a Python int.
    """
    explore = random.random() < epsilon
    if explore:
        return random.randrange(action_dim)

    # Greedy branch: no gradients are needed just to choose an action.
    with torch.no_grad():
        q_values = policy_net(state)
    return q_values.argmax().item()

# Training function
def optimize_model():
    """Run one gradient step on ``policy_net`` using a random replay minibatch.

    Returns immediately (doing nothing) while ``memory`` holds fewer than
    ``batch_size`` transitions. Each stored transition is a tuple
    ``(state, action, next_state, reward, done)`` in which both states are
    tensors with a leading batch dimension of size 1.

    The regression target for each sampled transition is
    ``reward + gamma * max_a target_net(next_state)[a]``, with the bootstrap
    term zeroed when ``done`` is true. The loss is the mean squared error
    between that target and ``policy_net``'s output for the action taken.
    """
    if len(memory) < batch_size:
        return

    transitions = random.sample(memory, batch_size)
    states, actions, next_states, rewards, dones = zip(*transitions)

    # Stored states already carry a batch axis of size 1, so concatenating
    # along dim 0 gives one row per transition without a NumPy round trip.
    batch_state = torch.cat(states, dim=0).float()
    batch_next_state = torch.cat(next_states, dim=0).float()

    # Scalars become column vectors so they line up with gather()/the target.
    batch_action = torch.tensor(np.array(actions), dtype=torch.long).view(-1, 1)
    batch_reward = torch.tensor(np.array(rewards), dtype=torch.float32).view(-1, 1)
    batch_done = torch.tensor(np.array(dones), dtype=torch.float32).view(-1, 1)

    # Q(s, a) of the online network for the actions that were actually taken.
    current_q_values = policy_net(batch_state).gather(1, batch_action)

    # The target is treated as a constant: skip graph construction entirely
    # instead of building a graph and detaching afterwards.
    with torch.no_grad():
        next_q_values = target_net(batch_next_state).max(dim=1, keepdim=True).values
        target_q_values = batch_reward + gamma * next_q_values * (1 - batch_done)

    loss = nn.functional.mse_loss(current_q_values, target_q_values)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

def _as_state_tensor(observation):
    """Copy one observation into a float32 tensor with a leading batch axis."""
    # Going through a single ndarray avoids the slow list-of-arrays path of
    # torch.tensor([...]); torch.tensor copies, so the env's buffer is not aliased.
    return torch.tensor(np.asarray(observation), dtype=torch.float32).unsqueeze(0)


def _reset_env(environment):
    """Reset ``environment`` and return only the initial observation.

    Newer Gym releases return ``(observation, info)`` from ``reset()`` while
    older ones return the bare observation; both shapes are accepted.
    """
    outcome = environment.reset()
    if isinstance(outcome, tuple) and len(outcome) == 2 and isinstance(outcome[1], dict):
        return outcome[0]
    return outcome


def _step_env(environment, action):
    """Step ``environment`` and normalise the result across Gym API versions.

    Returns ``(observation, reward, terminal, episode_over)``. ``terminal`` is
    the flag stored in replay memory (it switches off bootstrapping) and
    ``episode_over`` tells the caller to stop the episode.
    """
    outcome = environment.step(action)
    if len(outcome) == 5:
        observation, reward, terminated, truncated, _ = outcome
        # A time-limit truncation ends the episode but is not a true terminal
        # state, so only `terminated` is recorded as terminal.
        return observation, reward, bool(terminated), bool(terminated or truncated)
    observation, reward, finished, _ = outcome
    return observation, reward, bool(finished), bool(finished)


# Training loop
epsilon = epsilon_start
try:
    for episode in range(num_episodes):
        state = _as_state_tensor(_reset_env(env))
        total_reward = 0
        done = False

        while not done:
            action = select_action(state, epsilon)
            next_state, reward, terminal, done = _step_env(env, action)
            next_state = _as_state_tensor(next_state)
            memory.append((state, action, next_state, reward, terminal))

            state = next_state
            total_reward += reward

            # One optimisation step per environment step.
            optimize_model()

        # Periodically copy the online weights into the target network.
        if episode % target_update == 0:
            target_net.load_state_dict(policy_net.state_dict())

        # Decay exploration once per episode, never below epsilon_end.
        epsilon = max(epsilon_end, epsilon_decay * epsilon)
        print(f"Episode {episode+1}/{num_episodes}, Total Reward: {total_reward}")
finally:
    # Release the environment even if training is interrupted or raises.
    env.close()