<a href="https://colab.research.google.com/github/OneFineStarstuff/State-of-the-Art/blob/main/Deep_Q_Networks_(DQN).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import gym
import random
import numpy as np
from collections import deque

class DQN(nn.Module):
    """Fully connected Q-network: state vector in, one Q-value per action out.

    The network is two hidden ReLU layers followed by a linear output layer
    with no activation, so the Q-value estimates are unbounded.

    Args:
        state_size: Number of features in the input state vector.
        action_size: Number of discrete actions (width of the output layer).
        hidden_size: Width of both hidden layers. Defaults to 24, the value
            that used to be hard-coded.
    """

    def __init__(self, state_size, action_size, hidden_size=24):
        super().__init__()
        # Attribute names fc1/fc2/fc3 are kept so existing state_dicts load.
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        """Return Q-value estimates for ``x`` (last dim must be ``state_size``)."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

def _reset_env(env):
    """Reset ``env`` and return only the observation, for either Gym reset API."""
    result = env.reset()
    # Gym >= 0.26 returns (observation, info); older versions return the
    # observation alone.
    if isinstance(result, tuple) and len(result) == 2 and isinstance(result[1], dict):
        return result[0]
    return result


def _step_env(env, action):
    """Step ``env`` and return ``(observation, reward, terminated, truncated)``."""
    result = env.step(action)
    if len(result) == 5:
        # Gym >= 0.26: termination and truncation are reported separately.
        observation, reward, terminated, truncated, _ = result
    else:
        # Older Gym: a single ``done`` flag; the TimeLimit wrapper marks
        # time-limit endings with info["TimeLimit.truncated"].
        observation, reward, done, info = result
        truncated = bool(done) and isinstance(info, dict) and bool(
            info.get("TimeLimit.truncated", False)
        )
        terminated = bool(done) and not truncated
    return observation, reward, bool(terminated), bool(truncated)


def train_dqn(env, model, episodes=1000, *, lr=0.001, gamma=0.99,
              epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01,
              buffer_size=2000, batch_size=64):
    """Train ``model`` on ``env`` with epsilon-greedy DQN and experience replay.

    After every environment step, once the replay buffer holds more than
    ``batch_size`` transitions, one Adam update is taken on the MSE between
    Q(s, a) and ``r + gamma * max_a' Q(s', a')``. Targets are computed with
    the same network that is being trained (there is no separate target
    network). One line with the episode's total reward is printed per episode.

    Both the 4-tuple (Gym < 0.26) and 5-tuple (Gym >= 0.26 / Gymnasium)
    ``step`` APIs are accepted. An episode ends on termination or truncation,
    but only true termination removes the bootstrap term from the target.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` and
            ``action_space.sample()`` with discrete integer actions.
        model: ``nn.Module`` mapping a float32 state tensor to per-action
            Q-values; updated in place.
        episodes: Number of episodes to run.
        lr: Adam learning rate.
        gamma: Discount factor.
        epsilon: Initial exploration probability.
        epsilon_decay: Factor applied to epsilon after each episode.
        epsilon_min: Lower bound for epsilon.
        buffer_size: Maximum number of transitions kept in the replay buffer.
        batch_size: Number of transitions sampled per update.

    Raises:
        ValueError: If ``buffer_size`` is not larger than ``batch_size``, in
            which case no update could ever be triggered.
    """
    if buffer_size <= batch_size:
        raise ValueError(
            f"buffer_size ({buffer_size}) must be larger than batch_size ({batch_size})"
        )

    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.MSELoss()
    replay_buffer = deque(maxlen=buffer_size)

    for episode in range(episodes):
        state = torch.tensor(_reset_env(env), dtype=torch.float32)
        total_reward = 0
        episode_over = False

        while not episode_over:
            # Epsilon-greedy action selection.
            if random.random() < epsilon:
                action = env.action_space.sample()
            else:
                with torch.no_grad():
                    action = torch.argmax(model(state)).item()

            next_obs, reward, terminated, truncated = _step_env(env, action)
            next_state = torch.tensor(next_obs, dtype=torch.float32)
            total_reward += reward
            episode_over = terminated or truncated

            # Store ``terminated`` (not ``episode_over``): a time-limit cut-off
            # is not a terminal state, so its target must still bootstrap.
            replay_buffer.append((state, action, reward, next_state, terminated))
            state = next_state

            if len(replay_buffer) > batch_size:
                minibatch = random.sample(replay_buffer, batch_size)
                states, actions, rewards, next_states, terminals = zip(*minibatch)
                states = torch.stack(states)
                next_states = torch.stack(next_states)
                # Explicit dtypes keep targets float32 regardless of the
                # Python/NumPy scalar types the environment returns.
                actions = torch.tensor(actions, dtype=torch.int64).unsqueeze(1)
                rewards = torch.tensor(rewards, dtype=torch.float32).unsqueeze(1)
                terminals = torch.tensor(terminals, dtype=torch.bool).unsqueeze(1)

                with torch.no_grad():
                    next_q = model(next_states).max(dim=1, keepdim=True)[0]
                    # logical_not is used because ``1 - bool_tensor`` is not
                    # supported by PyTorch.
                    target_q = rewards + torch.logical_not(terminals) * gamma * next_q

                q_values = model(states).gather(1, actions)

                loss = loss_fn(q_values, target_q)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        epsilon = max(epsilon_min, epsilon * epsilon_decay)
        print(f"Episode {episode+1}, Total Reward: {total_reward}")

# Create the CartPole environment, size the Q-network from its observation
# and action spaces, then run training with the default settings.
env = gym.make('CartPole-v1')
model = DQN(
    state_size=env.observation_space.shape[0],
    action_size=env.action_space.n,
)
train_dqn(env=env, model=model)