<a href="https://colab.research.google.com/github/OneFineStarstuff/State-of-the-Art/blob/main/Deep_Q_Learning_(DQN)_with_PyTorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np

class DQN(nn.Module):
    """Fully connected Q-network: two ReLU hidden layers of 24 units each.

    Args:
        state_size: Number of input features per observation.
        action_size: Number of outputs (one Q-value per action).
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        hidden_units = 24
        self.fc1 = nn.Linear(state_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, action_size)

    def forward(self, x):
        """Return the raw (unactivated) output of the last layer for ``x``."""
        activations = x
        # Both hidden layers are followed by ReLU; the output layer is linear.
        for hidden_layer in (self.fc1, self.fc2):
            activations = torch.relu(hidden_layer(activations))
        return self.fc3(activations)

def train_dqn(env, episodes=1000):
    """Train a DQN agent on ``env`` with epsilon-greedy exploration and replay.

    Hyperparameters (learning rate, discount, epsilon schedule, batch size and
    the 500-step cap per episode) are fixed inside the function.

    Args:
        env: Environment exposing ``observation_space.shape``,
            ``action_space.n``, ``action_space.sample()``, ``reset()`` and a
            ``step(action)`` that returns the five-tuple
            ``(obs, reward, terminated, truncated, info)``. ``reset()`` may
            return either the observation or an ``(obs, info)`` tuple.
        episodes: Number of episodes to run.

    Returns:
        The trained ``DQN`` model.
    """
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    model = DQN(state_size, action_size)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.MSELoss()
    gamma = 0.99
    epsilon = 1.0
    epsilon_decay = 0.995
    epsilon_min = 0.01
    batch_size = 32
    max_steps = 500
    memory = []

    def remember(state, action, reward, next_state, terminal):
        """Append one transition to the replay buffer."""
        memory.append((state, action, reward, next_state, terminal))

    def act(state):
        """Pick a random action with probability ``epsilon``, else the greedy one."""
        if np.random.rand() <= epsilon:
            return env.action_space.sample()
        # Action selection needs no gradients.
        with torch.no_grad():
            q_values = model(torch.FloatTensor(state))
        return int(torch.argmax(q_values).item())

    def replay():
        """Run one gradient step per transition of a random minibatch."""
        # Names are deliberately distinct from the episode loop's variables:
        # reusing `state` here would overwrite the live environment state.
        for s, a, r, s_next, terminal in random.sample(memory, batch_size):
            target = r
            if not terminal:
                with torch.no_grad():
                    next_q = model(torch.FloatTensor(s_next))
                target = r + gamma * next_q.max().item()
            prediction = model(torch.FloatTensor(s))
            # The regression target is a detached copy of the prediction with
            # only the taken action's entry replaced, so the loss is non-zero
            # for that entry alone.
            target_q = prediction.detach().clone()
            target_q[0][a] = target
            loss = criterion(prediction, target_q)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    model.train()
    for e in range(episodes):
        reset_result = env.reset()
        # Some gym versions return (obs, info) from reset(); keep only obs.
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        state = np.reshape(state, [1, state_size])
        for step in range(max_steps):
            action = act(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            # Penalise only genuine failure. A time-limit truncation is not a
            # failure, so it keeps its reward and still bootstraps in replay.
            if terminated:
                reward = -10
            next_state = np.reshape(next_state, [1, state_size])
            remember(state, action, reward, next_state, terminated)
            state = next_state
            if done:
                print(f"Episode {e}/{episodes}, score: {step}")
                break
            if len(memory) > batch_size:
                replay()
        if epsilon > epsilon_min:
            epsilon *= epsilon_decay
    return model

# Create the CartPole-v1 environment.
# NOTE(review): new_step_api=True presumably selects the five-value step return
# (obs, reward, terminated, truncated, info) that train_dqn unpacks — confirm
# against the installed gym version, since this keyword is version-specific.
env = gym.make('CartPole-v1', new_step_api=True)
# Train with train_dqn's default episode count (episodes=1000).
train_dqn(env)