In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import gymnasium as gym
import random
from collections import deque

#### Define the neural network

In [None]:
class DQN(nn.Module):
    """Fully connected Q-network: two 128-unit ReLU hidden layers and a linear head.

    Args:
        n_obs: size of the observation vector fed to the first layer.
        n_act: number of outputs, one value estimate per action.
    """

    def __init__(self, n_obs, n_act):
        super().__init__()
        hidden = 128  # width shared by both hidden layers
        self.l1 = nn.Linear(n_obs, hidden)
        self.l2 = nn.Linear(hidden, hidden)
        self.l3 = nn.Linear(hidden, n_act)

    def forward(self, x):
        """Return the raw output-layer activations (no final non-linearity) for ``x``."""
        for hidden_layer in (self.l1, self.l2):
            x = F.relu(hidden_layer(x))
        return self.l3(x)

#### Initialize memory, environment and neural network

In [None]:
# Replay memory of (state, action, reward, next_state, done) tuples; once 2000
# entries are stored the deque discards the oldest one on each append.
memory = deque(maxlen=2000)

env = gym.make("CartPole-v1", render_mode="rgb_array")
state, info = env.reset()

# Network dimensions come from the first observation and the action space.
n_obs = len(state)
n_act = env.action_space.n

device = torch.device("cpu")
net = DQN(n_obs, n_act)  # the Deep Q-Network
optimizer = optim.Adam(net.parameters(), lr=0.001)

#### Define the parameters

In [None]:
# --- Q-learning target ---
gamma = 0.95  # discount rate applied to the bootstrapped next-state value

# --- Epsilon-greedy exploration ---
epsilon = 1.0  # exploration rate: probability of taking a random action
eps_decay = 0.996  # factor epsilon is multiplied by after each training call
eps_min = 0.01  # intended lower bound for epsilon

# --- Training schedule ---
episodes = 10_000
batch_size = 128  # number of transitions sampled for training after an episode
max_mem_size = 2_000  # replay-memory capacity setting
total_reward = 0.0  # running sum of episode scores

#### Define prediction and training functions

In [None]:
def act(state):
    """Choose an action for ``state`` with an epsilon-greedy policy.

    With probability given by the module-level ``epsilon`` a random action is
    sampled from ``env.action_space``; otherwise the index of the largest
    network output for ``state`` is returned.

    Args:
        state: observation that ``torch.tensor`` can convert to a float tensor.

    Returns:
        The selected action: either a sampled action or the arg-max index
        as a Python ``int``.
    """
    if random.random() <= epsilon:
        return env.action_space.sample()
    # Inference only: no gradients are needed to pick the arg-max action, so
    # skip building an autograd graph for this forward pass.
    with torch.no_grad():
        act_val = net(torch.tensor(state, dtype=torch.float))
    return torch.argmax(act_val).item()

def train(memory, batch_size, epsilon, eps_decay, gamma):
    """Fit the module-level ``net`` on a random minibatch, one sample at a time.

    For every sampled transition the regression target for the taken action is
    ``reward`` when ``done`` is set, otherwise
    ``reward + gamma * max(net(next_state))``. The targets for all other
    actions equal the current predictions, so only the taken action
    contributes to the loss. One ``optimizer`` step is performed per sample.

    Args:
        memory: sequence of ``(state, action, reward, next_state, done)``
            tuples; must contain at least ``batch_size`` entries.
        batch_size: number of transitions drawn without replacement.
        epsilon: current exploration rate.
        eps_decay: multiplicative decay applied to ``epsilon``.
        gamma: discount rate for the bootstrapped next-state value.

    Returns:
        The decayed exploration rate ``epsilon * eps_decay``.

    Raises:
        ValueError: from ``random.sample`` if ``memory`` holds fewer than
            ``batch_size`` entries.
    """
    minibatch = random.sample(memory, batch_size)
    for state, action, reward, next_state, done in minibatch:
        target = reward
        if not done:
            # The bootstrap value is a constant of the regression problem, so
            # it is computed without tracking gradients.
            with torch.no_grad():
                next_q = net(torch.tensor(next_state, dtype=torch.float))
            target = reward + gamma * torch.max(next_q).item()

        optimizer.zero_grad()
        # Single tracked forward pass; the target vector is a detached copy of
        # it with only the taken action's entry replaced.
        prediction = net(torch.tensor(state, dtype=torch.float))
        target_f = prediction.detach().clone()
        target_f[action] = target
        loss = F.mse_loss(prediction, target_f)
        loss.backward()
        optimizer.step()
    return epsilon * eps_decay
        

#### Execute the loop

In [None]:
# Training loop: play one episode with the epsilon-greedy policy, store every
# transition in the replay memory, then train on a sampled minibatch.
for e in range(1, episodes + 1):  # inclusive upper bound so `episodes` episodes run
    state, _ = env.reset()
    state = np.array(state)
    score = 0.
    done = False
    while not done:
        action = act(state)
        # Gymnasium's step returns (obs, reward, terminated, truncated, info).
        # The episode must stop on either flag, otherwise it runs past the
        # environment's time limit.
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        # Penalise only genuine failures; reaching the time limit is not a failure.
        reward = reward if not terminated else -10
        score += float(reward)
        # Store `terminated` as the done flag so the training target still
        # bootstraps from next_state when the episode was merely truncated.
        memory.append((state, action, reward, next_state, terminated))
        state = next_state
    # Add the score before reporting so the mean covers all `e` episodes.
    total_reward += score
    if e % 100 == 0:
        print("Episode:", e, "Mean Score:", total_reward / e)
    if len(memory) > batch_size:
        # Keep exploration from decaying below the configured floor.
        epsilon = max(eps_min, train(memory, batch_size, epsilon, eps_decay, gamma))

#### Evaluate the model

In [None]:
# Release the environment currently bound to `env` before replacing it with a
# rendering one.
env.close()
env = gym.make("CartPole-v1", render_mode="human")

# Play episodes until the cell is interrupted, printing each episode's score.
try:
    while True:
        state, _ = env.reset()
        state = np.array(state)
        score = 0.
        done = False
        while not done:
            action = act(state)
            # Stop on termination OR truncation so an episode cannot run past
            # the environment's time limit.
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            reward = reward if not terminated else -10
            score += float(reward)
            state = next_state
        print("Score:", score)
except KeyboardInterrupt:
    # Interrupting the kernel is the intended way to stop watching the agent.
    pass
finally:
    # Always close the render window, even when the loop is interrupted.
    env.close()