# Install Library

In [1]:
!pip install gym torch numpy matplotlib

Collecting gym
  Downloading gym-0.26.2.tar.gz (721 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m721.7/721.7 kB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting torch
  Downloading torch-2.2.2-cp38-cp38-manylinux1_x86_64.whl.metadata (25 kB)
Collecting numpy
  Using cached numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting matplotlib
  Using cached matplotlib-3.7.5-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.7 kB)
Collecting cloudpickle>=1.2.0 (from gym)
  Downloading cloudpickle-3.0.0-py3-none-any.whl.metadata (7.0 kB)
Collecting gym-notices>=0.0.4 (from gym)
  Downloading gym_notices-0.0.8-py3-none-any.whl.metadata (1.0 kB)
Collecting filelock (from torch)
  Downloading filelock-3.13.3-py3-none-any.

# Import Library

In [2]:
import gym
import math
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque

# Define Model

In [3]:
class DQN(nn.Module):
    def __init__(self, state_size, action_size):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(state_size, 24)
        self.fc2 = nn.Linear(24, 24)
        self.fc3 = nn.Linear(24, action_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

# Define Agent

In [4]:
class Agent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95  # discount factor
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = DQN(state_size, action_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        state = torch.FloatTensor(state).unsqueeze(0)
        if random.random() > self.epsilon:
            with torch.no_grad():
                action_values = self.model(state)
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.randrange(self.action_size)

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            state = torch.FloatTensor(state)
            next_state = torch.FloatTensor(next_state)
            action = torch.LongTensor([action])
            reward = torch.FloatTensor([reward])
            done = torch.FloatTensor([done])

            Q_values = self.model(state).gather(1, action.unsqueeze(1)).squeeze(1)
            Q_values_next = self.model(next_state).detach().max(1)[0]
            target = reward + (self.gamma * Q_values_next * (1 - done))
            
            loss = F.mse_loss(Q_values, target)
            
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

# Train

In [6]:
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = Agent(state_size, action_size)
episodes = 1000
batch_size = 32

for e in range(episodes):
    state = env.reset()
    print(state)
    # state = np.reshape(state, [1, state_size])

    # for time in range(500):  # maximum time per episode
    #     action = agent.act(state)
    #     next_state, reward, done, _ = env.step(action)
    #     reward = reward if not done else -10
    #     next_state = np.reshape(next_state, [1, state_size])
    #     agent.remember(state, action, reward, next_state, done)
    #     state = next_state

    #     if done:
    #         print(f"Episode: {e}/{episodes}, Score: {time}, Epsilon: {agent.epsilon:.2}")
    #         break

    #     if len(agent.memory) > batch_size:
    #         agent.replay(batch_size)

(array([0.03686879, 0.01562724, 0.04996768, 0.02209998], dtype=float32), {})
(array([ 0.01496381, -0.01645143, -0.01406989,  0.04985406], dtype=float32), {})
(array([ 0.01815988, -0.00316581,  0.01886437, -0.0384768 ], dtype=float32), {})
(array([-0.00275258,  0.02492743,  0.02190015,  0.00340517], dtype=float32), {})
(array([-3.8428195e-02,  3.3804096e-02,  8.3485582e-05,  1.1303523e-02],
      dtype=float32), {})
(array([ 0.02940726, -0.01462459,  0.00430676, -0.01683379], dtype=float32), {})
(array([-0.03391486, -0.03779278,  0.04044477,  0.01513895], dtype=float32), {})
(array([0.02667845, 0.03698562, 0.04669544, 0.01390657], dtype=float32), {})
(array([ 0.03644611,  0.03094756, -0.04515837, -0.03733495], dtype=float32), {})
(array([ 0.01313856,  0.01378965, -0.04146266, -0.04755721], dtype=float32), {})
(array([ 0.0280135 , -0.02997413,  0.02774526, -0.02225336], dtype=float32), {})
(array([ 0.03168975, -0.04751481,  0.02630828, -0.01322251], dtype=float32), {})
(array([-0.0221573