In [None]:
import torch
import torch.nn as nn


class DQN(nn.Module):
    """Small convolutional Q-network: two conv+batch-norm stages and two dense layers.

    Args:
        d_in: Number of input channels.
        n_action: Number of outputs (one value per action).

    The first dense layer expects 9*9*32 flattened features, which is what the
    two convolutions produce for an 84x84 input.
    """

    def __init__(self, d_in, n_action):
        super().__init__()
        # Feature extractor: 8x8/stride-4 conv to 16 maps, then 4x4/stride-2 conv to 32 maps.
        self.conv1 = nn.Conv2d(d_in, 16, kernel_size=8, stride=4)
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=4, stride=2)
        self.bn2 = nn.BatchNorm2d(32)
        # Head: flattened features -> 256 hidden units -> one value per action.
        self.lr1 = nn.Linear(9 * 9 * 32, 256)
        self.output = nn.Linear(256, n_action)

    def forward(self, x):
        """Map a (batch, d_in, H, W) tensor to a (batch, n_action) tensor."""
        for conv, norm in ((self.conv1, self.bn1), (self.conv2, self.bn2)):
            x = norm(conv(x)).relu()
        hidden = self.lr1(x.view(x.size(0), -1)).relu()
        return self.output(hidden)


In [None]:
# Device used for the networks and training tensors in the cells below.
# The name is kept for compatibility; it falls back to the CPU when CUDA is
# not available so the notebook still runs (slowly) on machines without a GPU.
cuda0 = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# !pip install gym[atari,accept-rom-license]==0.21.0

In [None]:
from collections import deque
import numpy as np
import torch
from torch import optim
import cv2
import random


# Replay-buffer sizing and minibatch size used by Agent.
MAX_REPLAY_SIZE = 30_000  # capacity of the replay deque (oldest transitions are dropped)
MIN_REPLAY_SIZE = 20_000  # Agent.train() does nothing until this many transitions are stored
BATCH_SIZE = 64  # transitions sampled per training step


class Agent:
    """Epsilon-greedy DQN agent with a replay buffer and a target network.

    Args:
        env: Environment exposing ``observation_space.shape`` and
            ``action_space.n`` (both are read in ``__init__``).
        device: Optional torch device for the networks and training tensors.
            Defaults to the notebook-level ``cuda0``.
        max_replay_size: Optional replay capacity; defaults to ``MAX_REPLAY_SIZE``.
        min_replay_size: Optional number of stored transitions required before
            ``train`` starts updating; defaults to ``MIN_REPLAY_SIZE``.
        batch_size: Optional minibatch size; defaults to ``BATCH_SIZE``.
    """

    GAMMA = 0.97            # discount factor applied to the bootstrapped target
    TARGET_SYNC_EVERY = 50  # training steps between target-network refreshes
    EPSILON_MIN = 0.05      # exploration floor enforced by adaptEpsilon
    EPSILON_DECAY = 0.99    # multiplicative decay applied by adaptEpsilon

    def __init__(self, env, device=None, max_replay_size=None,
                 min_replay_size=None, batch_size=None):
        self.env = env
        self.state_h = env.observation_space.shape[0]
        self.state_w = env.observation_space.shape[1]
        self.action_n = env.action_space.n

        # Fall back to the notebook-level settings when nothing is passed in.
        self.device = cuda0 if device is None else device
        self.max_replay_size = MAX_REPLAY_SIZE if max_replay_size is None else max_replay_size
        self.min_replay_size = MIN_REPLAY_SIZE if min_replay_size is None else min_replay_size
        self.batch_size = BATCH_SIZE if batch_size is None else batch_size

        self.replay = deque(maxlen=self.max_replay_size)
        self.step = 0
        self.epsilon = 1

        # NOTE(review): both networks stay in train mode, so their BatchNorm
        # layers use batch statistics everywhere (including single-state
        # predict calls) - confirm this is intended.
        self.mdl = DQN(4, self.action_n).to(self.device)
        self.target_mdl = DQN(4, self.action_n).to(self.device)
        # Start the target as an exact copy of the online network instead of an
        # unrelated random initialisation.
        self.target_mdl.load_state_dict(self.mdl.state_dict())
        self.optimizer = optim.Adam(self.mdl.parameters(), lr=0.0001)

    def processImage(self, img):
        """Convert a colour frame to an 84x84 grayscale image.

        The frame is converted to grayscale, resized to 84 wide by 116 high,
        and rows 18..101 are kept.
        """
        # NOTE(review): Gym frames are presumably RGB, in which case
        # COLOR_RGB2GRAY would be the matching conversion; left unchanged so
        # preprocessing stays consistent with previously saved checkpoints.
        img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        img_gray = cv2.resize(img_gray, (84, 116))
        return img_gray[18:102]

    def predict(self, state):
        """Choose an action epsilon-greedily.

        Args:
            state: Array-like input for the online network; the training loop
                passes a single stacked observation with a leading batch axis.

        Returns:
            int: A uniformly random action with probability ``epsilon``,
            otherwise the index of the largest network output.
        """
        if random.uniform(0, 1) <= self.epsilon:
            return random.randrange(self.action_n)
        with torch.no_grad():
            state_t = torch.tensor(state, dtype=torch.float, device=self.device)
            # Call the module itself (not .forward) so registered hooks run.
            vals = self.mdl(state_t)
            # argmax over the flattened output equals the action index for a
            # single-state batch.
            return torch.argmax(vals).item()

    def collect_experience(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.replay.append((state, action, reward, next_state, done))

    def train(self):
        """Run one optimisation step on a random minibatch from the replay buffer.

        Returns:
            ``0`` while fewer than ``min_replay_size`` transitions are stored;
            otherwise the minibatch mean-squared TD error as a detached 0-dim
            tensor, so callers can accumulate it without keeping the autograd
            graph alive.
        """
        if len(self.replay) < self.min_replay_size:
            return 0

        s, a, r, n_s, done = zip(*random.sample(self.replay, self.batch_size))
        dev = self.device
        # Each stored state carries a leading batch axis of 1, so concatenating
        # along axis 0 yields the minibatch.
        states = torch.tensor(np.concatenate(s), dtype=torch.float, device=dev)
        next_states = torch.tensor(np.concatenate(n_s), dtype=torch.float, device=dev)
        actions = torch.tensor(a, dtype=torch.int64, device=dev)
        rewards = torch.tensor(r, dtype=torch.float, device=dev)
        dones = torch.tensor(done, dtype=torch.float, device=dev)

        # Q(s, a) of the actions that were actually taken.
        q_vals_selected = self.mdl(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # The bootstrapped target is a constant w.r.t. the online parameters,
        # so no graph is needed for the target network's forward pass.
        with torch.no_grad():
            next_q_vals_max = self.target_mdl(next_states).max(1)[0]
            # (1 - dones) removes the bootstrap term for terminal transitions.
            G = rewards + self.GAMMA * next_q_vals_max * (1 - dones)

        loss = (q_vals_selected - G).pow(2).mean()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.step += 1
        if self.step % self.TARGET_SYNC_EVERY == 0:
            self.target_mdl.load_state_dict(self.mdl.state_dict())

        return loss.detach()

    def adaptEpsilon(self):
        """Decay epsilon by ``EPSILON_DECAY`` without dropping below ``EPSILON_MIN``.

        A value already at or below the floor (e.g. set manually to 0 for
        evaluation) is left untouched.
        """
        if self.epsilon > self.EPSILON_MIN:
            self.epsilon = max(self.EPSILON_MIN, self.epsilon * self.EPSILON_DECAY)


In [None]:
import gym
import time
import json

In [None]:
# Create the Pong environment and a fresh agent bound to it.
env = gym.make('PongDeterministic-v4')
agt = Agent(env)

# Optional: restore a saved checkpoint into both networks and set epsilon to 0
# so predict() stops taking random exploratory actions (uncomment to use).
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
# agt.mdl.load_state_dict(torch.load('pong_cnn_700.pkl'))
# agt.target_mdl.load_state_dict(torch.load('pong_cnn_700.pkl'))
# agt.epsilon = 0

In [None]:
# agt.epsilon = 0.1

In [None]:
# Training driver: plays up to 1000 episodes, stores every transition, runs one
# training step per environment step, and checkpoints every 50 episodes.
total_step = 0                      # environment steps across all episodes
all_rewards = []                    # total reward of every finished episode
last_50_rewards = deque(maxlen=50)  # window used for the running average
avg_rewards = []                    # running average after every episode

t0 = time.time()
for episode in range(1000):
    # agt.replay.clear()
    startT = time.time()

    # Start each episode with the first frame repeated four times, plus a
    # leading batch axis of size 1.
    state = agt.processImage(env.reset())
    state = np.stack((state, state, state, state))[None, :]
    total_r = 0
    total_loss = torch.tensor([0.0], dtype=torch.float, device=cuda0)
    for step in range(100000):
        # env.render()

        action = agt.predict(state)
        next_state, reward, done, _ = env.step(action)
        next_state = agt.processImage(next_state)
        # Shift the frame history: the newest frame goes to slot 0 and the
        # previous three frames move back one slot (the oldest is dropped).
        # Re-using slots 1..3 unchanged would keep the episode's first frame
        # in them forever, so the stack would carry no motion information.
        next_state = np.stack((next_state, state[0][0], state[0][1], state[0][2]))[None, :]
        agt.collect_experience(state, action, reward, next_state, done)
        state = next_state

        loss = agt.train()
        # train() returns 0 during warm-up and a loss tensor afterwards; detach
        # so the running sum does not keep autograd graph nodes alive.
        total_loss += loss.detach() if torch.is_tensor(loss) else loss
        total_r += reward
        total_step += 1
        if total_step % 2000 == 0:
            agt.adaptEpsilon()

        if done:
            finishT = time.time()
            all_rewards.append(total_r)
            last_50_rewards.append(total_r)
            avg_r = np.mean(last_50_rewards)
            avg_rewards.append(avg_r)
            if (episode+1) % 50 == 0:
                # Save the online network's weights plus the training statistics.
                w_path = './pong_cnn_' + str(episode+1) + '.pkl'
                epsilon_path = './pong_cnn_' + str(episode+1) + '.json'
                epsilon_dict = {'epsilon': agt.epsilon, 'rewards': all_rewards, 'average': avg_rewards}
                torch.save(agt.mdl.state_dict(), w_path)
                # Context manager closes the file even if json.dump raises.
                with open(epsilon_path, 'w') as f_epsilon:
                    json.dump(epsilon_dict, f_epsilon)

            print("Total step:", total_step)
            print("Episode", episode, "    Training time:", finishT - startT, " Total time elapsed:", finishT - t0)
            print("Reward:", total_r, " Average reward:", avg_r, " Loss:", total_loss.item(), " Epsilon:", agt.epsilon)

            break

Total step: 901
Episode 0     Training time: 0.6121261119842529  Total time elapsed: 0.6126282215118408
Reward: -20.0  Average reward: -20.0  Loss: 0.0  Epsilon: 1
Total step: 1786
Episode 1     Training time: 0.5781936645507812  Total time elapsed: 1.196399211883545
Reward: -21.0  Average reward: -20.5  Loss: 0.0  Epsilon: 1
Total step: 2674
Episode 2     Training time: 0.5843582153320312  Total time elapsed: 1.7845466136932373
Reward: -20.0  Average reward: -20.333333333333332  Loss: 0.0  Epsilon: 0.99
Total step: 3661
Episode 3     Training time: 0.667597770690918  Total time elapsed: 2.4559271335601807
Reward: -19.0  Average reward: -20.0  Loss: 0.0  Epsilon: 0.99
Total step: 4595
Episode 4     Training time: 0.6381363868713379  Total time elapsed: 3.098174810409546
Reward: -20.0  Average reward: -20.0  Loss: 0.0  Epsilon: 0.9801
Total step: 5572
Episode 5     Training time: 0.6534976959228516  Total time elapsed: 3.7556774616241455
Reward: -20.0  Average reward: -20.0  Loss: 0.0  