In [1]:
# Mount the user's Google Drive at /content/gdrive. The call is interactive:
# it prints an authorization URL and waits for the code to be entered.
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
import os
import gc
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation notices from
# numpy / torch / gym that can point at real compatibility problems.
warnings.filterwarnings("ignore")
# Redundant with the blanket filter above: DeprecationWarning is already
# ignored by it.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [3]:
# Switch to the project folder on the mounted Drive and list what it contains.
PROJECT_DIR = '/content/gdrive/My Drive/Colab Notebooks/PongwithDQN'
os.chdir(PROJECT_DIR)
os.listdir()

['lib', 'runs', 'PongNoFrameskip-v4-best.dat', 'Pong.ipynb']

In [0]:
from lib import wrappers
from lib import dqn_model

In [5]:
# Install tensorboardX (source of the SummaryWriter imported below).
# `%pip` is spelled out so the cell does not depend on automagic being enabled,
# and `-q` keeps the download progress bars out of the saved notebook output.
%pip install -q tensorboardX

Collecting tensorboardX
[?25l  Downloading https://files.pythonhosted.org/packages/35/f1/5843425495765c8c2dd0784a851a93ef204d314fc87bcc2bbb9f662a3ad1/tensorboardX-2.0-py2.py3-none-any.whl (195kB)
[K     |█▊                              | 10kB 26.6MB/s eta 0:00:01[K     |███▍                            | 20kB 6.4MB/s eta 0:00:01[K     |█████                           | 30kB 7.4MB/s eta 0:00:01[K     |██████▊                         | 40kB 5.9MB/s eta 0:00:01[K     |████████▍                       | 51kB 6.8MB/s eta 0:00:01[K     |██████████                      | 61kB 8.0MB/s eta 0:00:01[K     |███████████▊                    | 71kB 7.9MB/s eta 0:00:01[K     |█████████████▍                  | 81kB 8.2MB/s eta 0:00:01[K     |███████████████                 | 92kB 9.1MB/s eta 0:00:01[K     |████████████████▊               | 102kB 8.4MB/s eta 0:00:01[K     |██████████████████▍             | 112kB 8.4MB/s eta 0:00:01[K     |████████████████████            | 122kB 8.

In [0]:
import argparse
import time
import numpy as np
import collections

import torch
import torch.nn as nn
import torch.optim as optim

from tensorboardX import SummaryWriter

In [0]:
# --- Environment and stopping criterion ---------------------------------
DEFAULT_ENV_NAME = "PongNoFrameskip-v4"
# Training stops once the running mean reward exceeds this bound.
# A stricter alternative is 19.5.
MEAN_REWARD_BOUND = 10

# --- Learning -------------------------------------------------------------
GAMMA = 0.99                 # discount factor applied to the next-state value
BATCH_SIZE = 32              # transitions drawn from the buffer per update
LEARNING_RATE = 1e-4         # optimizer learning rate
SYNC_TARGET_FRAMES = 1000    # frames between copies of training weights into the target network

# --- Replay buffer --------------------------------------------------------
REPLAY_SIZE = 10000          # maximum capacity of the buffer
REPLAY_START_SIZE = 10000    # transitions required in the buffer before updates begin

# --- Exploration ----------------------------------------------------------
# Epsilon falls linearly from EPSILON_START and is floored at EPSILON_FINAL;
# EPSILON_DECAY_LAST_FRAME is the divisor applied to the frame index.
EPSILON_START = 1.0
EPSILON_FINAL = 0.02
EPSILON_DECAY_LAST_FRAME = 100000

In [0]:
# One transition stored in the replay buffer.
Experience = collections.namedtuple(
    'Experience',
    ('state', 'action', 'reward', 'done', 'new_state'),
)

In [0]:
class ExperienceBuffer:
    """Bounded FIFO store of transitions with uniform random sampling.

    Backed by a deque with `maxlen=capacity`, so appending to a full buffer
    drops the oldest entry.
    """

    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        return len(self.buffer)

    def append(self, experience):
        """Store one transition at the newest end of the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Pick `batch_size` distinct transitions at random.

        Returns:
            Five numpy arrays, in the order
            (states, actions, rewards, dones, next_states). Rewards are
            float32 and dones are uint8; the others keep numpy's inferred dtype.
        """
        picked = np.random.choice(len(self.buffer), batch_size, replace=False)
        batch = [self.buffer[i] for i in picked]
        # Transpose the list of 5-tuples into five per-field sequences.
        states, actions, rewards, dones, next_states = zip(*batch)
        return (
            np.array(states),
            np.array(actions),
            np.array(rewards, dtype=np.float32),
            np.array(dones, dtype=np.uint8),
            np.array(next_states),
        )

In [0]:
class Agent:
    """Plays an environment with an epsilon-greedy policy and records every
    transition in a replay buffer.

    Args:
        env: Environment exposing `reset()`, `step(action)` and
            `action_space.sample()`.
        exp_buffer: Object with an `append(experience)` method.
    """

    def __init__(self, env, exp_buffer):
        self.env = env
        self.exp_buffer = exp_buffer
        self._reset()

    def _reset(self):
        """Begin a new episode and zero the running episode reward."""
        # Use the agent's own environment, not a module-level `env`, so the
        # agent works with whichever environment it was constructed with.
        self.state = self.env.reset()
        self.total_reward = 0.0

    def play_step(self, net, epsilon=0.0, device="cpu"):
        """Take one environment step and store the transition.

        Args:
            net: Callable mapping a batch of states to per-action Q-values.
            epsilon: Probability of taking a random action instead of the
                greedy one.
            device: Torch device the state tensor is moved to before calling
                `net`.

        Returns:
            The total reward of the episode if this step ended it (the agent
            then resets for the next episode), otherwise None.
        """
        done_reward = None
        # Epsilon-greedy policy.
        if np.random.random() < epsilon:
            action = self.env.action_space.sample()
        else:
            # Add a leading batch axis. `np.array([...], copy=False)` raises on
            # NumPy 2 because wrapping in a list always needs a copy.
            state_a = np.expand_dims(np.asarray(self.state), axis=0)
            state_v = torch.tensor(state_a).to(device)
            # Action selection needs no gradients; skip building the graph.
            with torch.no_grad():
                q_vals_v = net(state_v)
            _, act_v = torch.max(q_vals_v, dim=1)
            action = int(act_v.item())

        # Step the environment and accumulate the episode reward.
        new_state, reward, is_done, _ = self.env.step(action)
        self.total_reward += reward

        exp = Experience(self.state, action, reward, is_done, new_state)
        self.exp_buffer.append(exp)
        self.state = new_state
        if is_done:
            done_reward = self.total_reward
            self._reset()
        return done_reward

In [0]:
def calc_loss(batch, net, tgt_net, device="cpu", gamma=None):
    """Compute the DQN mean-squared TD error for one batch.

    The target for each transition is `reward + gamma * max_a Q_tgt(next_state, a)`,
    with the bootstrap term forced to zero for transitions flagged as done.

    Args:
        batch: Tuple `(states, actions, rewards, dones, next_states)` of
            array-likes, in the order produced by `ExperienceBuffer.sample`.
            `dones` may be uint8 or bool.
        net: Network being trained; gradients flow through it.
        tgt_net: Target network; evaluated without gradient tracking.
        device: Torch device the batch tensors are moved to.
        gamma: Discount factor. When None, the module-level `GAMMA` is used.

    Returns:
        Scalar tensor with the MSE between predicted and target Q-values.
    """
    if gamma is None:
        gamma = GAMMA
    states, actions, rewards, dones, next_states = batch

    states_v = torch.as_tensor(states).to(device)
    next_states_v = torch.as_tensor(next_states).to(device)
    # gather() needs int64 indices; numpy's default integer is not int64 on
    # every platform, so convert explicitly.
    actions_v = torch.as_tensor(actions, dtype=torch.int64).to(device)
    rewards_v = torch.as_tensor(rewards).to(device)
    # Boolean mask: indexing with a uint8 (Byte) tensor is deprecated in PyTorch.
    done_mask = torch.as_tensor(dones, dtype=torch.bool).to(device)

    # Q-value of the action that was actually taken in each state.
    state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)

    # max(1) returns (values, indices); only the values are needed. Running
    # the target network under no_grad keeps gradients out of it and makes the
    # in-place masking below safe.
    with torch.no_grad():
        next_state_values = tgt_net(next_states_v).max(1)[0]
        next_state_values[done_mask] = 0.0

    expected_state_action_values = next_state_values * gamma + rewards_v
    return nn.MSELoss()(state_action_values, expected_state_action_values)

In [0]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on a CPU-only runtime.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [0]:
# Training takes 1 - 1.5 hours. This step can be skipped: the playback section
# further down loads a pre-trained model instead.
env = wrappers.make_env(DEFAULT_ENV_NAME)
# Two networks with the same architecture: `net` is the one being optimised,
# `tgt_net` provides the next-state values and is only refreshed periodically.
net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device)
tgt_net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device)
writer = SummaryWriter(comment="-" + DEFAULT_ENV_NAME)
print(net)

buffer = ExperienceBuffer(REPLAY_SIZE)
agent = Agent(env, buffer)
epsilon = EPSILON_START

optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
total_rewards = []      # total reward of every finished game
frame_idx = 0           # number of environment steps taken so far
ts_frame = 0            # frame index at the end of the previous game (for speed)
ts = time.time()        # wall-clock time at the end of the previous game
best_mean_reward = None

# try/finally so the TensorBoard writer is closed even if this long-running
# loop is interrupted or raises part-way through.
try:
    while True:
        frame_idx += 1
        # Linear epsilon decay, floored at EPSILON_FINAL.
        epsilon = max(EPSILON_FINAL, EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME)
        # One environment step; `reward` is None unless a game just finished.
        reward = agent.play_step(net, epsilon, device=device)
        if reward is not None:
            total_rewards.append(reward)
            speed = (frame_idx - ts_frame) / (time.time() - ts)
            ts_frame = frame_idx
            ts = time.time()
            # Mean over (at most) the last 100 finished games.
            mean_reward = np.mean(total_rewards[-100:])
            print("%d: done %d games, mean reward %.3f, eps %.2f, speed %.2f f/s" % (
                frame_idx, len(total_rewards), mean_reward, epsilon,
                speed
            ))
            writer.add_scalar("epsilon", epsilon, frame_idx)
            writer.add_scalar("speed", speed, frame_idx)
            writer.add_scalar("reward_100", mean_reward, frame_idx)
            writer.add_scalar("reward", reward, frame_idx)
            # Checkpoint whenever the running mean improves.
            if best_mean_reward is None or best_mean_reward < mean_reward:
                torch.save(net.state_dict(), DEFAULT_ENV_NAME + "mean_score10.dat")
                if best_mean_reward is not None:
                    print("Best mean reward updated %.3f -> %.3f, model saved" % (best_mean_reward, mean_reward))
                best_mean_reward = mean_reward
                if mean_reward > MEAN_REWARD_BOUND:
                    print("Solved in %d frames!" % frame_idx)
                    break

        # Only collect experience until the buffer holds REPLAY_START_SIZE items.
        if len(buffer) < REPLAY_START_SIZE:
            continue
        # Fixed Q-targets: every SYNC_TARGET_FRAMES frames copy the training
        # weights into the target network.
        if frame_idx % SYNC_TARGET_FRAMES == 0:
            tgt_net.load_state_dict(net.state_dict())

        # One gradient step on a random mini-batch from the replay buffer.
        optimizer.zero_grad()
        batch = buffer.sample(BATCH_SIZE)
        loss_t = calc_loss(batch, net, tgt_net, device=device)
        loss_t.backward()
        optimizer.step()
finally:
    writer.close()


In [0]:
import gym
import time
import argparse
import numpy as np

import torch

from lib import wrappers
from lib import dqn_model

import collections

# Environment id, same value as in the training section; presumably repeated so
# the playback cells can be run without executing the training cells first.
DEFAULT_ENV_NAME = "PongNoFrameskip-v4"
# NOTE(review): FPS is not referenced by any code visible in this notebook;
# confirm whether it is still needed.
FPS = 25

In [0]:
# Install the packages needed to render and record the environment on a
# headless runtime. All output is discarded to keep the notebook clean.
!pip install gym pyvirtualdisplay > /dev/null 2>&1
# x11-utils provides `xdpyinfo`, which pyvirtualdisplay uses to check that the
# X server started; without it the display cell prints "xdpyinfo was not found".
!apt-get install -y xvfb x11-utils python-opengl ffmpeg > /dev/null 2>&1

In [0]:
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) # 40 = error level: only errors are logged by gym

import math
import glob
import io
import base64
from IPython.display import HTML

from IPython import display as ipythondisplay

In [17]:
from pyvirtualdisplay import Display
# Start an invisible 1400x900 virtual display, presumably so that env.render()
# works on a runtime without a physical screen.
# NOTE(review): the name `display` shadows IPython's built-in `display`
# function in the notebook namespace; rich output further down goes through
# `ipythondisplay.display` instead.
display = Display(visible=0, size=(1400, 900))
display.start()

xdpyinfo was not found, X start can not be checked! Please install xdpyinfo!


<Display cmd_param=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1400x900x24', ':1001'] cmd=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1400x900x24', ':1001'] oserror=None return_code=None stdout="None" stderr="None" timeout_happened=False>

In [0]:
def show_video(video_dir="video"):
    """Embed the first MP4 found in `video_dir` into the notebook output.

    Args:
        video_dir: Folder searched for ``*.mp4`` files. Defaults to "video".

    Prints "Could not find video" and returns if the folder has no MP4 file.
    """
    # Sorted so the same file is picked on every run; glob order is arbitrary.
    mp4list = sorted(glob.glob('{}/*.mp4'.format(video_dir)))
    if not mp4list:
        print("Could not find video")
        return
    # Read-only binary mode inside a context manager: the handle is closed
    # promptly and the recording is never opened for writing.
    with open(mp4list[0], 'rb') as video_file:
        encoded = base64.b64encode(video_file.read())
    # The video is inlined as a base64 data URI so no separate file is served.
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))

In [0]:
def wrap_env(env):
    """Return `env` wrapped in a Monitor that writes to ./video (force=True)."""
    return Monitor(env, './video', force=True)

In [0]:
# Build the environment through the project's wrappers and add the recording
# wrapper on top. Note this rebinds the name `env`.
env = wrap_env(wrappers.make_env(DEFAULT_ENV_NAME))

In [21]:
# Load the pre-trained weights and play one full game greedily while it is
# recorded, then show the video and a summary.
net = dqn_model.DQN(env.observation_space.shape, env.action_space.n)
# map_location="cpu": this network runs on the CPU, and a checkpoint saved
# from a GPU model could otherwise not be loaded on a CPU-only runtime.
net.load_state_dict(torch.load(DEFAULT_ENV_NAME + "-best.dat", map_location="cpu"))
state = env.reset()
total_reward = 0.0
c = collections.Counter()  # how often each action was chosen

# Pure inference: no gradients are needed while playing.
with torch.no_grad():
    while True:
        # Add a leading batch axis. `np.array([...], copy=False)` raises on
        # NumPy 2 because wrapping in a list always needs a copy.
        state_v = torch.tensor(np.expand_dims(np.asarray(state), axis=0))
        q_vals = net(state_v).numpy()[0]
        env.render()
        # Plain int so the Counter keys print as ordinary integers.
        action = int(np.argmax(q_vals))
        c[action] += 1
        state, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            break
env.close()
show_video()
print("Total reward: %.2f" % total_reward)
print("Action counts:", c)
# Also close the environment underneath the recording wrapper.
env.env.close()

Total reward: 21.00
Action counts: Counter({3: 613, 4: 422, 1: 222, 0: 154, 5: 121, 2: 98})
