In [2]:
# TODO: look into how to speed up training (e.g. smarter device selection)
import multiprocessing
import os
import torch
# Summarize the runtime's hardware; the final expression is what the cell displays.
# Querying a device name without a CUDA device raises, so fall back to 'none'
# on CPU-only runtimes instead of failing the cell.
gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'none'
# Physical memory in GiB: page size (bytes) * number of physical pages.
memory_gib = (os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES")) / (1024.**3)
f'cpu: {multiprocessing.cpu_count()}, memory: {memory_gib}, gpu: {gpu_name}'

'cpu: 2, memory: 12.686656951904297, gpu: Tesla P100-PCIE-16GB'

In [1]:
!apt-get install -y xvfb python-opengl x11-utils > /dev/null 2>&1
!pip install -y pyvirtualdisplay pyglet==v1.5.0 > /dev/null 2>&1
# !pip uninstall -y pyglet

In [2]:
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display
# Create and start a pyvirtualdisplay Display (visible=0, 800x600).
# NOTE(review): the recorded output of this cell is a ModuleNotFoundError, so a
# required package was presumably missing when it last ran — confirm the
# dependencies are installed before running.
display = Display(visible=0, size=(800, 600))
display.start()

ModuleNotFoundError: ignored

In [18]:
from google.colab import drive
# Mount Google Drive and change into the project directory stored on it.
drive.mount('/content/gdrive')
%cd /content/gdrive/My Drive/Colab Workspace/dqn
# Set the global git identity used for commits made from this runtime.
!git config --global user.email "rabrg96@gmail.com"
!git config --global user.name "Ryan Greene"

# Show the working-tree state of the repository.
!git status

Mounted at /content/gdrive
/content/gdrive/My Drive/Colab Workspace/dqn
On branch main
Your branch is ahead of 'origin/main' by 1 commit.
  (use "git push" to publish your local commits)

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git checkout -- <file>..." to discard changes in working directory)

	[31mmodified:   DQN.ipynb[m

no changes added to commit (use "git add" and/or "git commit -a")


In [None]:
# NOTE(review): this command ends in a dangling `&&` and looks unfinished — TODO complete it (e.g. with a commit step) or remove the cell.
!git add . && 

In [3]:
from collections import deque
import random


class UniformReplayMemory():
  """Bounded FIFO store of memories that is sampled uniformly at random."""

  def __init__(self, memory_size):
    # A deque with a maxlen discards its oldest entry once memory_size is reached.
    self.memories = deque(maxlen=memory_size)

  def __len__(self):
    """Number of memories currently stored."""
    stored = len(self.memories)
    return stored

  def append(self, memory):
    """Store one memory at the newest end of the buffer."""
    self.memories.append(memory)

  def sample(self, k):
    """Return a list of k stored memories drawn without replacement."""
    picked = random.sample(self.memories, k)
    return picked

In [4]:
import random
import math


class FixedEpsilonGreedyStrategy():
  """Epsilon-greedy decision rule with a constant exploration rate."""

  def __init__(self, epsilon=0.1):
    # Threshold compared against each uniform draw in is_exploit().
    self.epsilon = epsilon

  def is_exploit(self):
    """Draw once from uniform(0, 1) and report whether the draw is at least epsilon."""
    draw = random.uniform(0, 1)
    return draw >= self.epsilon


class AnnealingEpsilonGreedyStrategy():
  """Epsilon-greedy decision rule whose exploration rate decays exponentially.

  The random-action probability is
    ending_epsilon + (starting_epsilon - ending_epsilon) * exp(-total_steps / epsilon_decay)
  where total_steps is read from env.get_total_steps() on every call.

  Args:
    env: object exposing get_total_steps(), the step count driving the decay.
    starting_epsilon: random-action probability at zero steps.
    ending_epsilon: value the probability decays towards.
    epsilon_decay: decay constant, in steps, of the exponential.
  """

  def __init__(self, env, starting_epsilon=1.0, ending_epsilon=0.05, epsilon_decay=5000):
    self.env = env
    self.starting_epsilon = starting_epsilon
    self.ending_epsilon = ending_epsilon
    self.epsilon_decay = epsilon_decay

  def is_exploit(self):
    """Return True to exploit, False to take a random action."""
    # Read the step count from the env given to this strategy, not from a
    # notebook-global `env`, so the strategy follows the environment it was built with.
    random_action_probability = self.ending_epsilon + (self.starting_epsilon - self.ending_epsilon) * math.exp(-1. * self.env.get_total_steps() / self.epsilon_decay)
    return random.uniform(0, 1) >= random_action_probability

In [15]:
import copy
import numpy as np
import torch
import fastprogress
import math


class DQN():
  """Deep Q-learning agent with a target network and optional Double-DQN targets.

  Args:
    model: torch module mapping an observation to one Q-value per action.
    env: environment providing reset(), step(action), close() and action_space.sample().
    epsilon_greedy_strategy: object whose is_exploit() decides greedy vs. random action.
    replay_memory: buffer providing append(), sample(k) and len(). When None, a
      fresh UniformReplayMemory(memory_size=100000) is created for this agent.
    batch_size: number of transitions sampled per gradient step.
    target_model_update_delay: the target network is re-synced every this many episodes.
    use_ddqn_loss: if True, choose the next action with the online model and
      evaluate it with the target model; otherwise use the target model's maximum.
    gamma: discount applied to the next-state value.
  """

  def __init__(self, model, env, epsilon_greedy_strategy,
               replay_memory=None,
               batch_size=128, target_model_update_delay=100, use_ddqn_loss=True, gamma=0.99):
    self.env = env
    # Build the default buffer per instance: a default-argument instance would
    # be created once at definition time and shared by every agent.
    if replay_memory is None:
      replay_memory = UniformReplayMemory(memory_size=100000)
    self.replay_memory = replay_memory
    self.epsilon_greedy_strategy = epsilon_greedy_strategy
    self.model = model
    self.update_target_model_weights()
    self.batch_size = batch_size
    self.optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    self.target_model_update_delay = target_model_update_delay
    self.use_ddqn_loss = use_ddqn_loss
    self.gamma = gamma

  def get_action(self, obs):
    """Return the greedy action for obs when exploiting, else a random action."""
    if self.epsilon_greedy_strategy.is_exploit():
      # Evaluate in eval mode without autograd, then restore train mode.
      self.model.eval()
      with torch.no_grad():
        q_value_max = self.model(torch.Tensor(obs)).max(dim=0)
      self.model.train()
      return q_value_max[1].item()
    return self.env.action_space.sample()

  def update_model_weights(self):
    """Run one optimizer step on a sampled batch and return the loss as a float."""
    batch = self.replay_memory.sample(self.batch_size)
    # List of transition dicts -> dict of stacked arrays, one per field.
    batch = {key: np.stack([b[key] for b in batch]) for key in batch[0].keys()}

    actions = torch.Tensor(batch['action']).long().unsqueeze(1)
    q_value = self.model(torch.Tensor(batch['obs'])).gather(1, actions).squeeze(1)

    # The TD target is a constant for this step: building it without autograd
    # avoids accumulating unused gradients on the target network.
    with torch.no_grad():
      next_obs = torch.Tensor(batch['next_obs'])
      target_next_q_values = self.target_model(next_obs)
      dones = torch.Tensor(batch['done'])

      if self.use_ddqn_loss:
        # Double DQN: online model picks the action, target model scores it.
        next_q_values = self.model(next_obs)
        next_q_value = target_next_q_values.gather(1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)
      else:
        next_q_value = target_next_q_values.max(dim=1)[0]

      # (1 - dones) removes the bootstrap term for terminal transitions.
      target_q_value = torch.Tensor(batch['reward']) + self.gamma * next_q_value * (1 - dones)

    loss = torch.nn.functional.smooth_l1_loss(q_value, target_q_value)
    self.optimizer.zero_grad()
    loss.backward()
    # Clamp each gradient element to [-1, 1]; skips parameters without a grad.
    torch.nn.utils.clip_grad_value_(self.model.parameters(), 1)
    self.optimizer.step()

    return loss.item()

  def update_target_model_weights(self):
    """Replace the target network with a frozen copy of the current model."""
    self.target_model = copy.deepcopy(self.model)
    self.target_model.eval()
    # The target network is never optimized, so it does not need gradients.
    for param in self.target_model.parameters():
      param.requires_grad_(False)

  def step(self, obs):
    """Act once in self.env, store the transition, and return (next_obs, done)."""
    action = self.get_action(obs)
    # Use the agent's own env rather than a notebook-global `env`.
    next_obs, reward, done, info = self.env.step(action)
    self.replay_memory.append({'obs': obs, 'action': action, 'reward': reward, 'next_obs': next_obs, 'done': done})
    return next_obs, done

  def plot_reward_update(self, epoch, epochs, mb, reward):
    """ dynamically print the reward plot during the training loop.
        expects epoch to start from 1.
    """
    x = range(1, epoch+1)
    graphs = [[x,reward]]
    x_margin = 0.2
    y_margin = 0.05
    x_bounds = [1-x_margin, epochs+x_margin]
    y_bounds = [np.min(reward)-y_margin, np.max(reward)+y_margin]

    # y_bounds was previously computed but never handed to the progress bar.
    mb.update_graph(graphs, x_bounds, y_bounds)

  def learn(self, n_episodes=500, max_steps_per_episode=1000):
    """Train for n_episodes episodes, then close the environment.

    Args:
      n_episodes: number of episodes to run.
      max_steps_per_episode: cap on environment steps within one episode.
    """
    mb = fastprogress.master_bar(range(1, n_episodes + 1))
    for episode in mb:
      obs = self.env.reset()

      for step in range(max_steps_per_episode):
        obs, done = self.step(obs)

        # Start learning once a full batch can be sampled.
        if len(self.replay_memory) >= self.batch_size:
          self.update_model_weights()

        if done:
          break
      if episode % self.target_model_update_delay == 0:
        self.update_target_model_weights()
      # self.plot_reward_update(episode, n_episodes, mb, env.get_episode_rewards())
    self.env.close()

In [17]:
import gym
import torch


# CartPole-v1 wrapped in gym's Monitor, writing to ./monitor/.
# NOTE(review): video_callable=False presumably disables video recording — confirm against the gym version in use.
env = gym.wrappers.Monitor(gym.make('CartPole-v1'), './monitor/', force=True, video_callable=False)
# Fully connected network: observation vector in, one output per discrete action out.
model = torch.nn.Sequential(
    torch.nn.Linear(env.observation_space.shape[0], 32),
    torch.nn.ReLU(),
    torch.nn.Linear(32, 64),
    torch.nn.ReLU(),
    torch.nn.Linear(64, 32),
    torch.nn.ReLU(),
    torch.nn.Linear(32, env.action_space.n),
)

# Train for 200 episodes while line-profiling DQN.update_model_weights.
# NOTE(review): %lprun needs the line_profiler extension, which this notebook
# loads in a later cell — that cell has to be run first.
dqn = DQN(model, env, AnnealingEpsilonGreedyStrategy(env))
%lprun -f dqn.update_model_weights dqn.learn(200)

In [11]:
# Install line_profiler and load its IPython extension, which provides the %lprun magic.
!pip install line_profiler
%load_ext line_profiler

Collecting line_profiler
  Downloading line_profiler-3.3.0-cp37-cp37m-manylinux2010_x86_64.whl (63 kB)
[?25l[K     |█████▏                          | 10 kB 39.5 MB/s eta 0:00:01[K     |██████████▎                     | 20 kB 44.9 MB/s eta 0:00:01[K     |███████████████▍                | 30 kB 45.8 MB/s eta 0:00:01[K     |████████████████████▌           | 40 kB 27.7 MB/s eta 0:00:01[K     |█████████████████████████▋      | 51 kB 16.1 MB/s eta 0:00:01[K     |██████████████████████████████▉ | 61 kB 14.6 MB/s eta 0:00:01[K     |████████████████████████████████| 63 kB 2.4 MB/s 
Installing collected packages: line-profiler
Successfully installed line-profiler-3.3.0
