# An Implementation of Deep Q-Learning
As described in [Human-level control through deep reinforcement learning](https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf), published in Nature, 26 February 2015. Research performed by DeepMind.

In [1]:
%load_ext autoreload
%autoreload 2
%load_ext snakeviz

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
cd /home/robert/repos/DeepRLAlgos/

/home/robert/repos/DeepRLAlgos


In [3]:
from copy import deepcopy
from time import sleep
from collections import namedtuple
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import gym
from skimage.transform import rescale

from common.algorithm import RLAlgorithm

In [4]:
# One stored transition: the stacked observation before the action, the action
# taken, the stacked observation afterwards, the reward and the terminal flag.
Experience = namedtuple("Experience", ["init_state", "act", "after_state", "reward", "terminal"])


class ReplayBuffer():
    """Fixed-capacity experience store that overwrites its oldest entry when full.

    Args:
        size: Maximum number of experiences kept. Coerced to ``int`` so that
            float literals such as ``1e6`` can be passed.

    Raises:
        ValueError: If ``size`` is not positive.
    """

    def __init__(self, size):
        self._size = int(size)
        if self._size <= 0:
            raise ValueError(f"ReplayBuffer size must be positive, got {size!r}")
        self._storage = []
        # _len counts stored experiences; _index is the slot the next store() writes to.
        self._len = self._index = 0

    def __len__(self):
        """Return the number of experiences currently stored."""
        return self._len

    def store(self, init_state, act, after_state, reward, terminal):
        """Record one transition, replacing the oldest one once capacity is reached."""
        experience = Experience(init_state, act, after_state, reward, terminal)
        if self._len < self._size:
            # Still filling up: the write slot is always the end of the list.
            self._storage.append(experience)
            self._len += 1
        else:
            self._storage[self._index] = experience
        # Advance the write cursor on every store and wrap at capacity.
        self._index = (self._index + 1) % self._size

    def sample(self, n_samples=1):
        """Return ``n_samples`` distinct experiences drawn uniformly at random.

        ``np.random.choice`` is called with ``replace=False``, so it raises
        ``ValueError`` when ``n_samples`` exceeds the number of stored experiences.
        """
        sample_indexes = np.random.choice(len(self._storage), n_samples, replace=False)
        return [self._storage[i] for i in sample_indexes]

In [5]:
class DQNNetwork(nn.Module):
    """Convolutional Q-value network: three conv layers, one hidden linear layer, one output head.

    Args:
        action_dim: Number of outputs of the final linear layer (one per action).

    The first convolution takes 4 input channels, and the hidden linear layer
    takes 3456 flattened features, so the input's spatial size must produce
    exactly that many features after the three convolutions.
    """

    def __init__(self, action_dim):
        super().__init__()
        self.conv_1 = nn.Conv2d(4, 32, kernel_size=8, stride=4)
        self.conv_2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv_3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.linear_1 = nn.Linear(3456, 512)
        self.head = nn.Linear(512, action_dim)

    def forward(self, obs):
        """Return one value per action for each item in the batch ``obs``."""
        features = obs
        for conv in (self.conv_1, self.conv_2, self.conv_3):
            features = F.relu(conv(features))
        # Collapse channel and spatial dimensions, keeping the batch dimension.
        batch_size = features.size(0)
        flat = features.view(batch_size, -1)
        hidden = F.relu(self.linear_1(flat))
        return self.head(hidden)

In [8]:
class DQN(RLAlgorithm):
    """Deep Q-learning agent with experience replay and a periodically synced target network.

    ``self.env``, ``self.act_dim`` and ``self.device`` are not set here; they are
    expected to be provided by ``RLAlgorithm.__init__`` (defined outside this notebook).

    Args:
        env: Forwarded unchanged to ``RLAlgorithm.__init__``.
        random_seed: Forwarded unchanged to ``RLAlgorithm.__init__``.
        learning_rate: Learning rate of the RMSprop optimizer.
        buffer_size: Capacity of the replay buffer (coerced to ``int``).
        discount: Discount factor applied to the bootstrapped next-state value.
    """

    def __init__(self, env, random_seed=100, learning_rate=0.00025, buffer_size=1e6, discount=0.99):
        super().__init__(env, random_seed)
        self.net = DQNNetwork(self.act_dim).to(self.device)
        self.target_net = deepcopy(self.net)
        self.optimizer = torch.optim.RMSprop(params=self.net.parameters(), lr=learning_rate, momentum=0.95, eps=0.01)
        # The default is written as a float literal (1e6); list indices must be ints.
        self.replay_buffer = ReplayBuffer(int(buffer_size))
        self.discount = discount
        self.replay_start_frames = int(1e4)
        self.minibatch_size = 32
        self.init_expl = 1
        self.fin_expl = 0.1
        self.fin_expl_frame = int(3e4)
        # One scalar loss per optimisation step, appended by _do_minibatch.
        self._loss = []

    def _preprocessing(self, images):
        """Convert raw frames into one network input tensor.

        Each image is reduced to a single channel with the weights
        (0.299, 0.587, 0.114) and rescaled by 0.5; the results are stacked and
        given a leading batch dimension of 1.
        """
        frames = np.stack(
            [
                rescale(np.dot(image, [0.299, 0.587, 0.114]), 0.5, multichannel=False)
                for image in images
            ]
        )
        # Stacking in numpy first avoids the slow list-of-arrays tensor constructor.
        return torch.as_tensor(frames, dtype=torch.float32).unsqueeze(0)

    def _reset_observation(self):
        """Reset the environment and return its first frame repeated 4 times, preprocessed."""
        first_frame = self.env.reset()
        return self._preprocessing([first_frame] * 4)

    def _sync_target_net(self):
        """Replace the target network with a fresh copy of the online network."""
        self.target_net = deepcopy(self.net)

    def _step(self, action, render=False):
        """Repeat ``action`` for up to 4 environment steps.

        Each ``env.step`` result is read as ``(observation, reward, done, ...)``.
        Stepping stops as soon as the environment reports ``done``; the last
        frame is then repeated so that 4 frames are always returned.

        Returns:
            ``(observation, reward, done)`` where ``observation`` is the
            preprocessed frame stack, ``reward`` is the sign of the summed
            step rewards and ``done`` is the flag of the last step taken.
        """
        transitions = []
        for _ in range(4):
            transitions.append(self.env.step(action))
            if render:
                self.env.render()
            if transitions[-1][2]:
                # Do not keep stepping an episode that has already ended.
                break
        frames = [transition[0] for transition in transitions]
        frames += [frames[-1]] * (4 - len(frames))
        # Index 1 is the reward; index 2 is the done flag.
        reward = np.sign(sum(transition[1] for transition in transitions))
        done = transitions[-1][2]
        return self._preprocessing(frames), reward, done

    def _do_initial_warmup(self):
        """Fill the replay buffer with random-action experience.

        Plays whole games with ``epsilon=1`` until more than
        ``self.replay_start_frames`` frames have been generated.
        """
        frames = 0
        while True:
            observation = self._reset_observation()
            done = False
            while not done:
                action, _, _ = self.choose_action(observation, epsilon=1)
                new_observation, reward, done = self._step(action)
                self.replay_buffer.store(observation, action, new_observation, reward, done)
                observation = new_observation
                frames += 4
                if frames % 1000 == 0:
                    print(f"initial frame {frames} completed")
                if frames > self.replay_start_frames:
                    return
            print("one full game has been played")

    def _do_minibatch(self, epsilon, init_obs):
        """Collect 8 agent steps of experience, then run one optimisation step.

        Args:
            epsilon: Exploration rate passed to ``choose_action``.
            init_obs: Observation to start acting from.

        Returns:
            The observation to continue from: the freshly reset observation if
            the last step ended an episode, otherwise the latest observation.
        """
        observation = init_obs
        for _ in range(8):
            # No gradients are needed while only selecting actions.
            with torch.no_grad():
                action, _, _ = self.choose_action(observation, epsilon)
            new_observation, reward, done = self._step(action)
            self.replay_buffer.store(observation, action, new_observation, reward, done)
            if done:
                observation = self._reset_observation()
            else:
                observation = new_observation
        samples = self.replay_buffer.sample(self.minibatch_size)
        loss = self.loss_function(samples)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self._loss.append(loss.item())
        # Return the current observation, not the possibly terminal new_observation.
        return observation

    def _do_exploration_phase(self):
        """Train while epsilon decreases linearly from ``init_expl`` towards ``fin_expl``.

        The target network is re-synced every 100 iterations; one final
        minibatch is run at ``fin_expl`` after the loop.
        """
        observation = self._reset_observation()
        n_iterations = self.fin_expl_frame // 16
        for i in range(n_iterations):
            epsilon = self.init_expl - ((i / n_iterations) * (self.init_expl - self.fin_expl))
            if i % 100 == 0:
                print(f"starting exploration episode {i}, epsilon: {epsilon}, resetting target net")
                self._sync_target_net()
            observation = self._do_minibatch(epsilon, observation)
        observation = self._do_minibatch(self.fin_expl, observation)
        print("ended exploration")

    def _do_action_phase(self):
        """Train at the fixed exploration rate ``fin_expl``, re-syncing the target net every 100 iterations."""
        observation = self._reset_observation()
        for i in range(self.fin_expl_frame // 16):
            if i % 100 == 0:
                print(f"starting action episode {i}, resetting target net")
                self._sync_target_net()
            observation = self._do_minibatch(self.fin_expl, observation)

    def _set_lr(self, lr):
        """Set the learning rate used by the optimizer from now on."""
        # The optimizer reads the rate from each param group; `defaults` is only
        # consulted when new groups are added, so both are updated.
        for group in self.optimizer.param_groups:
            group["lr"] = lr
        self.optimizer.defaults["lr"] = lr

    def train(self, lr=None):
        """Run warmup, the annealed exploration phase, then the fixed-epsilon phase.

        Args:
            lr: Optional learning rate; when given it overrides the optimizer's rate.
        """
        if lr is not None:
            self._set_lr(lr)
        print("performing initial warmup")
        self._do_initial_warmup()
        print("warmup complete")

        print("starting exploration")
        self._do_exploration_phase()
        print("exploration complete")

        print("starting action at min epsilon")
        self._do_action_phase()
        print("action finished")

    def loss_function(self, samples):
        """Mean squared temporal-difference error over a batch of experiences.

        The target is ``reward`` for terminal experiences and
        ``reward + discount * max_a target_net(after_state)[a]`` otherwise; the
        prediction is the online network's value for the action that was taken.
        Both networks are evaluated once on the whole batch.
        """
        # Each stored state already carries a leading batch dimension of 1.
        states = torch.cat([sample.init_state for sample in samples]).to(self.device)
        after_states = torch.cat([sample.after_state for sample in samples]).to(self.device)
        actions = torch.stack(
            [torch.as_tensor(sample.act, device=self.device) for sample in samples]
        ).long().view(-1, 1)
        rewards = torch.as_tensor(
            [float(sample.reward) for sample in samples], device=self.device, dtype=torch.float
        )
        not_terminal = torch.as_tensor(
            [not sample.terminal for sample in samples], device=self.device, dtype=torch.float
        )

        sample_q_values = self.net(states).gather(1, actions).squeeze(1)
        with torch.no_grad():
            best_next_values = self.target_net(after_states).max(dim=1).values
            # Terminal experiences get no bootstrapped value.
            targets = rewards + self.discount * best_next_values * not_terminal
        return torch.mean((targets - sample_q_values) ** 2)

    def choose_action(self, obs, epsilon=None, target_net=False):
        """Pick an action epsilon-greedily for a single observation.

        Args:
            obs: Observation tensor with a leading batch dimension of 1.
            epsilon: Probability of a uniformly random action. ``None`` or 0
                means always greedy; 1 means always random.
            target_net: Evaluate the target network instead of the online one.

        Returns:
            ``(action, value, all_values)``. For a random action the last two
            are ``None``; otherwise ``value`` is the chosen action's value and
            ``all_values`` is the raw network output.
        """
        explore = epsilon == 1 or (epsilon and np.random.binomial(1, epsilon, 1)[0])
        if explore:
            return torch.as_tensor(np.random.choice(self.act_dim)).to(self.device), None, None
        net = self.target_net if target_net else self.net
        act_vals = net(obs.to(self.device))
        # argmax runs over the flattened output, hence the batch-of-1 requirement.
        action = torch.argmax(act_vals)
        return action, act_vals[0][action], act_vals

    def act(self, steps, render=True, epsilon=0.05):
        """Play one episode with the current network, for at most ``steps`` agent steps.

        Args:
            steps: Maximum number of agent steps (each repeats its action for up to 4 frames).
            render: Whether to render the environment while playing.
            epsilon: Exploration rate while playing. Defaults to a small value so
                the learned policy, not random play, drives the episode.

        The environment is closed when the episode ends or an error occurs.
        """
        observation = self._reset_observation()
        if render:
            self.env.render()
        try:
            with torch.no_grad():
                for _ in range(steps):
                    action, _, _ = self.choose_action(observation, epsilon)
                    # _step renders every frame itself when render is True.
                    observation, _, done = self._step(action, render)
                    if done:
                        break
        finally:
            self.env.close()
# Build the agent; the environment id string is handed to RLAlgorithm.__init__ via DQN.__init__.
dqn = DQN("BreakoutNoFrameskip-v4")

In [9]:
# Runs the random-action warmup, the annealed exploration phase, then the fixed-epsilon phase.
dqn.train()

performing initial warmup
one full game has been played
initial frame 1000 completed
one full game has been played
initial frame 2000 completed
one full game has been played
initial frame 3000 completed
one full game has been played
one full game has been played
initial frame 4000 completed
one full game has been played
one full game has been played
initial frame 5000 completed
one full game has been played
initial frame 6000 completed
one full game has been played
one full game has been played
initial frame 7000 completed
one full game has been played
initial frame 8000 completed
one full game has been played
initial frame 9000 completed
one full game has been played
initial frame 10000 completed
warmup complete
starting exploration
starting exploration episode 0, epsilon: 1.0, resetting target net
starting exploration episode 100, epsilon: 0.952, resetting target net
starting exploration episode 200, epsilon: 0.904, resetting target net
starting exploration episode 300, epsilon: 0.85

In [12]:
# Play one episode of at most 8000 agent steps with rendering on (act's default).
dqn.act(8000)

In [10]:
# Inspect the replay buffer's private `_index` counter.
dqn.replay_buffer._index

32509

In [None]:
# NOTE(review): field 3 of Experience is `reward`, so this collects the stored
# rewards, not training losses, despite the variable name.
losses = [t[3] for t in dqn.replay_buffer._storage]

TODO:
=====

* target network
* Make sure it's actually learning anything (more logging; investigate TensorBoard or a PyTorch-specific solution)
* visualisation
* assessing performance