# This notebook is used to optimize and organize the code used for DQN
## Changes:
1. carry state as numpy array and transform into tensor when necessary 
2. enable use of multiple environments during training 
3. better organization of code
4. add proper logging (reward, time, loss)

## Things

1. Experience Source (takes one or more environments and pushes Experience tuples into the buffer)
2. Buffer (stores experience and samples a batch)
3. Training code (gets a batch, calculates the loss, updates the weights, and syncs the nets)
4. Logger (handles logging of training stats)


In [1]:
import gym

# Build the LunarLander environment and print its observation and action
# spaces (the captured output below shows an 8-dim Box and Discrete(4)).
env = gym.make('LunarLander-v2')
print(env.observation_space)
print(env.action_space)

Box(-inf, inf, (8,), float32)
Discrete(4)


In [2]:
# Define Buffer and Experience Tuple
from collections import deque, namedtuple
import random
import numpy as np
import torch

# A single environment transition used to train a DQN. Fields, in order:
# state, action, next_state, reward, isdone.
Experience = namedtuple(
    'Experience',
    ['state', 'action', 'next_state', 'reward', 'isdone'],
)

class ReplayBuffer:
    """
    Fixed-capacity store of transitions with uniform random batch sampling.

    Backed by a deque with `maxlen=capacity`, so once the buffer is full
    every push drops the oldest stored transition.
    """

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, experience):
        """Append one transition to the buffer."""
        self.buffer.append(experience)

    def __len__(self):
        return len(self.buffer)

    def sample(self, batch_size, as_tensor=False):
        """
        Draw `batch_size` distinct stored transitions uniformly at random.

        Returns a 5-tuple (states, actions, next_states, rewards, isdones),
        each field stacked along a new leading batch axis. Fields are numpy
        arrays by default; with `as_tensor=True` they are torch tensors,
        with actions as int64 and isdones as bool.
        """
        picked = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one stacked array per field.
        columns = [np.stack(column) for column in zip(*picked)]
        states, actions, next_states, rewards, isdones = columns
        if not as_tensor:
            return states, actions, next_states, rewards, isdones
        return (
            torch.tensor(states),
            torch.tensor(actions, dtype=torch.int64),
            torch.tensor(next_states),
            torch.tensor(rewards),
            torch.BoolTensor(isdones),
        )


In [21]:
import torch.nn as nn


class ExperienceSource:
    """
    Plays one environment with a network's greedy policy and records every
    transition in an internal ReplayBuffer.

    The environment is used through the classic gym interface visible in
    the code: `reset()` returns the observation and `step(action)` returns
    a 4-tuple `(obs, reward, done, info)`.

    Args:
        env: environment to play.
        capacity: maximum number of transitions kept in the buffer.
        device: torch device used for the policy input and for batches
            sampled with `as_tensor=True`.
    """

    def __init__(self, env, capacity=10000, device="cpu"):
        self.env = env
        self.state = self.env.reset()
        self.buffer = ReplayBuffer(capacity)
        self.device = device
        self.steps_done = 0
        # Reward accumulated so far in the episode currently being played.
        self.episode_reward = 0

    def step(self, net):
        """
        Take one greedy step in the environment and store the transition.

        Args:
            net: callable mapping a (1, *obs_shape) float tensor to a
                (1, n_actions) tensor of action values.

        Returns:
            The total reward of the episode if this step ended it,
            otherwise None.
        """
        state = self.state
        state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device)

        # Acting only: no gradient is needed for action selection.
        with torch.no_grad():
            act = net(state_tensor).max(1)[1].item()

        obs, reward, isdone, _ = self.env.step(act)
        self.buffer.push(Experience(state, act, obs, reward, isdone))
        self.steps_done += 1
        self.episode_reward += reward

        if not isdone:
            self.state = obs
            return None

        # Episode finished: start a new one and report the finished total.
        self.state = self.env.reset()
        finished_reward = self.episode_reward
        self.episode_reward = 0
        return finished_reward

    def get_steps(self):
        """Return the number of environment steps taken so far."""
        return self.steps_done

    def sample(self, batch_size, as_tensor=False):
        """
        Sample a batch of stored transitions.

        Returns:
            (states, acts, next_states, rewards, isdones). With
            `as_tensor=True` every field is a tensor moved to
            `self.device`; otherwise the buffer's batch is returned as-is
            (previously this case fell through and returned None).
        """
        batch = self.buffer.sample(batch_size, as_tensor)
        if not as_tensor:
            return batch
        states, acts, next_states, rewards, isdones = batch
        return (
            states.to(self.device),
            acts.to(self.device),
            next_states.to(self.device),
            rewards.to(self.device),
            isdones.to(self.device),
        )
        


In [5]:



def calculate_loss(batch, net, target_net, gamma, doubleQ=False):
    """
    Compute the one-step TD mean-squared-error loss for a DQN batch.

    Args:
        batch: (states, acts, next_states, rewards, isdones) tensors that
            share a leading batch dimension. `acts` is used as a gather
            index and `isdones` as a boolean mask.
        net: online network; supplies Q(s, a) and, with `doubleQ`, the
            greedy next action.
        target_net: network used to evaluate next-state values.
        gamma: discount factor applied to the next-state value.
        doubleQ: if True, pick the next action with `net` and evaluate it
            with `target_net`; otherwise take the max of `target_net`.

    Returns:
        Scalar loss tensor. The bootstrap target is computed without
        gradient tracking, so gradients flow only through `net(states)`.
    """
    states, acts, next_states, rewards, isdones = batch

    # Q-value the online net assigns to the action that was actually taken.
    state_action_values = net(states).gather(1, acts.unsqueeze(1)).squeeze(-1)

    with torch.no_grad():
        if doubleQ:
            next_state_acts = net(next_states).max(1)[1]
            next_state_values = target_net(next_states).gather(1, next_state_acts.unsqueeze(1)).squeeze(1)
        else:
            next_state_values = target_net(next_states).max(1)[0]

        # Terminal transitions have no future return to bootstrap from.
        next_state_values[isdones] = 0.0
        expected_Q_values = next_state_values * gamma + rewards

    # Both sides are cast to float32 so that a wider reward dtype
    # (e.g. float64) does not cause a dtype mismatch in the loss.
    return nn.MSELoss()(state_action_values.float(), expected_Q_values.float())

def sync_nets(net, target_net):
    """Load a snapshot of `net`'s state dict into `target_net`."""
    online_weights = net.state_dict()
    target_net.load_state_dict(online_weights)
    
    

In [6]:
#Models

import math
import torch.nn as nn
import torch.nn.functional as F

class NoisyLinear(nn.Linear):
    """
    Linear layer whose weight (and bias, when present) is perturbed by
    learnable-scale Gaussian noise on every forward pass.

    The effective weight is `weight + sigma_weight * epsilon_weight`, where
    `epsilon_weight` is re-drawn from a standard normal on each call; the
    bias is treated the same way through `sigma_bias` / `epsilon_bias`.

    taken from: https://github.com/Shmuma/Deep-Reinforcement-Learning-Hands-On/blob/master/Chapter07/lib/dqn_model.py

    Args:
        in_features: size of each input sample.
        out_features: size of each output sample.
        sigma_init: initial value of every noise-scale parameter.
        bias: if False, the layer has neither a bias nor bias noise.
    """
    def __init__(self, in_features, out_features, sigma_init=0.018, bias=True):
        super(NoisyLinear, self).__init__(in_features, out_features, bias=bias)
        self.sigma_weight = nn.Parameter(torch.full((out_features, in_features), sigma_init))
        # Noise lives in buffers: it moves with the module but is not trained.
        self.register_buffer("epsilon_weight", torch.zeros(out_features, in_features))
        if bias:
            self.sigma_bias = nn.Parameter(torch.full((out_features,), sigma_init))
            self.register_buffer("epsilon_bias", torch.zeros(out_features))
        self.reset_parameters()

    def reset_parameters(self):
        """Initialise weight (and bias) uniformly in +/- sqrt(3 / in_features)."""
        std = math.sqrt(3 / self.in_features)
        self.weight.data.uniform_(-std, std)
        # The bias is None when the layer was built with bias=False; this
        # method also runs from nn.Linear.__init__, so it must cope with that.
        if self.bias is not None:
            self.bias.data.uniform_(-std, std)

    def forward(self, input):
        """Apply the layer with freshly sampled noise."""
        self.epsilon_weight.normal_()
        bias = self.bias
        if bias is not None:
            self.epsilon_bias.normal_()
            bias = bias + self.sigma_bias * self.epsilon_bias.data
        return F.linear(input, self.weight + self.sigma_weight * self.epsilon_weight.data, bias)

class BasicNet(nn.Module):
    """
    Fully connected Q-network: three NoisyLinear layers with a ReLU after
    each of the first two.
    """

    def __init__(self, input_size, output_size, hidden_size = 64):
        super().__init__()
        layers = [
            NoisyLinear(input_size, hidden_size),
            nn.ReLU(),
            NoisyLinear(hidden_size, hidden_size),
            nn.ReLU(),
            NoisyLinear(hidden_size, output_size),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, input):
        """Return the network output for `input`."""
        output = self.net(input)
        return output
    
class BasicDuelingNet(nn.Module):
    """
    Fully connected dueling Q-network built from NoisyLinear layers.

    Two separate three-layer streams read the same input: `val` produces a
    single state value and `adv` produces one advantage per action. They
    are combined as `val + adv - mean(adv)`.

    Args:
        input_size: number of input features.
        output_size: number of actions.
        hidden_size: width of the hidden layers in both streams.
    """

    def __init__(self, input_size, output_size, hidden_size = 128):
        super(BasicDuelingNet, self).__init__()

        self.adv = nn.Sequential(NoisyLinear(input_size, hidden_size),
                                nn.ReLU(), NoisyLinear(hidden_size, hidden_size),
                                nn.ReLU(), NoisyLinear(hidden_size, output_size)
                                )

        self.val = nn.Sequential(NoisyLinear(input_size, hidden_size),
                        nn.ReLU(), NoisyLinear(hidden_size, hidden_size),
                        nn.ReLU(), NoisyLinear(hidden_size, 1))

    def forward(self, input):
        """Return one Q-value per action for each input sample."""
        val = self.val(input)
        adv = self.adv(input)
        # Centre the advantages per sample over the action axis. A bare
        # adv.mean() averages over the whole batch as well, which couples
        # the Q-values of unrelated samples in a training batch.
        return val + adv - adv.mean(dim=-1, keepdim=True)
    
class DuelingBoxerNoisyNet(nn.Module):
    """
    A convolutional dueling network to use in DQN
    with noisy linear layers.

    Three convolution layers feed two NoisyLinear heads: `val` (one state
    value) and `adv` (one advantage per action), combined as
    `val + adv - mean(adv)`.

    Args:
        input_shape: shape of one observation without the batch axis; its
            first entry is the number of input channels of the first
            convolution.
        output_shape: number of actions (width of the advantage head).
    """

    def __init__(self, input_shape, output_shape):
        super(DuelingBoxerNoisyNet, self).__init__()

        self.conv = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU()
        )

        out_size = self._get_conv_out(input_shape)

        self.adv = nn.Sequential(
            NoisyLinear(out_size, 512),
            nn.ReLU(),
            NoisyLinear(512, output_shape)
        )

        self.val = nn.Sequential(
            NoisyLinear(out_size, 512),
            nn.ReLU(),
            NoisyLinear(512, 1)
        )

    def _get_conv_out(self, shape):
        """Return the flattened size of the conv output for one observation."""
        # Shape probe only: push a dummy batch of one through the conv stack
        # without recording an autograd graph.
        with torch.no_grad():
            o = self.conv(torch.zeros(1, *shape))
        return int(np.prod(o.size()))

    def forward(self, x):
        """Return one Q-value per action for each observation in the batch."""
        x = self.conv(x)
        x = x.view(x.shape[0], -1)
        val = self.val(x)
        adv = self.adv(x)
        # Centre the advantages per sample over the action axis. A bare
        # adv.mean() averages over the whole batch as well, which couples
        # the Q-values of unrelated samples in a training batch.
        return val + adv - adv.mean(dim=-1, keepdim=True)
        

In [9]:
#environment wrappers
import gym
import numpy as np
import torchvision.transforms as T
from gym.spaces import Box
from gym.wrappers import FrameStack
import torch
import time
from PIL import Image

class SkipFrame(gym.Wrapper):
    """
    Repeat every action for up to `skip` consecutive environment steps.

    The rewards of the repeated steps are summed; the observation, done
    flag and info returned are those of the last step actually taken.

    Args:
        env: environment to wrap.
        skip: number of times each action is repeated; must be >= 1.

    Raises:
        ValueError: if `skip` is smaller than 1.
    """
    def __init__(self, env, skip):
        super().__init__(env)
        # With skip < 1 the loop in step() would never run and the return
        # statement would hit unbound locals, so reject it up front.
        if skip < 1:
            raise ValueError(f"skip must be at least 1, got {skip}")
        self._skip = skip

    def step(self, action):
        """Apply `action` up to `skip` times, stopping early when done."""
        total_reward = 0
        for _ in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break

        return obs, total_reward, done, info
    
class GrayScaleObservation(gym.ObservationWrapper):
    """
    Convert (H, W, 3) colour frames to single-channel luminance frames
    divided by 255.

    The channels are mixed with the weights 0.299 / 0.587 / 0.114 applied
    to channels 0 / 1 / 2 respectively.
    """
    def __init__(self, env):
        super().__init__(env)
        obs_shape = self.observation_space.shape[:2]
        # observation() returns floats divided by 255, so declare a [0, 1]
        # float space instead of uint8 [0, 255].
        # NOTE(review): float64 assumes integer (e.g. uint8) input frames
        # in [0, 255] - confirm against the wrapped env.
        self.observation_space = Box(low=0.0, high=1.0, shape=obs_shape, dtype=np.float64)

    def observation(self, observation):
        """Return the weighted channel sum of `observation`, divided by 255."""
        red = observation[:, :, 0]
        green = observation[:, :, 1]
        blue = observation[:, :, 2]
        luminance = red * 0.299 + green * 0.587 + blue * 0.114
        return luminance / 255.0
    
class ResizeObservation(gym.ObservationWrapper):
    """
    Optionally crop, then resize observations with nearest-neighbour
    sampling.

    Args:
        env: environment to wrap.
        shape: target size; an int gives a square, otherwise an iterable
            (rows, cols) matching the first two axes of the declared
            observation space.
        crop: optional pair (row_margin, col_margin); that many pixels are
            removed from both ends of the respective axis before resizing.
    """
    def __init__(self, env, shape, crop=None):
        super().__init__(env)
        if isinstance(shape, int):
            self.shape = (shape, shape)
        else:
            self.shape = tuple(shape)

        obs_shape = self.shape + self.observation_space.shape[2:]
        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)
        self.crop = crop

    def observation(self, observation):
        """Return the cropped and resized observation as a numpy array."""
        if self.crop is not None:
            rows, cols = observation.shape[:2]
            row_margin, col_margin = self.crop[0], self.crop[1]
            # Explicit end indices: a margin of 0 must keep the whole axis,
            # whereas the slice [0:-0] would be empty.
            observation = observation[row_margin:rows - row_margin,
                                      col_margin:cols - col_margin]
        img = Image.fromarray(observation)
        # PIL sizes are (width, height); self.shape is (rows, cols), so it
        # is reversed to make the result match the declared space.
        img2 = img.resize(self.shape[::-1], Image.NEAREST)
        return np.asarray(img2)
                         
class TensorFromObs(gym.ObservationWrapper):
    """Observation wrapper that returns each observation as a float torch tensor."""

    def __init__(self, env):
        super().__init__(env)
        # The declared space is taken over from the wrapped env unchanged.
        self.observation_space = env.observation_space

    def observation(self, observation):
        """Materialise `observation` as an array and wrap it in a float tensor."""
        as_array = observation.__array__()
        return torch.tensor(as_array, dtype=torch.float)
    
class ArrayFromObs(gym.ObservationWrapper):
    """Observation wrapper that returns each observation's `__array__()` result."""

    def __init__(self, env):
        super().__init__(env)
        # The declared space is taken over from the wrapped env unchanged.
        self.observation_space = env.observation_space

    def observation(self, observation):
        """Materialise `observation` through its `__array__` hook."""
        as_array = observation.__array__()
        return as_array
    
def showimagetensor(t):
    """
    Open tensor `t` in the default image viewer.

    The values are multiplied by 255 and cast to uint8 first, so `t` is
    presumably expected to hold values in [0, 1] - verify against callers.
    """
    pixels = t.numpy() * 255
    image = Image.fromarray(np.uint8(pixels))
    image.show()
    
# Module-level frame-skip setting. make_envs below takes its own
# `skipframe` argument and does not read this global.
skipframe = 4
# Raw Pong environment with none of the wrappers above applied.
env = gym.make("PongNoFrameskip-v4")

def make_envs(name, skipframe=4, count=1):
    """
    Build `count` independently constructed, identically wrapped environments.

    Each environment is wrapped, in order, with SkipFrame, GrayScaleObservation,
    ResizeObservation (84x84), FrameStack and ArrayFromObs.

    Args:
        name: gym environment id passed to `gym.make` (previously ignored
            in favour of a hard-coded "PongNoFrameskip-v4").
        skipframe: number of frames each action is repeated for; also used
            as the number of stacked frames.
        count: number of environments to create.

    Returns:
        List of `count` wrapped environments.
    """
    envs = []
    for _ in range(count):
        env = gym.make(name)
        env = SkipFrame(env, skipframe)
        env = GrayScaleObservation(env)
        env = ResizeObservation(env, shape=84)
        env = FrameStack(env, num_stack=skipframe)
        env = ArrayFromObs(env)
        envs.append(env)
    return envs

In [24]:
import time

# Frames each action is repeated for; also the number of stacked frames.
skipframe = 4
# Preprocessing stack: frame skip, grayscale, 84x84 resize, frame stacking,
# then conversion of the stacked frames to an array.
env = gym.make("PongNoFrameskip-v4")
env = SkipFrame(env, skipframe)
env = GrayScaleObservation(env)
env = ResizeObservation(env, shape=84)
env = FrameStack(env, num_stack=skipframe)
env = ArrayFromObs(env)

# Fall back to the CPU when no GPU is present instead of failing on .cuda().
device = "cuda" if torch.cuda.is_available() else "cpu"

# Network dimensions follow the environment and the wrappers above rather
# than being repeated as magic numbers.
obs_shape = (skipframe, 84, 84)
n_actions = env.action_space.n

net = DuelingBoxerNoisyNet(obs_shape, n_actions).to(device)
target_net = DuelingBoxerNoisyNet(obs_shape, n_actions).to(device)
# Start the target network as an exact copy of the online network.
target_net.load_state_dict(net.state_dict())

optimizer = torch.optim.Adam(net.parameters(), lr=0.0001)

# Number of optimisation steps between target-network syncs.
sync_interval = 1000
gamma = 0.99

source = ExperienceSource(env, capacity=50000, device=device)

# Environment steps to collect before training starts.
threshold = 10000

batch_size = 32
t1 = time.time()

num_iters = 0
episode_rewards = []
# Print a summary line every `info_interval` finished episodes.
info_interval = 10

def get_mean_reward(rewards, interval):
    """
    Return the mean of the last `interval` entries of `rewards`.

    When fewer than `interval` rewards exist, the mean is taken over those
    that do (the original divided by `interval` regardless, which
    under-reported the mean). An empty list yields 0.0.

    Raises:
        ValueError: if `interval` is smaller than 1.
    """
    if interval < 1:
        raise ValueError(f"interval must be at least 1, got {interval}")
    window = rewards[-interval:]
    if not window:
        return 0.0
    return sum(window) / len(window)

# Runs until interrupted: one environment step per iteration and, once the
# warm-up is over, one optimisation step per iteration as well.
while True:
    # A number comes back only on the step that finishes an episode.
    episode_reward = source.step(net)

    if episode_reward is not None:
        print(f"episode_done:{episode_reward}")
        episode_rewards.append(episode_reward)
        if len(episode_rewards)%info_interval==0:
            print(f"{len(episode_rewards)} episode played, mean_reward:{get_mean_reward(episode_rewards, info_interval)} time:{time.time()-t1}")

    # Warm-up: only collect experience until `threshold` steps were taken.
    if source.get_steps() < threshold:
        continue

    batch = source.sample(batch_size, as_tensor=True)

    optimizer.zero_grad()
    loss = calculate_loss(batch, net, target_net, gamma, doubleQ=False)
    loss.backward()
    optimizer.step()
    num_iters+=1
    # Periodically copy the online weights into the target network.
    if num_iters%sync_interval==0:
        sync_nets(net, target_net)


episode_done:-21.0
episode_done:-21.0
episode_done:-21.0
episode_done:-21.0
episode_done:-21.0
episode_done:-21.0
episode_done:-21.0
episode_done:-21.0
episode_done:-21.0
episode_done:-21.0
10 episode played, mean_reward:-21.0 time:32.793763160705566
episode_done:-21.0
episode_done:-21.0
episode_done:-21.0
episode_done:-21.0
episode_done:-20.0
episode_done:-21.0
episode_done:-21.0
episode_done:-20.0
episode_done:-21.0
episode_done:-21.0
20 episode played, mean_reward:-20.8 time:156.3961706161499
episode_done:-21.0
episode_done:-21.0
episode_done:-20.0
episode_done:-20.0
episode_done:-21.0
episode_done:-20.0
episode_done:-20.0
episode_done:-20.0
episode_done:-21.0
episode_done:-20.0
30 episode played, mean_reward:-20.4 time:323.98670625686646
episode_done:-21.0
episode_done:-21.0
episode_done:-21.0
episode_done:-20.0
episode_done:-19.0
episode_done:-21.0
episode_done:-20.0
episode_done:-20.0
episode_done:-21.0
episode_done:-21.0
40 episode played, mean_reward:-20.5 time:486.376250505447

KeyboardInterrupt: 

In [25]:
# Save only the online network's state_dict (its weights), not the module object.
torch.save(net.state_dict(), 'DuelingNoisyConvNewLib196.pt')