## Replay Memory

In [426]:
import gym

USE_PIL = True
if USE_PIL:
    # Prefer pillow-simd here: it is faster than standard Pillow
    from PIL import Image
else:
    import cv2
    cv2.ocl.setUseOpenCL(False)

class ImageToPyTorch(gym.ObservationWrapper):
    """
    Move the channel axis to the front: (H, W, C) -> (C, H, W).

    Pixel values are passed through unchanged; only the axis order differs.
    """
    def __init__(self, env):
        super(ImageToPyTorch, self).__init__(env)
        old_shape = self.observation_space.shape
        # (H, W, C) -> (C, H, W); this must agree with what observation() returns.
        new_shape = (old_shape[-1], old_shape[0], old_shape[1])
        # The wrapper does not rescale anything, so the bounds are the raw
        # 8-bit pixel range. A [0, 1] range cannot describe uint8 image data.
        # NOTE(review): assumes the wrapped env emits uint8 frames (WarpFrame does).
        self.observation_space = gym.spaces.Box(
            low=0, high=255, shape=new_shape, dtype=np.uint8)

    def observation(self, observation):
        """Return `observation` with its last (channel) axis moved to position 0."""
        # moveaxis keeps H before W. swapaxes(2, 0) would give (C, W, H), i.e.
        # a transposed image whose shape contradicts the declared space for
        # non-square frames.
        return np.moveaxis(observation, 2, 0)
    
class ScaledFloatFrame(gym.ObservationWrapper):
    """Divide observations by 255 and emit them as float32 arrays."""

    def __init__(self, env):
        super().__init__(env)
        # Same shape as the wrapped env, but values now live in [0, 1].
        unit_box = gym.spaces.Box(
            low=0,
            high=1,
            shape=env.observation_space.shape,
            dtype=np.float32,
        )
        self.observation_space = unit_box

    def observation(self, observation):
        # Warning: converting to float32 gives up the memory savings of
        # compact frames, so only combine this with small replay buffers.
        as_float = np.array(observation).astype(np.float32)
        return as_float / 255.0
    
class WarpFrame(gym.ObservationWrapper):
    def __init__(self, env, width=84, height=84, grayscale=True, dict_space_key=None):
        """
        Resize frames to `width` x `height` (84x84 by default, as in the Nature
        paper and later work), optionally converting them to grayscale.

        For environments with dictionary observations, `dict_space_key` names
        the entry that should be warped; all other entries are left alone.
        """
        super().__init__(env)
        self._width = width
        self._height = height
        self._grayscale = grayscale
        self._key = dict_space_key

        # Grayscale frames keep a trailing channel axis of size 1.
        channels = 1 if self._grayscale else 3
        warped_space = gym.spaces.Box(
            low=0,
            high=255,
            shape=(self._height, self._width, channels),
            dtype=np.uint8,
        )

        if self._key is not None:
            # Only the selected entry of the dict space is replaced.
            sub_spaces = self.observation_space.spaces
            original_space = sub_spaces[self._key]
            sub_spaces[self._key] = warped_space
        else:
            original_space = self.observation_space
            self.observation_space = warped_space
        # The wrapped observation must be a 3-D uint8 image.
        assert original_space.dtype == np.uint8 and len(original_space.shape) == 3

    def observation(self, obs):
        """Return `obs` with its image resized (and grayscaled if configured)."""
        frame = obs if self._key is None else obs[self._key]
        target_size = (self._width, self._height)

        if USE_PIL:
            image = Image.fromarray(frame)
            if self._grayscale:
                image = image.convert("L")
            frame = np.array(image.resize(target_size))
        else:
            if self._grayscale:
                frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
            frame = cv2.resize(frame, target_size, interpolation=cv2.INTER_AREA)

        if self._grayscale:
            # Both backends return a 2-D array here; restore the channel axis.
            frame = np.expand_dims(frame, -1)

        if self._key is None:
            return frame
        # Copy so the caller's observation dict is not modified in place.
        warped_obs = obs.copy()
        warped_obs[self._key] = frame
        return warped_obs

# Build the environment: frames are warped by WarpFrame (84x84 grayscale by
# default) and then rearranged by ImageToPyTorch.
env_name = 'SeaquestNoFrameskip-v4'
env = ImageToPyTorch(WarpFrame(gym.make(env_name)))
#env = ScaledFloatFrame(env)

In [468]:
import torch
import torch.nn as nn
import numpy as np

class DRQN(nn.Module):
    """
    Deep recurrent Q-network.

    Three convolutional layers extract features from every frame of an
    observation sequence, an LSTM integrates them over time, and a linear
    head emits one value per action for every step.
    """
    def __init__(self, n_in, n_out):
        """
        Args:
            n_in: shape of a single observation, (channels, height, width).
            n_out: number of actions, i.e. the width of the output layer.
        """
        super(DRQN, self).__init__()

        self.conv1 = nn.Conv2d(n_in[0], 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        conv_out_size = self._get_conv_out(n_in)
        self.lstm = nn.LSTM(input_size=conv_out_size, hidden_size=512)
        # Size the head from n_out instead of a hard-coded 18 so the network
        # works for any action space.
        self.linear = nn.Linear(512, n_out)

        self.activation1 = nn.ReLU()
        self.activation2 = nn.ReLU()
        self.activation3 = nn.ReLU()
        self.activation4 = nn.ReLU()

        # Kept only for backward compatibility. forward() no longer uses it,
        # because it picks CUDA whenever a GPU exists, even if the module
        # itself lives on the CPU.
        self.dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor

    def _get_conv_out(self, shape):
        """Return the flattened size of the conv stack output for one frame of `shape`."""
        # Shape probe only: no need to record an autograd graph.
        with torch.no_grad():
            o = self.conv1(torch.zeros(1, *shape))
            o = self.conv2(o)
            o = self.conv3(o)
        return int(np.prod(o.size()))

    def forward(self, x, hidden=None):
        """
        Args:
            x: observations shaped (sequence, batch, channels, height, width).
            hidden: optional (h_0, c_0) tuple for the LSTM, each shaped
                (1, batch, 512); when None the LSTM starts from zeros.

        Returns:
            Tensor shaped (sequence, batch, n_out). The updated LSTM state is
            not returned.
        """
        # Follow the module's own parameters so the input always lands on the
        # device and dtype the weights use.
        weight = self.conv1.weight
        x = x.to(device=weight.device, dtype=weight.dtype)

        # The conv layers take a 4-D batch: fold sequence and batch together
        # and unfold again before the LSTM.
        sequence_length, batch_size = x.shape[0], x.shape[1]
        x = x.reshape(sequence_length * batch_size, *x.shape[2:])

        conv1_out = self.activation1(self.conv1(x))
        conv2_out = self.activation2(self.conv2(conv1_out))
        conv3_out = self.activation3(self.conv3(conv2_out))
        # -1 lets the feature size follow the input resolution.
        features = conv3_out.reshape(sequence_length, batch_size, -1)

        lstm_output, _ = self.lstm(features, hidden)
        lstm_output = self.activation4(lstm_output)
        output = self.linear(lstm_output)

        return output

In [428]:
import numpy as np
from typing import Tuple, List, Union
from collections import namedtuple, deque
import torch

# One stored environment step: an observation, an action and a reward.
Experience = namedtuple("Experience", ["observation", "action", "reward"])

class ReplayMemory:
    """
    Replay memory for recurrent neural networks.

    Stores whole episodes (sequences of experiences) and samples windows of
    `n_sequence_length` consecutive steps from them.
    """
    def __init__(self, capacity: int, n_sequence_length: int):
        """
        Args:
            capacity: maximum number of episodes kept; once full, appending
                drops the oldest episode (deque with maxlen).
            n_sequence_length: number of consecutive steps per sampled window.
        """
        self.buffer = deque(maxlen=capacity)
        self.n_sequence_length = n_sequence_length

    def __len__(self):
        """Return the number of stored episodes."""
        return len(self.buffer)

    def append(self, exp_sequence):
        """Store an episode; episodes shorter than one window are silently dropped."""
        # An episode of exactly n_sequence_length steps still yields one
        # complete window, so it is kept.
        if len(exp_sequence) >= self.n_sequence_length:
            self.buffer.append(exp_sequence)

    def sample_episode(self):
        """
        Return one stored episode chosen uniformly at random.

        Raises:
            ValueError: if the memory holds no episodes.
        """
        if not self.buffer:
            raise ValueError("cannot sample from an empty replay memory")
        idx = np.random.randint(len(self.buffer))
        return self.buffer[idx]

    def _sample_trajectory(self):
        """Return a random window of n_sequence_length steps from a random episode."""
        episode = self.sample_episode()
        # Valid start indices are 0 .. len - n inclusive; the +1 makes the
        # final window reachable.
        n_starts = len(episode) - self.n_sequence_length + 1
        start_idx = np.random.randint(n_starts)
        return episode[start_idx:start_idx + self.n_sequence_length]

    def sample(self, batch_size: int = 1) -> List:
        """
        Sample `batch_size` windows, split by field.

        Returns:
            [obs, actions, rewards]; each is a list with `batch_size` entries,
            and each entry is a tuple of n_sequence_length values.
        """
        obs = []
        actions = []
        rewards = []
        for _ in range(batch_size):
            trajectory = self._sample_trajectory()
            obs.append(tuple(t.observation for t in trajectory))
            actions.append(tuple(t.action for t in trajectory))
            rewards.append(tuple(t.reward for t in trajectory))

        return [obs, actions, rewards]

    def sample_(self, batch_size: int = 1) -> List:
        """
        Sample `batch_size` windows without splitting them by field.

        Returns:
            A list of `batch_size` trajectories, each a list of experiences.
        """
        return [self._sample_trajectory() for _ in range(batch_size)]

    
    
# Fill a replay memory (capacity 10 episodes, sequence length 5) with episodes
# played by a uniformly random policy.
rm = ReplayMemory(10, 5)
for episode_idx in range(10):
    print('new episode')
    o = env.reset()
    done = False
    trajectory = []
    while not done:
        action = np.random.choice(env.action_space.n)
        o_next, reward, done, _ = env.step(action)
        # Store the observation the action was taken from, not the resulting one.
        trajectory.append(Experience(o, action, reward))
        o = o_next
    # append() silently drops episodes too short to yield a training sequence.
    rm.append(trajectory)

new episode
new episode
new episode
new episode
new episode
new episode
new episode
new episode
new episode
new episode


In [469]:
BATCH_SIZE = 1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
obs, actions, rewards = rm.sample(BATCH_SIZE)
# sample() returns batch-major nested sequences (batch, sequence, ...).
# swapaxes(1, 0) turns them into (sequence, batch, ...) tensors.
# Stack the frames into one ndarray first: building a tensor directly from
# nested sequences of ndarrays is very slow.
obs = torch.as_tensor(np.array(obs)).swapaxes(1, 0).to(device)
actions = torch.tensor(actions).swapaxes(1, 0).to(device)
rewards = torch.tensor(rewards).swapaxes(1, 0).to(device)

obs.shape

torch.Size([5, 1, 1, 84, 84])

In [471]:
# Instantiate the network for this environment and run one forward pass over
# the sampled batch, starting the LSTM from a random (h_0, c_0) state.
network = DRQN(env.observation_space.shape, env.action_space.n).to(device)
hidden_shape = (1, BATCH_SIZE, 512)
h0 = torch.randn(*hidden_shape).to(device)
c0 = torch.randn(*hidden_shape).to(device)
hidden = (h0, c0)

out = network(obs, hidden)
# Shape of the network output for the last step of the first batch element.
out[-1][0].shape

# TODO: backpropagate only the errors of the steps after a burn-in period.

torch.Size([18])

In [449]:
network

DRQN(
  (conv1): Conv2d(1, 32, kernel_size=(8, 8), stride=(4, 4))
  (conv2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
  (conv3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (lstm): LSTM(3136, 512)
)