## Replay Memory

In [1]:
import numpy as np
from typing import Tuple, List, Union
from collections import namedtuple, deque

# One environment transition as stored in the replay memory.
# Field order matters: instances are built positionally as
# Experience(observation, action, reward).
Experience = namedtuple("Experience", ["observation", "action", "reward"])

class ReplayMemory:
    """
    Replay memory for recurrent networks.

    Whole episodes (sequences of experiences) are stored; sampling returns
    contiguous windows of ``n_sequence_length`` steps cut out of randomly
    chosen episodes.

    Args:
        capacity: maximum number of episodes kept; once full, the oldest
            episode is dropped when a new one is appended.
        n_sequence_length: number of consecutive steps in each sampled window.
    """
    def __init__(self, capacity: int, n_sequence_length: int):
        self.buffer = deque(maxlen=capacity)
        self.n_sequence_length = n_sequence_length

    def __len__(self):
        """Return the number of stored episodes (not individual steps)."""
        return len(self.buffer)

    def append(self, exp_sequence):
        """
        Store one episode.

        Episodes shorter than ``n_sequence_length`` are silently rejected,
        because no full window could be cut out of them. An episode of
        exactly ``n_sequence_length`` steps is accepted (it holds one window).
        """
        if len(exp_sequence) >= self.n_sequence_length:
            self.buffer.append(exp_sequence)

    def sample_episode(self):
        """
        Return one stored episode chosen uniformly at random.

        Raises:
            ValueError: if the memory holds no episodes.
        """
        if not self.buffer:
            raise ValueError("cannot sample from an empty ReplayMemory")
        idx = np.random.choice(len(self))
        return self.buffer[idx]

    def sample(self, batch_size: int = 1) -> List:
        """
        Draw ``batch_size`` windows of ``n_sequence_length`` steps each.

        Every window comes from an independently sampled episode, so the
        same episode (or even the same window) may appear more than once.

        Returns:
            A list with ``batch_size`` entries; each entry is a slice of a
            stored episode containing ``n_sequence_length`` experiences.

        Raises:
            ValueError: if the memory holds no episodes.
        """
        trajectories = []
        for _ in range(batch_size):
            episode = self.sample_episode()

            # Valid start positions are 0 .. len(episode) - n_sequence_length
            # inclusive, hence the +1; without it the final window of every
            # episode could never be drawn.
            n_windows = len(episode) - self.n_sequence_length + 1
            start_idx = np.random.choice(n_windows)
            trajectories.append(
                episode[start_idx:start_idx + self.n_sequence_length])

        return trajectories

    
    
# Demo memory: keeps at most 10 episodes and samples windows of 3 steps.
rm = ReplayMemory(10, 3)

In [2]:
N_EPISODES = 10
MAX_EPISODE_LENGTH = 100

# Fill the replay memory with synthetic episodes of random length.
for episode in range(1, N_EPISODES + 1):
    print("Start episode %d" % (episode))

    # Simulated episode length, uniform over 1 .. MAX_EPISODE_LENGTH.
    n_timesteps = int(np.random.rand() * MAX_EPISODE_LENGTH) + 1

    # Dummy transitions: step i has observation i, action i + 2, reward i,
    # which makes it easy to see which window a sample came from.
    exp_sequence = [Experience(i, i + 2, i) for i in range(n_timesteps)]

    # The memory itself rejects episodes that are too short to sample from.
    rm.append(exp_sequence)

Start episode 1
Start episode 2
Start episode 3
Start episode 4
Start episode 5
Start episode 6
Start episode 7
Start episode 8
Start episode 9
Start episode 10


In [3]:
rm.sample(2)[0]

[Experience(observation=74, action=76, reward=74),
 Experience(observation=75, action=77, reward=75),
 Experience(observation=76, action=78, reward=76)]

In [4]:
def calc_loss(self, batch):
    """Placeholder for the loss computation; not implemented yet, returns None."""
    return None

In [5]:
# Draw two windows, each cut from a randomly chosen stored episode.
batch = rm.sample(2)
batch

[[Experience(observation=27, action=29, reward=27),
  Experience(observation=28, action=30, reward=28),
  Experience(observation=29, action=31, reward=29)],
 [Experience(observation=32, action=34, reward=32),
  Experience(observation=33, action=35, reward=33),
  Experience(observation=34, action=36, reward=34)]]

## First, compute the loss for a single trajectory, i.e. batch_size = 1

In [6]:
# sample() always returns a list of windows; take the only one.
trajectory = rm.sample(1)[0]
trajectory

[Experience(observation=20, action=22, reward=20),
 Experience(observation=21, action=23, reward=21),
 Experience(observation=22, action=24, reward=22)]

In [7]:
# Transpose the window from a list of steps into one array per field:
# obs[k], actions[k] and rewards[k] all belong to step k of the trajectory.
obs, actions, rewards = (
    np.array(column)
    for column in zip(*((step.observation, step.action, step.reward)
                        for step in trajectory))
)

In [8]:
obs

array([20, 21, 22])

In [9]:
# TODO: feed obs into the LSTM and use its output as the Q-values;
# 0 is only a placeholder until the network exists.
qvals = 0

## DRQN Architecture

In [43]:
import torch
import torch.nn as nn
import numpy as np

class DRQN(nn.Module):
    """
    Convolutional Q-network that maps image observations to action values.

    The recurrent part (``lstm`` followed by ``fc``) is constructed but not
    wired into ``forward`` yet; ``forward`` currently runs the purely
    feed-forward path ``conv -> fc0``.

    Args:
        n_in: observation shape; ``n_in[0]`` is the number of input channels
            and the whole shape is used to size the flattened conv output.
        n_out: size of the output layer (the notebook passes the number of
            discrete actions).
    """
    def __init__(self, n_in, n_out):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(n_in[0], 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU()
        )

        conv_out_size = self._get_conv_out(n_in)

        # Recurrent head -- not used by forward() yet.
        self.lstm = nn.LSTM(conv_out_size, 512)
        self.fc = nn.Linear(512, n_out)

        # Feed-forward head -- this is what forward() currently uses.
        self.fc0 = nn.Sequential(
            nn.Linear(conv_out_size, 512),
            nn.ReLU(),
            nn.Linear(512, n_out)
        )

        # Placeholder for the LSTM state, currently unused.
        # NOTE: nn.LSTM takes its state as a tuple (h_0, c_0) of tensors, so
        # this numpy vector has to be replaced before the LSTM is wired in.
        self.hidden = np.repeat(0, 512)

        # init_weights only touches nn.Linear modules, so applying it to
        # self.conv leaves the conv layers at PyTorch's default init.
        self.conv.apply(self.init_weights)
        self.fc.apply(self.init_weights)
        # fc0 is the head forward() really uses; initialise it the same way.
        self.fc0.apply(self.init_weights)

        # Kept for backward compatibility; forward() no longer relies on it.
        self.dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor #here

    def _get_conv_out(self, shape):
        """Return the flattened size of the conv output for one input of ``shape``."""
        # Shape probe only: no need to record an autograd graph for it.
        with torch.no_grad():
            o = self.conv(torch.zeros(1, *shape))
        return int(np.prod(o.size()))

    def forward(self, x):
        """
        Compute one output row per input sample.

        Args:
            x: batch of observations with the batch dimension first; any
               dtype/device, it is converted to float32 on the device the
               network's weights live on.

        Returns:
            Tensor of shape ``(batch, n_out)``.
        """
        # Casting via torch.cuda.FloatTensor would push the input to the GPU
        # whenever CUDA exists, even if this module sits on the CPU; follow
        # the weights instead.
        x = x.to(device=self.fc0[0].weight.device, dtype=torch.float32)
        conv_out = self.conv(x).view(x.size(0), -1)
        # TODO: route conv_out through self.lstm (with a proper hidden state)
        # and self.fc once the recurrent path is implemented.
        return self.fc0(conv_out)

    def init_weights(self, m):
        """Xavier-uniform weights and a constant 0.01 bias for linear layers."""
        if isinstance(m, nn.Linear):
            torch.nn.init.xavier_uniform_(m.weight)
            m.bias.data.fill_(0.01)

In [44]:
import gym

USE_PIL = True
if USE_PIL:
    # you should use pillow-simd, as it is faster than standard Pillow
    from PIL import Image
else:
    import cv2
    cv2.ocl.setUseOpenCL(False)

class ImageToPyTorch(gym.ObservationWrapper):
    """
    Move the channel axis to the front: (H, W, C) frames become (C, H, W).
    """
    def __init__(self, env):
        super().__init__(env)
        old_space = self.observation_space
        # Reuse the wrapped space's bounds and dtype with the channel axis
        # moved to the front, so the declared space matches the frames that
        # observation() returns (their values are not rescaled here).
        self.observation_space = gym.spaces.Box(
            low=np.moveaxis(old_space.low, -1, 0),
            high=np.moveaxis(old_space.high, -1, 0),
            dtype=old_space.dtype)

    def observation(self, observation):
        # moveaxis keeps height and width in order; swapping axes 0 and 2
        # would give (C, W, H), i.e. a transposed image that disagrees with
        # the declared (C, H, W) space for non-square frames.
        return np.moveaxis(observation, -1, 0)
    
class ScaledFloatFrame(gym.ObservationWrapper):
    """
    Convert observations to float32 and divide them by 255.
    """
    def __init__(self, env):
        super().__init__(env)
        frame_shape = env.observation_space.shape
        self.observation_space = gym.spaces.Box(
            low=0, high=1, shape=frame_shape, dtype=np.float32)

    def observation(self, observation):
        # Careful: this undoes the memory optimization (the result is a
        # fresh float32 array); use with smaller replay buffers only.
        as_float = np.array(observation).astype(np.float32)
        return as_float / 255.0
    
class WarpFrame(gym.ObservationWrapper):
    def __init__(self, env, width=84, height=84, grayscale=True, dict_space_key=None):
        """
        Resize frames to ``width`` x ``height`` (84x84 by default, as in the
        Nature paper and later work), optionally converting them to grayscale.

        For environments with dictionary observations, `dict_space_key`
        selects the entry to warp; every other entry is passed through.
        """
        super().__init__(env)
        self._width = width
        self._height = height
        self._grayscale = grayscale
        self._key = dict_space_key

        # Grayscale frames keep a trailing channel axis of size 1.
        num_colors = 1 if self._grayscale else 3
        new_space = gym.spaces.Box(
            low=0,
            high=255,
            shape=(self._height, self._width, num_colors),
            dtype=np.uint8,
        )

        if self._key is not None:
            original_space = self.observation_space.spaces[self._key]
            self.observation_space.spaces[self._key] = new_space
        else:
            original_space = self.observation_space
            self.observation_space = new_space
        # Only 3-dimensional uint8 image spaces are supported.
        assert original_space.dtype == np.uint8 and len(original_space.shape) == 3

    def observation(self, obs):
        """Return ``obs`` with the (selected) frame resized and, if enabled, grayscaled."""
        frame = obs if self._key is None else obs[self._key]
        target_size = (self._width, self._height)

        if USE_PIL:
            image = Image.fromarray(frame)
            if self._grayscale:
                image = image.convert("L")
            frame = np.array(image.resize(target_size))
        else:
            if self._grayscale:
                frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
            frame = cv2.resize(
                frame, target_size, interpolation=cv2.INTER_AREA
            )

        # Both grayscale paths yield a 2-D array; restore the channel axis.
        if self._grayscale:
            frame = np.expand_dims(frame, -1)

        if self._key is None:
            return frame

        # Copy the dict observation so the caller's object is left unmodified.
        obs = obs.copy()
        obs[self._key] = frame
        return obs

env_name = 'SeaquestNoFrameskip-v4'
# Preprocessing chain, innermost first: raw env -> 84x84 grayscale frames
# (WarpFrame defaults) -> channel-first layout for PyTorch.
env = ImageToPyTorch(WarpFrame(gym.make(env_name)))
#env = ScaledFloatFrame(env)

In [45]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# First frame of a fresh episode, already passed through the wrappers.
obs = env.reset()
obs.shape

(1, 84, 84)

In [46]:
# Convert the frame to a tensor and move it to the selected device.
obs = torch.tensor(obs).to(device)

In [47]:
#add batch dimension 
obs = obs.unsqueeze(0)

In [48]:
obs.shape

torch.Size([1, 1, 84, 84])

In [49]:
# Size the network from the env: input shape from the observation space,
# one output per discrete action.
policy_net = DRQN(env.observation_space.shape, env.action_space.n).to(device)

In [50]:
policy_net(obs)

tensor([[ 0.8340,  2.4123,  0.4724,  0.2797,  0.1478, -0.3460, -0.6617,  1.1070,
          2.0227,  0.7061, -0.6693, -0.4380, -1.0374, -0.5745, -0.7978,  1.4820,
          0.2428, -1.0787]], device='cuda:0', grad_fn=<AddmmBackward>)

In [None]:
# Obviously, I need the hidden states!

In [None]:
def _random_state():
    # Fresh random (h, c) pair, each of shape (1, 1, 3).
    return (torch.randn(1, 1, 3), torch.randn(1, 1, 3))


lstm = nn.LSTM(3, 3)  # input size 3, hidden size 3
inputs = [torch.randn(1, 3) for _ in range(5)]  # a sequence of 5 steps

# Variant 1: feed the sequence one step at a time, threading the state
# through by hand. After every call `hidden` holds the latest state.
hidden = _random_state()
for step in inputs:
    # The LSTM is fed 3-D input here, so add a leading axis: (1, 3) -> (1, 1, 3).
    out, hidden = lstm(step.unsqueeze(0), hidden)

# Variant 2: feed the whole sequence in a single call.
# `out` then contains the hidden state of every step in the sequence, while
# `hidden` is only the most recent state -- the last slice of `out` equals
# the hidden part of `hidden`. Keep `hidden` if you want to continue the
# sequence (and backpropagate through it) by passing it to the LSTM later.
inputs = torch.stack(inputs)  # five (1, 3) steps -> (5, 1, 3)
hidden = _random_state()  # start again from a fresh state
out, hidden = lstm(inputs, hidden)
print(out)
print(hidden)

In [None]:
inputs

In [None]:
hidden

In [None]:
# I need one hidden-state input per LSTM: with 3 LSTMs, that means 3 hidden states.