## Replay Memory

* Store observations $o \in \Omega$ instead of states ✓
* Store recurrent state in replay to initialize the network at training time (optional)
* Instead of storing $(s, a, r, s')$ tuples, store fixed-length sequences of $(s, a, r)$ ($m=80$). ✓ 
    * Adjacent sequences should overlap by $l=40$, because this is used for burn-in.  ✓
    * Never cross episode boundaries!

In [156]:
import numpy as np
from typing import Tuple, List, Union
from collections import namedtuple, deque

# One environment step: what the agent observed, the action it took, and the
# reward it received.
Experience = namedtuple("Experience", ("observation", "action", "reward"))

class ReplayMemory:
    """Fixed-capacity FIFO buffer of fixed-length experience sequences.

    Every stored item is one sequence of exactly ``n_sequence_length`` steps,
    where each step is an ``(observation, action, reward)`` record. Once
    ``capacity`` sequences are stored, appending another one evicts the
    oldest.

    Args:
        capacity: Maximum number of sequences kept in the buffer.
        n_burn_in: Number of steps adjacent sequences are meant to share
            (overlap ``l``). Stored for callers; not enforced by this class.
        n_sequence_length: Number of steps every sequence must have
            (length ``m``).
    """

    def __init__(self, capacity: int, n_burn_in: int, n_sequence_length: int):
        self.buffer = deque(maxlen=capacity)
        self.n_burn_in = n_burn_in
        self.n_sequence_length = n_sequence_length

    def __len__(self):
        """Return the number of sequences currently stored."""
        return len(self.buffer)

    def append(self, exp_sequence):
        """Store one sequence of steps.

        Args:
            exp_sequence: Sequence of exactly ``n_sequence_length`` steps.
                The object is stored as-is (not copied).

        Raises:
            ValueError: If the sequence does not have ``n_sequence_length``
                steps.
        """
        # An explicit raise (instead of ``assert``) keeps the check active
        # when Python runs with -O.
        if len(exp_sequence) != self.n_sequence_length:
            raise ValueError(
                "expected a sequence of %d steps, got %d"
                % (self.n_sequence_length, len(exp_sequence))
            )

        # NOTE: the overlap of ``n_burn_in`` steps between adjacent sequences
        # is not verified here. The first sequence of an episode has no
        # predecessor to overlap with, so producing the overlap is left to
        # the caller.
        self.buffer.append(exp_sequence)

    def sample(self, batch_size: int = 1) -> Tuple:
        """Draw ``batch_size`` distinct sequences uniformly at random.

        Every step of a stored sequence must be a 3-field record
        ``(observation, action, reward)``.

        Args:
            batch_size: Number of sequences to draw, without replacement.

        Returns:
            A tuple ``(observations, actions, rewards)`` of arrays whose
            first axis indexes the sampled sequence and whose second axis
            indexes the step within it. ``rewards`` has dtype ``float32``.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                sequences (raised by ``np.random.choice``).
        """
        idxs = np.random.choice(len(self), batch_size, replace=False)

        observations, actions, rewards = [], [], []
        for idx in idxs:
            # Transpose one sequence of (o, a, r) steps into per-field tuples.
            seq_observations, seq_actions, seq_rewards = zip(*self.buffer[idx])
            observations.append(seq_observations)
            actions.append(seq_actions)
            rewards.append(seq_rewards)

        return (
            np.array(observations),
            np.array(actions),
            np.array(rewards, dtype=np.float32),
        )

## Append with window l, m

In [157]:
# Demo: feed 10 steps into the memory, flushing a window whenever it is full.
n_burn_in = 3
n_sequence_length = 5
rm = ReplayMemory(10, n_burn_in, n_sequence_length)
exp = []
tmp = 0  # loop iterations counted since the last flush

for i in range(1, 11):
    # Each step is wrapped in a one-element list before being collected.
    exp.append([Experience(i, i, i)])

    # The first window is complete after n_sequence_length steps; every later
    # one after (n_sequence_length - n_burn_in) further steps, because the
    # last n_burn_in steps of the previous window are carried over.
    if i == n_sequence_length or (
        i > n_sequence_length and tmp == n_sequence_length - n_burn_in
    ):
        rm.append(exp)
        exp = exp[-n_burn_in:]  # new list: keep only the overlap
        tmp = 0
    tmp += 1

In [158]:
# Display the sequences currently stored in the replay memory.
rm.buffer

deque([[[Experience(observation=1, action=1, reward=1)],
        [Experience(observation=2, action=2, reward=2)],
        [Experience(observation=3, action=3, reward=3)],
        [Experience(observation=4, action=4, reward=4)],
        [Experience(observation=5, action=5, reward=5)]],
       [[Experience(observation=3, action=3, reward=3)],
        [Experience(observation=4, action=4, reward=4)],
        [Experience(observation=5, action=5, reward=5)],
        [Experience(observation=6, action=6, reward=6)],
        [Experience(observation=7, action=7, reward=7)]],
       [[Experience(observation=5, action=5, reward=5)],
        [Experience(observation=6, action=6, reward=6)],
        [Experience(observation=7, action=7, reward=7)],
        [Experience(observation=8, action=8, reward=8)],
        [Experience(observation=9, action=9, reward=9)]]])

### Good, but never across episode boundaries :)

### Good, but now more efficient code! :)

## Although this is how it is described in R2D2, it doesn't make sense to me, because we would need to store duplicates!

Just store $(s,a,r)$ normally and generate the sequences when sampling.