# Code

In [12]:
"""
Script that contains the details about the experience replay buffer used in DDQN to ensure training stability
"""

'\nScript that contains the details about the experience replay buffer used in DDQN to ensure training stability\n'

Import the necessary libraries.

In [13]:
import random
import numpy as np

import torch

Read the variables from the configuration file.

In [14]:
# Load the run configuration with the standard-library INI parser.
from configparser import ConfigParser
  
configur = ConfigParser()
import builtins
# NOTE(review): `current_filename` is not a standard attribute of `builtins`;
# presumably the tooling that runs this notebook sets it to the path of the
# config file before this cell executes -- confirm against the caller.
# ConfigParser.read() returns the list of files it parsed successfully, which is
# what the cell displays as its output.
configur.read(builtins.current_filename)

# Earlier variant that read the config from a fixed relative path:
# configur.read('config.ini')

['results/config.ini']

self.capacity is the capacity of the replay buffer.
The replay buffer has a circular memory.
- self.buffer_state = [] # Stores all the current states
- self.buffer_action = [] # Stores all the current actions
- self.buffer_next_state = [] # Stores all the next states
- self.buffer_reward = [] # Stores all the rewards
- self.idx = 0 # This is the position where the next experience will be stored.

In [7]:
class ReplayMemory():
    """
    Class representing the replay buffer used for storing experiences for off-policy learning

    Experiences are kept in four parallel lists (state, action, next state, reward).

    Parameters
    ---
    capacity: int
        Maximum number of experiences the buffer may hold; must be positive

    Raises
    ---
    ValueError
        If capacity is zero or negative
    """
    def __init__(self, capacity):
        # A non-positive capacity can never hold an experience: the first store()
        # would fail with an unrelated IndexError, so reject it up front instead.
        if capacity <= 0:
            raise ValueError(f"capacity must be a positive integer, got {capacity!r}")

        self.capacity = capacity
        self.buffer_state = []       # current states
        self.buffer_action = []      # actions taken in those states
        self.buffer_next_state = []  # states reached after the actions
        self.buffer_reward = []      # rewards received
        # self.buffer_done = []
        self.idx = 0                 # slot where the next experience is written

Stores the current experience in the replay buffer.

In [8]:
class ReplayMemory(ReplayMemory):
    def store(self, state, action, next_state, reward):
        """
        Function to add one experience (state, action, next_state, reward) to the memory

        While the memory holds fewer than `capacity` experiences the new one is appended;
        once it is full, the entry at position `idx` is overwritten instead.

        Parameters
        ---
        state: numpy.ndarray
            Current state vector observed in the environment
        action: int
            Action performed by the agent in the current state
        next_state: numpy.ndarray
            State vector observed as a result of performing the action in the current state
        reward: float
            Reward obtained by the agent

        Returns
        ---
        none
        """
        # Pair every buffer with the value that belongs in it so all four lists
        # receive exactly the same treatment.
        slots = (
            (self.buffer_state, state),
            (self.buffer_action, action),
            (self.buffer_next_state, next_state),
            (self.buffer_reward, reward),
        )

        # Decide once, before touching any list, whether there is still room.
        has_room = len(self.buffer_state) < self.capacity
        for buffer, value in slots:
            if has_room:
                buffer.append(value)
            else:
                buffer[self.idx] = value

        # Advance the write position, wrapping around for circular memory.
        self.idx = (self.idx + 1) % self.capacity

Randomly choose `batch_size` indices from the stored experiences. \
Gather the experiences at those indices into numpy ndarrays, then convert them to tensors on the given device.
Returns states, actions, next_states, rewards.

In [9]:
class ReplayMemory(ReplayMemory):
    def sample(self, batch_size, device):
        """
        Function to pick 'n' samples from the memory that are selected uniformly at random, such that n = batch_size

        If the memory holds at least batch_size experiences they are drawn without
        replacement; otherwise they are drawn with replacement so that the batch still
        contains batch_size entries.

        Parameters
        ---
        batch_size: int
            Number of elements to randomly sample from the memory in each batch
        device: str
            Name of the device (cuda or cpu) on which the computations would be performed

        Returns
        ---
        states, actions, next_states, rewards
            Tensors representing a batch of transitions sampled from the memory, moved to
            `device`. states, next_states and rewards are cast to float; actions keep the
            dtype numpy infers for them.

        Raises
        ---
        IndexError
            If the memory is empty and batch_size is positive (raised by random.choices)
        ValueError
            If batch_size is negative (raised by random.sample)
        """
        stored = len(self.buffer_state)
        if batch_size <= stored:
            indices_to_sample = random.sample(range(stored), batch_size)
        else:
            indices_to_sample = random.choices(range(stored), k=batch_size)

        def gather(buffer):
            # Build the array from the sampled entries only. Converting the whole
            # buffer and then indexing it costs O(capacity) on every call.
            return np.array([buffer[i] for i in indices_to_sample])

        states = torch.from_numpy(gather(self.buffer_state)).float().to(device)
        actions = torch.from_numpy(gather(self.buffer_action)).to(device)
        next_states = torch.from_numpy(gather(self.buffer_next_state)).float().to(device)
        rewards = torch.from_numpy(gather(self.buffer_reward)).float().to(device)

        return states, actions, next_states, rewards
 

In [10]:
'''
import random
sampled_indices = random.choices(range(10), k=5)
sampled_indices
'''

'\nimport random\nsampled_indices = random.choices(range(10), k=5)\nsampled_indices\n'

Returns the number of experiences currently stored in the replay buffer.

In [18]:
       
class ReplayMemory(ReplayMemory):
    def __len__(self):
        """
        Function that reports the number of elements present in the replay memory

        Parameters
        ---
        none

        Returns
        ---
        int
            number of currently stored elements in the replay buffer
        """
        # store() adds to all four buffers together, so the state buffer's length
        # is the number of stored experiences.
        stored_count = len(self.buffer_state)
        return stored_count