In [None]:
%run src/DQN/dqn_net.ipynb
%run src/DQN/replay_memory.ipynb

# Code

In [None]:
"""
Script that contains details how the DQN agent learns, updates the target network, selects actions and save/loads the model
"""

Import libraries

In [35]:
import random
import numpy as np

import torch
import torch.nn.functional as F

Read the configuration from the file whose path is stored in `builtins.current_filename`.

In [36]:
# Parse the experiment configuration (INI format) with the standard-library parser.
from configparser import ConfigParser
  
configur = ConfigParser()
import builtins
# NOTE(review): `current_filename` is not a standard attribute of `builtins`;
# presumably the notebook/script that runs this one sets it to the path of the
# config file beforehand -- confirm against the caller.
# `ConfigParser.read` skips files it cannot open and returns the list of files
# it parsed successfully, so a wrong path only surfaces later when an option is
# looked up.
configur.read(builtins.current_filename)

# Earlier hard-coded alternative, kept for reference:
# configur.read('config.ini')

['results/config.ini']

Store the values of the hyperparameters read from the `architecture` section of the configuration. Each one is described by the comment above it.

In [38]:
# All hyperparameters below come from the 'architecture' section of the
# configuration and are used as the default arguments of DQNAgent.__init__.

# discount factor (gamma) used when computing the return
discount1 = configur.getfloat('architecture', 'discount')
# value at which epsilon starts
eps_max1 = configur.getfloat('architecture', 'eps_max')
# lowest value epsilon is allowed to reach
eps_min1 = configur.getfloat('architecture', 'eps_min')
# multiplicative decay applied to epsilon
eps_decay1 = configur.getfloat('architecture', 'eps_decay')
# capacity of the replay memory buffer
memory_capacity1 = configur.getint('architecture', 'memory_capacity')
# learning rate handed to the networks
lr1 = configur.getfloat('architecture', 'lr')

Initialize the device. \
Initialize the epsilon for the epsilon greedy strategy. \
Initialize the discount, gamma for the return function. \
Initialize the state size and action size. \
Initialize the policy network and the target(value) network.\
Initialize the experience buffer.

In [40]:
class DQNAgent:
    """
    Agent that bundles the policy network, the target network and the replay memory used to train a DQN
    """
    def __init__(self, device, state_size, action_size, 
                    discount=discount1, 
                    eps_max=eps_max1, 
                    eps_min=eps_min1, 
                    eps_decay=eps_decay1, 
                    memory_capacity=memory_capacity1, 
                    lr=lr1, 
                    train_mode=True):
        """
        Function to set up the exploration parameters, seed the random number generators and build the two networks and the replay memory

        Parameters
        ---
        device:
            Device on which both networks are placed (passed to `.to()`)
        state_size:
            Size of the state vectors (passed to DQNNet)
        action_size:
            Number of possible actions (passed to DQNNet and used when sampling random actions)
        discount: float
            Discount factor (gamma) applied to the target Q-values
        eps_max: float
            Initial value of epsilon
        eps_min: float
            Lower bound of epsilon
        eps_decay: float
            Factor by which epsilon is multiplied on every decay step
        memory_capacity: int
            Capacity handed to the replay memory
        lr: float
            Learning rate handed to both networks
        train_mode: bool
            When False, the policy network is also switched to evaluation mode

        Returns
        ---
        none
        """

        self.device = device

        # dimensions of the problem: state vector size and number of actions
        self.state_size = state_size
        self.action_size = action_size

        # how strongly future rewards count (far-sighted vs. myopic agent)
        self.discount = discount

        # epsilon-greedy exploration: start value, floor and decay factor
        self.epsilon = eps_max
        self.epsilon_min = eps_min
        self.epsilon_decay = eps_decay

        # seed the Python and torch (CPU and CUDA) generators with the same
        # fixed value before the networks are created
        seed = 0
        random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)

        def build_net():
            # both networks share the same architecture and learning rate
            return DQNNet(self.state_size, self.action_size, lr).to(self.device)

        # network for the current policy, followed by its target network
        self.policy_net = build_net()
        self.target_net = build_net()

        # the target net is never trained directly, so keep it in eval mode
        self.target_net.eval()
        if not train_mode:
            self.policy_net.eval()

        # replay buffer that stores the experiences
        self.memory = ReplayMemory(capacity=memory_capacity)

Make the action selection process purely greedy.

In [None]:
class DQNAgent(DQNAgent):
    def turn_off_exploration(self):
        """
        Function to make the action selection purely greedy by setting epsilon to zero

        Parameters
        ---
        none

        Returns
        ---
        none
        """
        self.epsilon = 0

Update the target (value) DQN with the policy DQN parameters.

In [41]:
class DQNAgent(DQNAgent):
    def updateTargetNet(self):
        """
        Function to synchronise the (frozen) target net with the current policy net by copying the policy net's state dict into it

        Parameters
        ---
        none

        Returns
        ---
        none
        """
        policy_weights = self.policy_net.state_dict()
        self.target_net.load_state_dict(policy_weights)

Decay the value of epsilon, never letting it drop below its minimum.

In [None]:
class DQNAgent(DQNAgent):
    def updateEpsilon(self):
        """
        Function to anneal epsilon for the epsilon-greedy exploration: epsilon is multiplied by the decay factor and clamped from below at epsilon_min

        Parameters
        ---
        none

        Returns
        ---
        none
        """
        decayed = self.epsilon * self.epsilon_decay
        floor = self.epsilon_min
        # keep the decayed value only while it is still above the floor
        self.epsilon = decayed if decayed > floor else floor

Select the action in an epsilon-greedy manner. \
If a randomly drawn number is less than epsilon, return a random integer representing an action. \
Otherwise convert the current state to a tensor and feed it to the policy network. \
Choose the argmax of the Q-values returned by the policy network.

In [42]:
class DQNAgent(DQNAgent):
    def selectAction(self, state):
        """
        Function to return the appropriate action for the given state using epsilon-greedy exploration.
        If a randomly generated number is less than epsilon, the agent performs a random action; otherwise it executes the action with the highest Q-value predicted by the policy network.
        Once epsilon has been set to 0 (see turn_off_exploration), the returned action is always the greedy one.

        Parameters
        ---
        state: vector or tensor
            The current state of the environment as observed by the agent.
            A state that is not a tensor is wrapped into a float32 batch of one on self.device; a tensor is passed to the network as given.

        Returns
        ---
        action: int
            Index of the selected action
        """

        # explore: the chance of a random action shrinks as epsilon decays
        if random.random() < self.epsilon:
            return random.randrange(self.action_size)

        if not torch.is_tensor(state):
            state = torch.tensor([state], dtype=torch.float32).to(self.device)

        # exploit: pick the action with maximum Q-value as per the policy Q-network.
        # The module is called directly (rather than via .forward) so that any
        # registered hooks are run; no gradients are needed for action selection.
        with torch.no_grad():
            q_values = self.policy_net(state)
        return torch.argmax(q_values).item() # since actions are discrete, return index that has highest Q

Sample a batch of `batchsize` experiences from the replay buffer. \
Use the policy DQN to predict the Q-values of all possible actions for each sampled state, and keep the Q-value of the action that was actually taken in each experience (`q_pred`). \
Then evaluate the next states with the target (value) DQN. Taking `max` over these Q-values returns the maximum values and their indices for each experience; keep only the values. \
Then discount the Q-values of the target DQN by gamma and add them to the rewards to obtain the return `y_j`. \
Define the loss as the mean-squared error between `q_pred` and the return `y_j`. \
Then update the policy network and return the MSE loss.

In [None]:
class DQNAgent(DQNAgent):
    def learn(self, batchsize):
        """
        Function to perform one update of the policy network on a batch of experiences sampled from the replay memory (the DQN learning step).

        Parameters
        ---
        batchsize: int
            Number of experiences to be randomly sampled from the memory for the agent to learn from

        Returns
        ---
        loss: float or None
            Mean-squared error between the predicted Q-values and the targets for the sampled batch, computed before the optimizer step.
            None when the memory holds fewer than batchsize experiences, in which case no update is performed.
        """

        # wait until the memory holds at least one full batch
        if len(self.memory) < batchsize:
            return None
        states, actions, next_states, rewards = self.memory.sample(batchsize, self.device)

        # get q values of the actions that were taken, i.e calculate qpred;
        # actions vector has to be explicitly reshaped to nx1-vector
        q_pred = self.policy_net(states).gather(1, actions.view(-1, 1))

        # calculate target q-values, such that yj = rj + discount * max_a' q_target(s', a').
        # The targets are constants for this update, so they are computed without
        # gradient tracking; otherwise backward() would also compute (and keep
        # accumulating) gradients on the frozen target network.
        # NOTE(review): terminal transitions are not special-cased here because
        # memory.sample returns no done flags -- confirm whether yj should be
        # just rj for terminal states.
        with torch.no_grad():
            q_target = self.target_net(next_states).max(dim=1).values # max returns values and indices, keep the values
            y_j = (rewards + (self.discount * q_target)).view(-1, 1)

        # calculate the loss as the mean-squared error of qpred and yj
        # (mse_loss already averages over the batch) and update the policy net
        self.policy_net.optimizer.zero_grad()
        loss = F.mse_loss(q_pred, y_j)
        loss.backward()
        self.policy_net.optimizer.step()

        # TODO (from original author): get loss from current values of q_pred and y_j
        return float(loss)

Save the policy model.

In [None]:
class DQNAgent(DQNAgent):
    def saveModel(self, filename):
        """
        Function to store the policy network by delegating to the network's own saveModel

        Parameters
        ---
        filename: str
            Location of the file where the model is to be saved

        Returns
        ---
        none
        """
        # only the policy net is saved; the target net is not written out here
        net = self.policy_net
        net.saveModel(filename)

Load the policy network from the file.

In [None]:
class DQNAgent(DQNAgent):
    def loadModel(self, filename):
        """
        Function to restore the policy network's parameters by delegating to the network's own loadModel

        Parameters
        ---
        filename: str
            Location of the file from where the model is to be loaded

        Returns
        ---
        none
        """
        # the agent's device is forwarded to the network's loader
        net = self.policy_net
        net.loadModel(filename=filename, device=self.device)

Convert the state to a tensor. \
Obtain the q_values from the target DQN. \
Find the maximum q_value and return it.

In [None]:
class DQNAgent(DQNAgent):
    def getQValue(self, state):
        """
        Function to return the highest Q-value that the target network predicts for the given state

        Parameters
        ---
        state: vector or tensor
            The current state of the environment as observed by the agent.
            A state that is not a tensor is wrapped into a float32 batch of one on self.device.

        Returns
        ---
        max_q_value: float
            The maximum over the Q-values predicted for the given state
        """

        already_tensor = torch.is_tensor(state)
        if not already_tensor:
            state = torch.tensor([state], dtype=torch.float32).to(self.device)

        # no gradients are needed for a pure evaluation
        with torch.no_grad():
            # TODO policy_net or target_net?
            q_values = self.target_net.forward(state)

        # max over the action dimension gives (values, indices); keep the values
        best_q = q_values.max(dim=1).values
        return float(best_q)