---
---
# NEURAL NETWORKS AND DEEP LEARNING

### A.A. 2021/22 (6 CFU) - Dr. Alberto Testolin, Dr. Umberto Michieli

### Saverio Monaco
##### MAT: 2012264

# Homework 3 - Deep Reinforcement Learning
---
---

In [1]:
#################
#### IMPORTS ####
#################

# Arrays
import numpy as np
from collections import deque # fixed size FIFO list

# Deep Learning Stuff
import torch
from torch import nn
import gym

# Visualizing
import matplotlib.pyplot as plt
from gym.wrappers import Monitor
from IPython.display import HTML
from IPython import display as ipythondisplay
import base64

# Other
import random
from tqdm.notebook import tqdm
import io
import os
import glob


In [2]:
# Set random seeds for reproducibility
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)


In [3]:
# Check if the GPU is available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Training device: {device}")


Training device: cuda


In [4]:
#################
#### CLASSES ####
#################

class ReplayMemory(object):
    '''
    To perform experience replay.
    We will draw uniformly at random from the pool of stored sample to learn.
    Thi avoids (temporal) correlation between consecutive learning instances.
    '''
    def __init__(self, capacity):
        ''' Initialize a deque with maximum capacity maxlen. '''
        self.memory = deque(maxlen=capacity) # Define a queue with maxlen "capacity"

    def push(self, state, action, next_state, reward):
        ''' Add a new sample to the deque, removes the oldest one if it is already full. '''
        # Add the tuple (state, action, next_state, reward) to the queue
        self.memory.append( (state, action, next_state, reward) )
        
    def sample(self, batch_size):
        ''' Randomly select "batch_size" samples '''
        batch_size = min(batch_size, len(self)) # Get all the samples if the requested batch_size is higher than the number of sample currently in the memory
        return random.sample(self.memory, batch_size)

    def __len__(self):
        ''' Return the number of samples currently stored in the memory '''
        return len(self.memory)

class DQN(nn.Module):
    ''' 
    Network for policy network and target network 
    state_space_dim:  (INPUT)  dimension of state space (e.g pixels in a image)
    action_space_dim: (OUTPUT) dimension of action space (e.g go left, go right)
    '''
    def __init__(self, DQN_state_space_dim, DQN_action_space_dim):
        super().__init__()
            
        self.sdim = DQN_state_space_dim
        self.adim = DQN_action_space_dim
            
        self.linear = nn.Sequential(
            nn.Linear(self.sdim, 128),
            nn.Tanh(),
            nn.Linear(128,128),
            nn.Tanh(),
            nn.Linear(128,self.adim)
                )

    def forward(self, x):
        return self.linear(x)
    
#
#          |---------> [Prediction Network (DQN)]--------
#          |                |                            \
# [INPUT]--|                | Parameter update            \___Loss
#          |               \/                            /
#          |---------> [Target Network (DQN)]------------
#
class FullQNets(nn.Module):
    ''' 
    Handles all the networks, environments, and others
    '''
    def __init__(self, envname):
        super().__init__()
        
        self.envname = envname
        
        tempenv = gym.make(self.envname) 
        self.state_space_dim = tempenv.observation_space.shape[0]
        self.action_space_dim = tempenv.action_space.n
        
        self.policy_net = DQN(self.state_space_dim, self.action_space_dim)
        self.target_net = DQN(self.state_space_dim, self.action_space_dim)
        self.target_net.load_state_dict(self.policy_net.state_dict())
    
    def choose_action_epsilon_greedy(self, state, epsilon):
        self.policy_net.eval()
        if epsilon > 1 or epsilon < 0:
            raise Exception('The epsilon value must be between 0 and 1')
                
        # Evaluate the network output from the current state
        with torch.no_grad():
            self.policy_net.eval()
            state = torch.tensor(state, dtype=torch.float32, device=device) # Convert the state to tensor
            net_out = self.policy_net(state)

        # Get the best action (argmax of the network output)
        best_action = int(net_out.argmax())
        # Get the number of possible actions
        action_space_dim = net_out.shape[-1]

        # Select a non optimal action with probability epsilon, otherwise choose the best action
        if random.random() < epsilon:
            # List of non-optimal actions
            non_optimal_actions = [a for a in range(action_space_dim) if a != best_action]
            # Select randomly
            action = random.choice(non_optimal_actions)
        else:
            # Select best action
            action = best_action
        
        return action, net_out.cpu().numpy()
    
    def choose_action_softmax(self, state, temperature):
        self.policy_net.to(device)
        if temperature < 0:
            raise Exception('The temperature value must be greater than or equal to 0 ')
        
        # If the temperature is 0, just select the best action using the eps-greedy policy with epsilon = 0
        if temperature == 0:
            return self.choose_action_epsilon_greedy(state, 0)
    
        # Evaluate the network output from the current state
        with torch.no_grad():
            self.policy_net.eval()
            state = torch.tensor(state, dtype=torch.float32, device=device)
            net_out = self.policy_net(state)

        # Apply softmax with temp
        temperature = max(temperature, 1e-8) # set a minimum to the temperature for numerical stability
        softmax_out = nn.functional.softmax(net_out / temperature, dim=0).cpu().numpy()
                
        # Sample the action using softmax output as mass pdf
        all_possible_actions = np.arange(0, softmax_out.shape[-1])
        action = np.random.choice(all_possible_actions, p=softmax_out) # this samples a random element from "all_possible_actions" with the probability distribution p (softmax_out in this case)
    
        return action, net_out.cpu().numpy()
    
    def update_step(self, replay_mem, gamma, optimizer, loss_fn, batch_size):
        self.policy_net.to(device)
        self.target_net.to(device)
        # Sample the data from the replay memory
        batch = replay_mem.sample(batch_size)
        batch_size = len(batch)

        # Create tensors for each element of the batch
        states      = torch.tensor([s[0] for s in batch], dtype=torch.float32, device=device)
        actions     = torch.tensor([s[1] for s in batch], dtype=torch.int64, device=device)
        rewards     = torch.tensor([s[3] for s in batch], dtype=torch.float32, device=device)

        # Compute a mask of non-final states (all the elements where the next state is not None)
        non_final_next_states = torch.tensor([s[2] for s in batch if s[2] is not None], dtype=torch.float32, device=device) # the next state can be None if the game has ended
        non_final_mask = torch.tensor([s[2] is not None for s in batch], dtype=torch.bool, device=device)

        # Compute all the Q values (forward pass)
        self.policy_net.train()
        q_values = self.policy_net(states)
        # Select the proper Q value for the corresponding action taken Q(s_t, a)
        state_action_values = q_values.gather(1, actions.unsqueeze(1))

        # Compute the value function of the next states using the target network V(s_{t+1}) = max_a( Q_target(s_{t+1}, a)) )
        with torch.no_grad():
            self.target_net.eval()
            q_values_target = self.target_net(non_final_next_states)
        next_state_max_q_values = torch.zeros(batch_size, device=device)
        next_state_max_q_values[non_final_mask] = q_values_target.max(dim=1)[0]

        # Compute the expected Q values
        expected_state_action_values = rewards + (next_state_max_q_values * gamma)
        expected_state_action_values = expected_state_action_values.unsqueeze(1) # Set the required tensor shape

        # Compute the Huber loss
        loss = loss_fn(state_action_values, expected_state_action_values)

        # Optimize the model
        optimizer.zero_grad()
        loss.backward()
        # Apply gradient clipping (clip all the gradients greater than 2 for training stability)
        nn.utils.clip_grad_norm_(self.policy_net.parameters(), 2)
        optimizer.step()
        
        return optimizer
    
    def training_loop(self, max_epoch, loss_fn, render=False, initial_value=5, gamma = 0.97, 
                 replay_memory_capacity = 10000, lr = 1e-2, target_net_update_steps = 10, 
                 batch_size = 128, bad_state_penalty = 0, min_samples_for_training = 1000):
        '''
        PARAMETERS
        gamma: gamma parameter for the long term reward
        replay_memory_capacity: Replay memory capacity
        lr: Optimizer learning rate
        target_net_update_steps: Number of episodes to wait before updating the target network
        batch_size: Number of samples to take from the replay memory for each update
        bad_state_penalty: Penalty to the reward when we are in a bad state (in this case when the pole falls down) 
        min_samples_for_training: Minimum samples in the replay memory to enable the training
        '''
        env = gym.make(self.envname)
        optimizer = torch.optim.SGD(self.policy_net.parameters(), lr=lr)
        
        # We compute the exponential decay in such a way the shape of the exploration 
        # profile does not depend on the number of iterations
        exp_decay = np.exp(-np.log(initial_value) / max_epoch * 6) 
        exploration_profile = [initial_value * (exp_decay ** i) for i in range(max_epoch)]
        
        ### Initialize the replay memory
        replay_mem = ReplayMemory(replay_memory_capacity)   
        
        env.seed(0)
        for episode_num, tau in enumerate(tqdm(exploration_profile)):
            # Reset the environment and get the initial state
            state = env.reset()
            
            # Reset the score. The final score will be the total amount of steps before the pole falls
            score = 0
            
            done = False
            
            # Go on until the pole falls off
            while not done:
                # Choose the action following the policy
                action, q_values = self.choose_action_softmax(state, temperature=tau)
                
                # Apply the action and get the next state, the reward and a flag "done" 
                # that is True if the game is ended
                next_state, reward, done, info = env.step(action)
                
                # We apply a (linear) penalty when the cart is far from center
                pos_weight = 1
                reward = reward - pos_weight * np.abs(state[0])
                
                # Update the final score (+1 for each step)
                score += 1
                
                # Update the replay memory
                replay_mem.push(state, action, next_state, reward)

                # Update the network
                if len(replay_mem) > min_samples_for_training: # we enable the training only if we have enough samples in the replay memory, otherwise the training will use the same samples too often
                    optimizer = self.update_step(replay_mem, gamma, optimizer, loss_fn, batch_size)
                
                if render:
                    # Visually render the environment (disable to speed up the training)
                    env.render()

                # Set the current state for the next iteration
                state = next_state

            # Update the target network every target_net_update_steps episodes
            if episode_num % target_net_update_steps == 0:
                print('Updating target network...')
                self.target_net.load_state_dict(self.policy_net.state_dict()) # This will copy the weights of the policy network to the target network

            # Print the final score
            print(f"EPISODE: {episode_num + 1} - FINAL SCORE: {score} - Temperature: {tau}") # Print the final score

        env.close()
    
    def play_a_game(self):
        # Initialize the Gym environment
        env = gym.make(self.envname) 
        
        # Reset the environment and get the initial state
        state = env.reset()
        # Reset the score. The final score will be the total amount of steps before the pole falls
        score = 0
        done = False
        # Go on until the pole falls off or the score reach 490
        while not done:
            # Choose the best action (temperature 0)
            action, q_values = self.choose_action_softmax(state, temperature=0)
            # Apply the action and get the next state, the reward and a flag "done" that is True if the game is ended
            next_state, reward, done, info = env.step(action)
            # Visually render the environment
            env.render()
            # Update the final score (+1 for each step)
            score += reward 
            # Set the current state for the next iteration
            state = next_state
            # Check if the episode ended (the pole fell down)
        # Print the final score
        print(f"SCORE: {score}") 
        env.close()

In [5]:
### Initialize the loss function (Huber loss)
loss_fn = nn.SmoothL1Loss()


In [6]:
a = FullQNets('CartPole-v1')

In [10]:
a.play_a_game()


SCORE: 165.0


In [11]:
a.training_loop(700, loss_fn, render=False, lr=.2)

  0%|          | 0/700 [00:00<?, ?it/s]

Updating target network...
EPISODE: 1 - FINAL SCORE: 12 - Temperature: 5.0
EPISODE: 2 - FINAL SCORE: 17 - Temperature: 4.931497676847884
EPISODE: 3 - FINAL SCORE: 41 - Temperature: 4.863933867351216
EPISODE: 4 - FINAL SCORE: 19 - Temperature: 4.797295713436853
EPISODE: 5 - FINAL SCORE: 24 - Temperature: 4.7315705331932305
EPISODE: 6 - FINAL SCORE: 16 - Temperature: 4.666745818456864
EPISODE: 7 - FINAL SCORE: 30 - Temperature: 4.6028092324319205
EPISODE: 8 - FINAL SCORE: 29 - Temperature: 4.539748607342402
EPISODE: 9 - FINAL SCORE: 15 - Temperature: 4.477551942116494
EPISODE: 10 - FINAL SCORE: 21 - Temperature: 4.416207400102644
Updating target network...
EPISODE: 11 - FINAL SCORE: 31 - Temperature: 4.355703306816926
EPISODE: 12 - FINAL SCORE: 59 - Temperature: 4.2960281477212625
EPISODE: 13 - FINAL SCORE: 29 - Temperature: 4.2371705660321055
EPISODE: 14 - FINAL SCORE: 26 - Temperature: 4.179119360559112
EPISODE: 15 - FINAL SCORE: 18 - Temperature: 4.121863483573455
EPISODE: 16 - FINAL 

  states      = torch.tensor([s[0] for s in batch], dtype=torch.float32, device=device)


EPISODE: 43 - FINAL SCORE: 43 - Temperature: 2.801178618966271
EPISODE: 44 - FINAL SCORE: 16 - Temperature: 2.7628011703736255
EPISODE: 45 - FINAL SCORE: 42 - Temperature: 2.7249495106580297
EPISODE: 46 - FINAL SCORE: 15 - Temperature: 2.6876164362675703
EPISODE: 47 - FINAL SCORE: 34 - Temperature: 2.650794842342343
EPISODE: 48 - FINAL SCORE: 15 - Temperature: 2.614477721362323
EPISODE: 49 - FINAL SCORE: 45 - Temperature: 2.5786581618137694
EPISODE: 50 - FINAL SCORE: 15 - Temperature: 2.543329346873888
Updating target network...
EPISODE: 51 - FINAL SCORE: 18 - Temperature: 2.508484553113525
EPISODE: 52 - FINAL SCORE: 8 - Temperature: 2.47411714921763
EPISODE: 53 - FINAL SCORE: 39 - Temperature: 2.4402205947232507
EPISODE: 54 - FINAL SCORE: 16 - Temperature: 2.4067884387748144
EPISODE: 55 - FINAL SCORE: 41 - Temperature: 2.3738143188964687
EPISODE: 56 - FINAL SCORE: 29 - Temperature: 2.3412919597812354
EPISODE: 57 - FINAL SCORE: 10 - Temperature: 2.3092151720967586
EPISODE: 58 - FINAL S

In [13]:
a.play_a_game()

SCORE: 500.0


---
## Resources:
* [Playing Atari with Deep Reinforcement Learning](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf)