In [None]:
!pip install gymnasium -qU
!pip install wandb -qU

In [None]:
import gymnasium as gym
import torch
from torch import nn, optim
from torch.nn import functional as F
import random
import numpy as np
from collections import namedtuple, deque
from scipy.special import softmax
import matplotlib.pyplot as plt
import wandb
from tqdm.notebook import tqdm
import uuid

In Programming Assignment 2, we are required to implement two algorithms, `Monte Carlo - REINFORCE` and `Dueling Deep Q-learning Network (Dueling DQN)`. Afterwards, the goal is to run a hyperparameter search to find the best hyperparameters for both algorithms on two environments, `CartPole-v1` and `Acrobot-v1`. 

In this Jupyter notebook, let's implement the `Dueling DQN` algorithm. 

In [None]:
# Log in to Weights & Biases (wandb).
# On Kaggle the API key is read from the Kaggle secret named "wandb_api".
# Anywhere else the kaggle_secrets package is not importable, so fall back to
# wandb's default login (credentials saved by an earlier `wandb login`, the
# WANDB_API_KEY environment variable, or an interactive prompt). This keeps the
# cell safe to run under "Restart & Run All" both on and off Kaggle.
try:
    from kaggle_secrets import UserSecretsClient
except ImportError:
    wandb.login()
else:
    user_secrets = UserSecretsClient()
    api = user_secrets.get_secret("wandb_api")
    wandb.login(key=api)

In [None]:
# Run on the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

In [None]:
class DuelingQNetwork(nn.Module):
    """Dueling Q-network: a shared trunk feeding separate value and advantage heads.

    The heads are combined as ``Q(s, a) = V(s) + (A(s, a) - f(A(s, .)))`` where
    ``f`` is the mean over actions for ``algo_type='Type1'`` and the max over
    actions for ``algo_type='Type2'``.

    Args:
        state_space_size: number of input features per state.
        action_space_size: number of discrete actions (width of the Q output).
        num_layer: number of hidden layers in the shared trunk (at least 1).
        layer_size: width of the trunk layers; either one int used for every
            layer or a sequence with at least ``num_layer`` entries (only the
            first ``num_layer`` are used).
        value_fn_layer_size: hidden width of the value head.
        advantage_fn_layer_size: hidden width of the advantage head.
        algo_type: ``'Type1'`` (mean baseline) or ``'Type2'`` (max baseline).

    Raises:
        ValueError: if ``algo_type`` is not supported, ``num_layer`` is below 1,
            or ``layer_size`` provides fewer than ``num_layer`` widths.
    """

    SUPPORTED_TYPES = ('Type1', 'Type2')

    def __init__(self,state_space_size,action_space_size,num_layer,layer_size,value_fn_layer_size,advantage_fn_layer_size,algo_type='Type1'):

        super().__init__()

        # Fail at construction time; otherwise an unknown type would only surface
        # as an UnboundLocalError inside forward().
        if algo_type not in self.SUPPORTED_TYPES:
            raise ValueError(
                f"algo_type must be one of {self.SUPPORTED_TYPES}, got {algo_type!r}"
            )
        if num_layer < 1:
            raise ValueError(f"num_layer must be at least 1, got {num_layer}")
        self.algo_type = algo_type

        if isinstance(layer_size, int):
            hidden_sizes = [layer_size] * num_layer
        else:
            hidden_sizes = list(layer_size)[:num_layer]
        if len(hidden_sizes) < num_layer:
            raise ValueError(
                f"layer_size provides {len(hidden_sizes)} widths but num_layer is {num_layer}"
            )

        # Shared trunk: Linear + ReLU for every hidden layer.
        layers = []
        in_features = state_space_size
        for out_features in hidden_sizes:
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.ReLU())
            in_features = out_features

        self.state_approximator = nn.Sequential(*layers)

        # Both heads read the output of the last trunk layer that was actually
        # built (in_features), not layer_size[-1], so surplus entries in
        # layer_size cannot cause a shape mismatch.
        self.value_fn_network = nn.Sequential(
            nn.Linear(in_features, value_fn_layer_size),
            nn.ReLU(),
            nn.Linear(value_fn_layer_size, 1)
        )

        self.advantage_fn_network = nn.Sequential(
            nn.Linear(in_features, advantage_fn_layer_size),
            nn.ReLU(),
            nn.Linear(advantage_fn_layer_size, action_space_size)
        )

    def forward(self,state_obs):
        """Return Q-values of shape (batch, action_space_size).

        A 1-D ``state_obs`` is treated as a single state and gets a leading
        batch dimension, so the result is then of shape (1, action_space_size).
        """
        if state_obs.dim() == 1:
            state_obs = state_obs.unsqueeze(0)

        features = self.state_approximator(state_obs)
        value_fn = self.value_fn_network(features)
        advantage_fn = self.advantage_fn_network(features)

        if self.algo_type == 'Type1':
            baseline = torch.mean(advantage_fn, dim=1, keepdim=True)
        else:  # 'Type2' (validated in __init__)
            baseline = torch.max(advantage_fn, dim=1, keepdim=True).values

        # value_fn is (batch, 1) and broadcasts over the action dimension.
        return value_fn + (advantage_fn - baseline)

In [None]:
class ReplayBuffer:
    """Fixed-size FIFO store of transitions with uniform random mini-batch sampling.

    Args:
        batch_size: number of transitions returned by ``sample``.
        buffer_size: maximum number of stored transitions; once full, the oldest
            transition is dropped for every new one.
        seed: seed applied to Python's global ``random`` module, which is the
            generator ``sample`` draws from.
    """

    def __init__(self,batch_size,buffer_size,seed):
        # random.seed() returns None, so remember the seed value itself.
        random.seed(seed)
        self.seed = seed
        self.batch_size = batch_size
        self.memory = deque(maxlen=buffer_size)
        self.experience = namedtuple("Experience",field_names=['state','action','reward','next_state','terminated','truncated'])

    def add(self,state,action,reward,next_state,terminated,truncated):
        """Append one transition to the buffer."""
        experience = self.experience(state,action,reward,next_state,terminated,truncated)
        self.memory.append(experience)

    def sample(self):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            Tuple ``(states, actions, rewards, next_states, terminated, truncated)``
            of tensors moved to the notebook-level ``device``. Every field is
            stacked row-wise with one row per transition; actions are ``long``,
            all other fields ``float`` (the two flags become 0.0 / 1.0).

        Raises:
            ValueError: raised by ``random.sample`` when the buffer holds fewer
                than ``batch_size`` transitions.
        """
        experiences = random.sample(self.memory, self.batch_size)

        def stack(field):
            # One row per sampled transition.
            return np.vstack([getattr(e, field) for e in experiences])

        states = torch.from_numpy(stack('state')).float().to(device)
        action = torch.from_numpy(stack('action')).long().to(device)
        reward = torch.from_numpy(stack('reward')).float().to(device)
        next_state = torch.from_numpy(stack('next_state')).float().to(device)
        # Flags go through uint8 first so they end up as 0.0 / 1.0 floats.
        terminated = torch.from_numpy(stack('terminated').astype(np.uint8)).float().to(device)
        truncated = torch.from_numpy(stack('truncated').astype(np.uint8)).float().to(device)

        return (states,action,reward,next_state,terminated,truncated)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

In [None]:
class SoftmaxPolicy:
    """Softmax (Boltzmann) action selection with a decaying temperature ``tau``."""

    def __init__(self, tau, tau_decay, min_tau):
        self.tau, self.tau_decay, self.min_tau = tau, tau_decay, min_tau

    def decay_policy_param(self):
        """Multiply ``tau`` by ``tau_decay`` without letting it drop below ``min_tau``."""
        decayed = self.tau * self.tau_decay
        self.tau = decayed if decayed > self.min_tau else self.min_tau

    def get_action(self, q_values):
        """Sample an action index with probability softmax(q_values / tau)."""
        flat_q = q_values.detach().cpu().numpy().ravel()
        # Shift by the maximum before scaling so the largest exponent is 0.
        shifted = flat_q - np.max(flat_q)
        probabilities = softmax(shifted / self.tau)
        candidates = np.arange(len(flat_q))
        return np.random.choice(candidates, p=probabilities)

In [None]:
class EpsilonGreedyPolicy:
    """Epsilon-greedy action selection with a decaying exploration rate.

    Args:
        epsilon: initial probability of picking a uniformly random action.
        epsilon_decay: factor applied to ``epsilon`` by ``decay_policy_param``.
        min_epsilon: lower bound for ``epsilon``.
    """

    def __init__(self,epsilon,epsilon_decay,min_epsilon):
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.min_epsilon = min_epsilon

    def decay_policy_param(self):
        """Multiply ``epsilon`` by ``epsilon_decay`` without going below ``min_epsilon``."""
        self.epsilon = max(self.min_epsilon,self.epsilon * self.epsilon_decay)

    def get_action(self,q_values):
        """Return a random action with probability ``epsilon``, else the greedy one.

        ``q_values`` may be 1-D or carry a leading batch dimension of size 1; it
        is flattened first so the number of actions is counted correctly.
        """
        # Flatten: for a (1, n_actions) input len() would otherwise be 1 and the
        # "random" action would always be 0.
        q_values = q_values.detach().cpu().numpy().ravel()
        if random.random() < self.epsilon:
            action = np.random.choice(np.arange(len(q_values)))
        else:
            action = np.argmax(q_values)
        return action

We have completed the implementation of `Dueling DQN` above. Let's run a hyperparameter search to find the best hyperparameters for both environments. To run the hyperparameter search, we will use `wandb`. 

In [None]:
def create_config_group_name(config):
    """Build the underscore-separated label that identifies one hyperparameter setting.

    Every hyperparameter read from ``config`` contributes one tagged field, so
    runs sharing a configuration produce the same string.
    """
    fields = [
        f"{config.env_name}",
        f"{config.experiment}",
        f"{config.type}",
        f"nl{config.num_layer}",
        f"ls{config.layer_size}",
        f"vf{config.value_fn_layer_size}",
        f"af{config.advantage_fn_layer_size}",
        f"freq{config.target_network_replacement_freq}",
        f"lr{config.learning_rate:.0e}",
        f"{config.exploration_policy}",
        f"decay{config.param_decay:.4f}",
        f"p{config.policy_param}",
        f"pmin{config.policy_param_min_value}",
        f"batch{config.batch_size}",
        f"gamma{config.gamma}",
    ]
    return "_".join(fields)


In [None]:
def train(config=None):
    """Train a Dueling DQN agent for one wandb run and log its learning curve.

    Args:
        config: dict of hyperparameters handed to ``wandb.init``. It defaults to
            ``None`` so the function can also be passed to ``wandb.agent``; the
            values are then read from ``wandb.config``. Keys used here:
            ``env_name``, ``type``, ``num_layer``, ``layer_size``,
            ``value_fn_layer_size``, ``advantage_fn_layer_size``,
            ``learning_rate``, ``batch_size``, ``gamma``,
            ``target_network_replacement_freq``, ``exploration_policy``,
            ``policy_param``, ``param_decay`` and ``policy_param_min_value``
            (plus the ones ``create_config_group_name`` reads).

    Logs per episode: ``episodic_reward`` and ``mean_score_over_last_100_episodes``;
    at the end: ``Total Mean Reward`` (running mean of all episodic rewards).

    Raises:
        ValueError: if ``config.exploration_policy`` is neither
            ``'EpsilonGreedy'`` nor ``'Softmax'``.
    """
    max_episodes = 1000
    buffer_size = 10000
    seed = 200

    wandb.init(
        project= "DA6400 Assignment 2",
        config = config
    )

    # adding an attribute to group various runs with the same configuration
    config = wandb.config
    config_group = create_config_group_name(config)
    wandb.config.update({"config_group": config_group}, allow_val_change=True)

    # random suffix keeps the names of repeated runs of one configuration distinct
    wandb.run.name = f"{config_group}_run_{uuid.uuid4().hex[:4]}"

    # initialise environment - passed as config parameter
    env = gym.make(config.env_name)
    try:
        state_space_size = env.observation_space.shape[0]
        action_space_size = env.action_space.n

        # Learning is paused while the 100-episode mean is at or above this
        # threshold. Environments without a threshold are never considered solved.
        reward_threshold = env.spec.reward_threshold
        if reward_threshold is None:
            reward_threshold = float('inf')

        # initialise replay buffer, local network, etc
        memory = ReplayBuffer(config.batch_size,buffer_size,seed)
        local_network = DuelingQNetwork(state_space_size,
                                       action_space_size,
                                       config.num_layer,
                                       config.layer_size,
                                       config.value_fn_layer_size,
                                       config.advantage_fn_layer_size,
                                       config.type).to(device)
        target_network = DuelingQNetwork(state_space_size,
                                       action_space_size,
                                       config.num_layer,
                                       config.layer_size,
                                       config.value_fn_layer_size,
                                       config.advantage_fn_layer_size,
                                       config.type).to(device)
        # Start the target network as an exact copy of the local network so the
        # first bootstrapped targets do not come from unrelated random weights.
        target_network.load_state_dict(local_network.state_dict())
        optimizer = optim.Adam(local_network.parameters(),lr = config.learning_rate)

        # based on the config, initialise the policy for action selection
        if config.exploration_policy == 'EpsilonGreedy':
            policy = EpsilonGreedyPolicy(config.policy_param,
                                        config.param_decay,
                                        config.policy_param_min_value)
        elif config.exploration_policy == 'Softmax':
            policy = SoftmaxPolicy(config.policy_param,
                                        config.param_decay,
                                        config.policy_param_min_value)
        else:
            raise ValueError("Policy not implemented")

        # Let's start the training and sampling
        # we will use samples of the replay buffer to train the network

        scores_window = deque(maxlen = 100)
        step_counter = 0
        total_mean_reward = 0 # summary statistic that will be maximized instead of minimizing regret
        # -inf (not 0) so that environments with negative rewards, whose threshold
        # is below 0, are not treated as solved before the first episode ends.
        mean_scores = float('-inf')
        for episode in tqdm(range(max_episodes)):
            state, _ = env.reset()
            done = False
            episodic_reward = 0
            while not done:
                local_network.eval()
                with torch.no_grad():
                    q_values = local_network(torch.from_numpy(state).float().to(device))
                action = policy.get_action(q_values)
                local_network.train()

                next_state, reward,terminated,truncated, _ = env.step(action)
                memory.add(state,action,reward,next_state,terminated,truncated)
                done = terminated or truncated

                state = next_state
                episodic_reward += reward

                if len(memory) > config.batch_size and mean_scores < reward_threshold:
                    # training step on one sampled mini-batch; batch names are kept
                    # distinct from the per-step terminated/truncated flags above
                    states,actions,rewards,next_states,terminated_batch,_ = memory.sample()
                    with torch.no_grad():
                        q_next = target_network(next_states).max(1)[0].unsqueeze(1)
                    # bootstrap only from non-terminal next states
                    q_targets = rewards + (1-terminated_batch) * config.gamma * q_next

                    q_expected = local_network(states).gather(1,actions)
                    loss = F.mse_loss(q_expected,q_targets)

                    optimizer.zero_grad()
                    loss.backward()

                    # clamp every gradient element to [-1, 1]
                    nn.utils.clip_grad_value_(local_network.parameters(), 1)

                    optimizer.step()

                # copy the local weights into the target network every
                # target_network_replacement_freq environment steps
                step_counter = (step_counter + 1) % config.target_network_replacement_freq
                if step_counter == 0:
                    target_network.load_state_dict(local_network.state_dict())

            # exploration parameter decays once per episode
            policy.decay_policy_param()
            scores_window.append(episodic_reward)

            mean_scores = np.mean(scores_window)
            wandb.log({
                "mean_score_over_last_100_episodes": mean_scores,
                "episodic_reward": episodic_reward
            })

            # incremental running mean over all episodes so far
            total_mean_reward += (episodic_reward - total_mean_reward) / (episode + 1)
    finally:
        # release the environment even if training is interrupted
        env.close()

    wandb.log({
        'Total Mean Reward' : total_mean_reward
    })

    wandb.finish()
    

We have defined one specific sweep config for one of the environments. We will modify the code below to run the sweep for all the environments.

In [None]:
# Grid sweep for Type2 Dueling DQN on CartPole-v1.
# Keys with "value" are held fixed; keys with "values" are swept, which gives
# 3 * 3 * 2 * 2 * 2 = 72 combinations.
sweep_config = {
    "name":"Hyperparameter tuning for type 2 D2QN, cartpole",
    "method":"grid",
    "metric": {
        "goal":"maximize",
        # must match the key logged by train() at the end of a run
        "name":"Total Mean Reward"
    },
    "parameters":{
        "env_name":{
            "value":"CartPole-v1"
        },
        "experiment":{
            "value":"DuelingDQN"
        },
        "type":{
            "value":"Type2"
        },
        "num_layer":{
            "value":2
        },
        "layer_size":{
            "value":[128,64]
        },
        "value_fn_layer_size":{
            "value":32
        },
        "advantage_fn_layer_size":{
            "value":32
        },
        "target_network_replacement_freq":{
            "values": [10,20,50]
        },
        "learning_rate":{
            "values":[0.01,0.001,0.05]
        },
        "gamma":{
            "value": 0.998
        },
        "batch_size":{
            "values": [64,128]
        },
        "exploration_policy":{
            "values": ['Softmax','EpsilonGreedy']
        },
        "param_decay":{
            "values":[0.995,0.9995]
        },
        # initial epsilon / tau of the exploration policy
        "policy_param":{
            "value": 1
        },
        # lower bound the exploration parameter decays towards
        "policy_param_min_value":{
            "value": 0.01
        }
    }
}

In [None]:
# Register the sweep defined by sweep_config in the wandb project; the returned
# id is what the agent cell passes to wandb.agent.
sweep_id = wandb.sweep(sweep_config, project="DA6400 Assignment 2")


In [None]:
# Start a sweep agent for sweep_id that uses train() as its run function.
wandb.agent(sweep_id, train)

Note: Run the above 3 cells repeatedly for all your hyperparameter combinations. The code will run the training for each combination of hyperparameters and log the results to wandb. You can then visualize the results in the wandb dashboard.

Once we have the best hyperparameters, we can run the experiments to generate the required visualisations in the `wandb` dashboard. 

To generate the plots, we can use the code below: define a dictionary with the hyperparameters and use it to run the experiment. 

In [None]:
# Hyperparameters used for the final runs (Type2 Dueling DQN on Acrobot-v1 with
# Softmax exploration); the keys are the ones train() reads from its config.
best_hyperparamters = dict(
    env_name="Acrobot-v1",
    experiment="DuelingDQN",
    type="Type2",
    num_layer=2,
    layer_size=[128, 64],
    value_fn_layer_size=32,
    advantage_fn_layer_size=32,
    target_network_replacement_freq=50,
    learning_rate=0.001,
    gamma=0.998,
    batch_size=128,
    exploration_policy='Softmax',
    param_decay=0.995,
    policy_param=1,
    policy_param_min_value=0.01,
)


In [None]:
# Train 5 times with the same hyperparameters; each train() call opens and
# finishes its own wandb run.
for i in range(5):
    train(best_hyperparamters)