In [None]:
!pip install gymnasium -qU
!pip install wandb -qU

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gymnasium as gym
from collections import deque
import matplotlib.pyplot as plt
import uuid
import wandb

  File "/usr/local/lib/python3.11/dist-packages/gymnasium/envs/registration.py", line 594, in load_plugin_envs
    fn()
  File "/usr/local/lib/python3.11/dist-packages/shimmy/registration.py", line 304, in register_gymnasium_envs
    _register_atari_envs()
  File "/usr/local/lib/python3.11/dist-packages/shimmy/registration.py", line 205, in _register_atari_envs
    import ale_py
  File "/usr/local/lib/python3.11/dist-packages/ale_py/__init__.py", line 68, in <module>
    register_v0_v4_envs()
  File "/usr/local/lib/python3.11/dist-packages/ale_py/registration.py", line 178, in register_v0_v4_envs
    _register_rom_configs(legacy_games, obs_types, versions)
  File "/usr/local/lib/python3.11/dist-packages/ale_py/registration.py", line 63, in _register_rom_configs
    gymnasium.register(
    ^^^^^^^^^^^^^^^^^^
AttributeError: partially initialized module 'gymnasium' has no attribute 'register' (most likely due to a circular import)
[0m
  logger.warn(f"plugin: {plugin.value} raised {trace

In programming assignment 2, we are required to implement two algorithms, `Monte Carlo - REINFORCE` and `Dueling Deep Q-learning Network (Dueling DQN)`. Afterwards, the goal is to run a hyperparameter search to find the best hyperparameters for both algorithms on two environments, `CartPole-v1` and `Acrobot-v2`.

In this jupyter notebook let's implement the `Monte Carlo - REINFORCE` algorithm. 

In [3]:
class PolicyNetwork(nn.Module):
    """
    Feed-forward policy that maps a state to a categorical action distribution.

    The body is a stack of Linear + ReLU pairs, one pair per entry of
    ``hidden_dims``, followed by a Linear projection to ``output_dim`` and a
    softmax over the last dimension.
    """

    def __init__(self, input_dim, output_dim, hidden_dims):
        """
        Assemble the layer stack.

        Args:
            input_dim: Dimension of input state
            output_dim: Dimension of output action space
            hidden_dims: List of hidden layer dimensions (may be empty)
        """
        super().__init__()

        # widths[k] feeds widths[k + 1]; the final width feeds the output head.
        widths = [input_dim, *hidden_dims]

        modules = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            modules.extend((nn.Linear(fan_in, fan_out), nn.ReLU()))

        # Output head: action scores turned into probabilities.
        modules.extend((nn.Linear(widths[-1], output_dim), nn.Softmax(dim=-1)))

        self.network = nn.Sequential(*modules)

    def forward(self, state):
        """Return a ``Categorical`` built from the network's output probabilities."""
        return torch.distributions.Categorical(probs=self.network(state))

In [None]:
class ValueNetwork(nn.Module):
    """
    State-value approximator V(s; Φ) that serves as the baseline.

    Only used by the 'with baseline' version of MC-REINFORCE.
    """

    def __init__(self, input_dim, hidden_dims):
        """
        Assemble a Linear + ReLU stack topped by a single-unit linear head.

        Args:
            input_dim: Dimension of input state
            hidden_dims: List of hidden layer dimensions (may be empty)
        """
        super().__init__()

        # Consecutive entries of `widths` give each hidden layer's in/out size.
        widths = [input_dim, *hidden_dims]

        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.ReLU()]

        # Scalar head: one value estimate per input state.
        stack.append(nn.Linear(widths[-1], 1))

        self.network = nn.Sequential(*stack)

    def forward(self, state):
        """Run ``state`` through the stack and return the raw value output."""
        value = self.network(state)
        return value

In [None]:
class REINFORCEAgent:
    """
    Monte Carlo REINFORCE agent with an optional learned state-value baseline.

    Args:
        env: Environment object. This class reads ``observation_space.shape[0]``
            and ``action_space.n`` at construction, calls ``reset()`` and
            ``step(action)`` during training, and looks up
            ``spec.reward_threshold`` to decide when the task counts as solved.
        hyperparams: Dict of settings. Keys read by this class:
            ``n_episodes``, ``gamma``, ``lr``, ``use_baseline``;
            ``lr_value`` (only when ``use_baseline`` is truthy);
            ``lr_decay`` (optional, a scheduler is created only when < 1.0);
            and one of ``layer_sizes``, or ``network_depth`` + ``layer_size``,
            or ``hidden_dim`` for the hidden-layer widths.
    """

    def __init__(self, env, hyperparams):
        self.env = env
        self.hp = hyperparams

        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n

        # Create hidden layers configuration based on parameters
        if 'layer_sizes' in self.hp:
            # Use the list of layer sizes directly
            hidden_dims = self.hp['layer_sizes']
        elif 'network_depth' in self.hp and 'layer_size' in self.hp:
            # Backward compatibility - use same size for all layers
            hidden_dims = [self.hp['layer_size']] * self.hp['network_depth']
        else:
            # Fallback to original implementation
            hidden_dims = [self.hp['hidden_dim']]

        # Initialize policy network
        self.policy_net = PolicyNetwork(state_dim, action_dim, hidden_dims)
        self.optimizer_policy = optim.Adam(self.policy_net.parameters(), lr=self.hp['lr'])
        self.scheduler_policy = self._make_scheduler(self.optimizer_policy)

        # Baseline attributes always exist so that later code can test them
        # against None instead of risking an AttributeError.
        self.value_net = None
        self.optimizer_value = None
        self.scheduler_value = None

        # Initialize value network (only for 'with baseline' version)
        if self.hp['use_baseline']:
            self.value_net = ValueNetwork(state_dim, hidden_dims)
            self.optimizer_value = optim.Adam(self.value_net.parameters(), lr=self.hp['lr_value'])
            self.scheduler_value = self._make_scheduler(self.optimizer_value)

        # Reward tracking
        self.reward_buffer = deque(maxlen=100)  # totals of the most recent 100 episodes
        self.avg_reward_list = []               # running average, sampled every 10 episodes
        self.episode_reward = []                # total reward of every episode

    def _make_scheduler(self, optimizer):
        """Return an ExponentialLR for ``optimizer`` if ``lr_decay`` < 1.0 is configured, else None."""
        if 'lr_decay' in self.hp and self.hp['lr_decay'] < 1.0:
            return optim.lr_scheduler.ExponentialLR(optimizer, gamma=self.hp['lr_decay'])
        return None

    def get_action(self, state):
        """
        Sample an action from the current policy.

        Args:
            state: Observation accepted by ``np.array``.

        Returns:
            Tuple ``(action, log_prob)``: the sampled action as a Python scalar
            (via ``.item()``) and the log-probability tensor of that action.
        """
        state = torch.FloatTensor(np.array(state))
        dist = self.policy_net(state)
        action = dist.sample()
        log_prob = dist.log_prob(action)
        return action.item(), log_prob

    def update_policy(self, rewards, log_probs, states=None):
        """
        Policy update method for both versions (with and without baseline).

        Args:
            rewards: Per-step rewards of one episode.
            log_probs: Log-probability tensors of the actions taken, one per step.
            states: State tensors visited in the episode; required when
                ``use_baseline`` is enabled, ignored otherwise.

        Raises:
            ValueError: If the baseline is enabled but ``states`` is None.
        """
        if self.hp['use_baseline'] and states is None:
            raise ValueError("states must be provided when 'use_baseline' is enabled")

        returns = self._compute_returns(rewards)

        if self.hp['use_baseline']:
            # Fit the baseline by regressing V(s_t) onto the Monte Carlo return
            # G_t, taking one optimizer step per visited state.
            for i, state in enumerate(states):
                state_value = self.value_net(state)
                error = returns[i] - state_value
                value_loss = error.pow(2).mean()  # Mean squared error
                self.optimizer_value.zero_grad()
                value_loss.backward()
                torch.nn.utils.clip_grad_norm_(self.value_net.parameters(), 1.0)
                self.optimizer_value.step()

            # Subtract baseline (state value) from returns; detached so the
            # policy loss does not back-propagate into the value network.
            baselines = torch.cat([self.value_net(s).detach() for s in states])
            returns = returns - baselines

        # Negative sign turns gradient ascent on expected return into a loss.
        policy_loss = torch.stack(
            [-log_prob * R for log_prob, R in zip(log_probs, returns)]
        ).sum()

        # Update policy network
        self.optimizer_policy.zero_grad()
        policy_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 1.0)
        self.optimizer_policy.step()

    def _compute_returns(self, rewards):
        """
        Compute discounted returns G_t.

        Args:
            rewards: Per-step rewards of one episode, in time order.

        Returns:
            Float tensor with one return per step. When no baseline is used
            and there is more than one step, the returns are standardised
            (zero mean, unit standard deviation).
        """
        gamma = self.hp['gamma']
        G = 0
        returns = []
        # Accumulate backwards and reverse once at the end; this avoids the
        # quadratic cost of inserting at the front of the list on every step.
        for r in reversed(rewards):
            G = r + gamma * G
            returns.append(G)
        returns.reverse()

        returns = torch.FloatTensor(returns)

        if not self.hp['use_baseline'] and len(returns) > 1:  # Normalize only if no baseline is used
            returns = (returns - returns.mean()) / (returns.std() + 1e-8)

        return returns

    def train(self, wandb):
        """
        Run ``n_episodes`` episodes, updating the policy after each one.

        Updates are paused once the 100-episode average (checked every 10
        episodes) reaches the environment's reward threshold, and resume if
        the average later drops below it.

        Args:
            wandb: Object exposing ``log(dict)``; note that this parameter
                shadows the ``wandb`` module inside the method.

        Returns:
            The list of total rewards, one entry per episode.
        """
        # Some environments define no reward threshold (or no spec at all);
        # in that case the task is never considered solved.
        spec = getattr(self.env, 'spec', None)
        reward_threshold = getattr(spec, 'reward_threshold', None)

        flag = True  # whether the networks are still being updated
        for episode in range(self.hp['n_episodes']):
            state, _ = self.env.reset()
            log_probs = []
            rewards = []

            # States are only needed to fit and evaluate the baseline.
            states = [] if self.hp['use_baseline'] else None

            done = False
            while not done:
                action, log_prob = self.get_action(state)
                next_state, reward, terminated, truncated, _ = self.env.step(action)

                log_probs.append(log_prob)
                rewards.append(reward)

                if states is not None:
                    states.append(torch.FloatTensor(np.array(state)))

                state = next_state
                done = terminated or truncated

            episode_total = sum(rewards)
            self.reward_buffer.append(episode_total)
            self.episode_reward.append(episode_total)

            # Update policy (and baseline if applicable)
            if flag:
                self.update_policy(rewards, log_probs, states)

                # Apply learning rate decay if configured
                if self.scheduler_policy is not None:
                    self.scheduler_policy.step()
                if self.hp['use_baseline'] and self.scheduler_value is not None:
                    self.scheduler_value.step()

            # Logging
            wandb.log({"episodic_reward": episode_total})
            if (episode + 1) % 10 == 0:
                avg_reward = np.mean(self.reward_buffer)
                self.avg_reward_list.append(avg_reward)

                print(f"Episode {episode+1}: Average Reward (last 100) = {avg_reward:.2f}")
                wandb.log({"mean_score_over_last_100_episodes": avg_reward})

                if reward_threshold is not None and avg_reward >= reward_threshold:
                    print(f"Solved in {episode+1} episodes!")
                    flag = False
                else:
                    print("Reinitializing Training")
                    flag = True

        return self.episode_reward

We have completed the implementation of the `REINFORCEAgent` above. Let's run the hyperparameter search to find the best hyperparameters for both environments. To run the hyperparameter search, we will use `wandb`.

In [6]:
def create_config_group_name(config):
    """
    Build the group label shared by runs that use the same configuration.

    Reads ``env_name``, ``experiment``, ``baseline``, ``n_episodes``, ``lr``,
    ``lr_value``, ``layer_size``, ``lr_decay`` and ``gamma`` from ``config`` and
    joins them, each with its short prefix, using underscores. The result ends
    with a trailing underscore.
    """
    fields = [
        f"{config.env_name}",
        f"{config.experiment}",
        f"BL{config.baseline}",
        f"neps{config.n_episodes}",
        f"lr{config.lr}",
        f"lrv{config.lr_value}",
        f"lsz{config.layer_size}",
        f"lyd{config.lr_decay}",
        f"gm{config.gamma}",
    ]
    return "_".join(fields) + "_"

In [None]:
# Log in to Weights & Biases.
# On Kaggle, the API key is read from the notebook secret named "wandb_api".
# Outside Kaggle the `kaggle_secrets` module is not available, so this cell
# falls back to `wandb.login()` without an explicit key and relies on wandb's
# own credential lookup (make sure you have already logged in locally).
try:
    from kaggle_secrets import UserSecretsClient
except ImportError:
    wandb.login()
else:
    user_secrets = UserSecretsClient()
    api = user_secrets.get_secret("wandb_api")

    wandb.login(key=api)

In [None]:
def train_wandb(config=None):
    """
    Train one REINFORCE agent as a single wandb run.

    Starts a run in the "DA6400 Assignment 2" project, tags it with a
    ``config_group`` label derived from its configuration, trains an agent on
    ``config.env_name`` and logs the mean of the per-episode total rewards as
    ``mean_total_reward``.

    Args:
        config: Optional configuration forwarded to ``wandb.init``. The values
            actually used are read back from the run's config afterwards.
    """
    run = wandb.init(
        project="DA6400 Assignment 2",
        config=config
    )

    # adding an attribute to group various runs with the same configuration
    config = run.config
    config_group = create_config_group_name(config)
    run.config.update({"config_group": config_group}, allow_val_change=True)

    # Short random suffix keeps repeated runs of one configuration apart.
    run.name = f"{config_group}_run_{uuid.uuid4().hex[:4]}"

    # initialise environment - passed as config parameter
    env = gym.make(config.env_name)
    try:
        hyperparams = {
            'n_episodes': config.n_episodes,
            'gamma': config.gamma,
            'lr': config.lr,
            'lr_value': config.lr_value,
            'use_baseline': config.baseline,   # Enable baseline version
            'layer_sizes': config.layer_size,  # Different sizes for each layer
            'lr_decay': config.lr_decay        # Learning rate decay factor
        }
        agent = REINFORCEAgent(env, hyperparams)
        rewards = agent.train(wandb=run)
    finally:
        # Release the environment even if training raises.
        env.close()

    # With zero episodes there is nothing to average; skip the summary metric
    # instead of dividing by zero.
    if rewards:
        run.log({"mean_total_reward": sum(rewards) / len(rewards)})
    

We have defined one specific sweep config for one of the environments. We will modify the code below to run the sweep for all the environments.

In [None]:
# Grid sweep for MC-REINFORCE on CartPole-v1.
# Entries under "parameters" use "value" for a single fixed setting and
# "values" for a list of settings to sweep over.
sweep_config = {
    "method": "grid",
    "name": "Hyperparmeter training with Cartpole, MCReinforce",
    # Metric name matches the key logged at the end of train_wandb.
    "metric": {"goal": "maximize", "name": "mean_total_reward"},
    "parameters": {
        "env_name": {"value": "CartPole-v1"},
        "experiment": {"value": "MCReinforce"},
        # Compare the with-baseline and without-baseline variants.
        "baseline": {"values": [True, False]},
        "n_episodes": {"value": 2000},
        # Each entry is the full list of hidden-layer widths for one run.
        "layer_size": {"values": [[128], [64], [64, 64], [32, 32]]},
        "gamma": {"values": [0.99]},
        "lr": {"values": [0.001, 0.0001, 0.01]},
        "lr_value": {"values": [0.001]},
        # 1 means no decay scheduler is created by the agent.
        "lr_decay": {"values": [1, 0.995, 0.9995]},
    },
}

In [None]:
# Register the sweep described by sweep_config under the "DA6400 Assignment 2"
# project and keep the returned id for the wandb.agent call in the next cell.
sweep_id = wandb.sweep(sweep_config, project="DA6400 Assignment 2")

In [None]:
# Call the sweep agent five times in a row, each time handing it train_wandb
# as the function to execute for the sweep identified by sweep_id.
# NOTE(review): no `count` is passed to wandb.agent, so how many runs each call
# performs is decided by wandb - confirm this gives the intended repetitions.
for i in range(5):
    wandb.agent(sweep_id, train_wandb)

Note: Run the above 3 cells repeatedly for all your hyperparameter combinations. The code will run the training for each combination of hyperparameters and log the results to wandb. You can then visualize the results in the wandb dashboard.

Once we have the best hyperparameters, we can run the experiments to generate the required visualisations in the `wandb` dashboard.