# CS6700: Reinforcement Learning
## Programming Assignment 1

Submitted by:
- Archish S (ME20B032)
- Vinayak Gupta (EE20B152)

# Monte Carlo REINFORCE

## Imports

In [1]:
import random
from dataclasses import dataclass
from collections import deque
from itertools import count

import gym
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import matplotlib.pyplot as plt
# plt.rcParams.update({
#     "text.usetex": True,
#     "font.family": "serif",
#     "font.serif": ["Computer Modern Roman"],
#     "font.size": 10
# })
%config InlineBackend.figure_format = 'retina'

import tqdm
from IPython.display import clear_output
import warnings
warnings.filterwarnings('ignore')

def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators with `seed`."""
    # Same order as before: stdlib `random`, then NumPy, then PyTorch.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

In [2]:
@dataclass
class Args:
    """Hyperparameters and run configuration shared by the networks and the Agent."""
    # No type annotation, so the dataclass treats this as a plain class attribute and
    # not as an __init__ field; it is evaluated once, when the class body executes.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Seed passed to set_seed() by the simulation cells.
    # Presumably the numeric suffixes of the two roll numbers in the header - confirm.
    seed: int = 32 + 152

    # Gym environment name without its version suffix (Agent appends '-v1').
    # The simulation cells overwrite it with 'CartPole' or 'Acrobot'.
    env: str = 'Acrobot'
    # Label only; not referenced by the code visible in this notebook.
    algorithm: str = 'MCReinforce'
    # Overwritten by Agent.__init__ with 'wobaseline' or 'wbaseline'.
    type: str = ''

    # Discount factor used when accumulating returns in Agent.learn.
    gamma: float = 0.99
    # epsilon_max / epsilon_min are not referenced by the code visible in this notebook.
    epsilon_max: float = 1e-1
    epsilon_min: float = 1e-4
    # Learning rate of both Adam optimizers (policy and value).
    lr: float = 1e-3
    # Number of independent training runs performed by Agent.train.
    num_runs: int = 5
    # Upper bound on episodes per run.
    num_episodes: int = 10000
    # Per-episode step cap checked inside Agent.train's rollout loop.
    max_steps: int = 1000
    # A run stops early once the mean reward over all of its episodes exceeds this.
    threshold: int = 195

    # Not referenced by the code visible in this notebook.
    batch_size: int = 32
    # Width of the single hidden layer in PolicyNetwork and ValueNetwork.
    hidden_size: int = 64

## Networks

In [3]:
class PolicyNetwork(nn.Module):
    """Two-layer MLP that maps a state to one unnormalised score (logit) per action."""

    def __init__(self, state_size, action_size, hidden_size, args):
        """Build the `state_size -> hidden_size -> action_size` network and keep `args`."""
        super().__init__()
        self.args = args

        layers = [
            nn.Linear(state_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, action_size),
        ]
        self.fc = nn.Sequential(*layers)

    def init_weights(self):
        """Re-initialise every linear layer: Xavier-normal weights, zero biases."""
        linear_layers = (layer for layer in self.modules() if isinstance(layer, nn.Linear))
        for layer in linear_layers:
            nn.init.xavier_normal_(layer.weight)
            nn.init.constant_(layer.bias, 0)

    def forward(self, x):
        """Return the action logits for `x`."""
        logits = self.fc(x)
        return logits

    def select_action(self, state):
        """Sample an action from the softmax policy at `state`.

        Returns:
            A pair `(action, log_prob)`: the sampled action as a Python int and the
            log-probability of that action as a tensor. The distribution is built
            under `torch.no_grad()`, so the returned log-probability carries no
            autograd history.
        """
        with torch.no_grad():
            probs = F.softmax(self.forward(state), dim=-1)
            dist = Categorical(probs)
            sampled = dist.sample()
        return sampled.item(), dist.log_prob(sampled)

In [4]:
class ValueNetwork(nn.Module):
    """Two-layer MLP that maps a state to a single scalar value estimate."""

    def __init__(self, state_size, hidden_size, args):
        """Build the `state_size -> hidden_size -> 1` network and keep `args`."""
        super().__init__()
        self.args = args

        layers = [
            nn.Linear(state_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1),
        ]
        self.fc = nn.Sequential(*layers)

    def init_weights(self):
        """Re-initialise every linear layer: Xavier-normal weights, zero biases."""
        linear_layers = (layer for layer in self.modules() if isinstance(layer, nn.Linear))
        for layer in linear_layers:
            nn.init.xavier_normal_(layer.weight)
            nn.init.constant_(layer.bias, 0)

    def forward(self, x):
        """Return the value estimate for `x`; the last dimension of the output is 1."""
        estimate = self.fc(x)
        return estimate

In [5]:
class Agent:
    """Monte Carlo REINFORCE agent, optionally with a learned state-value baseline.

    `type` selects the variant:
      * 'wobaseline' - the policy loss weights log-probabilities by the
        (normalised) discounted return.
      * 'wbaseline'  - a value network is regressed onto the same returns and
        its prediction is subtracted from them before weighting.
    """

    def __init__(self, args, type):
        """Create the environment, both networks and their optimizers.

        Args:
            args: configuration object; this constructor overwrites `args.type`.
            type: 'wobaseline' or 'wbaseline'. (The name shadows the builtin but
                is kept so existing callers keep working.)
        """
        self.args = args
        self.args.type = type
        self.env = gym.make(args.env + '-v1')
        self.state_size = self.env.observation_space.shape[0]
        self.action_size = self.env.action_space.n

        device = self.args.device
        self.policy = PolicyNetwork(self.state_size, self.action_size, args.hidden_size, self.args).to(device)
        self.init_policy_weights = self._snapshot(self.policy)
        self.value = ValueNetwork(self.state_size, args.hidden_size, self.args).to(device)
        self.init_value_weights = self._snapshot(self.value)

        self._build_optimizers()

    @staticmethod
    def _snapshot(module):
        """Return an independent copy of `module`'s state dict."""
        # `state_dict().copy()` is shallow: its tensors share storage with the live
        # parameters, so the "initial" snapshot would silently follow training.
        return {name: tensor.detach().clone() for name, tensor in module.state_dict().items()}

    def _build_optimizers(self):
        """(Re)create the Adam optimizers so no moment estimates leak between runs."""
        self.optimizer_policy = optim.Adam(self.policy.parameters(), lr=self.args.lr)
        self.optimizer_value = optim.Adam(self.value.parameters(), lr=self.args.lr)

    def _reset_env(self):
        """Reset the environment and return only the initial observation."""
        out = self.env.reset()
        # Newer gym releases return `(observation, info)`; older ones return the
        # observation alone.
        if isinstance(out, tuple) and len(out) == 2 and isinstance(out[1], dict):
            return out[0]
        return out

    def _step_env(self, action):
        """Step the environment and return `(next_state, reward, done)`."""
        out = self.env.step(action)
        if len(out) == 5:
            # Newer gym API: (obs, reward, terminated, truncated, info).
            next_state, reward, terminated, truncated, _ = out
            return next_state, reward, bool(terminated) or bool(truncated)
        next_state, reward, done, _ = out
        return next_state, reward, done

    def _discounted_returns(self, rewards):
        """Return the list of discounted returns G_t for one episode's rewards."""
        returns = []
        running = 0.0
        for reward in reversed(rewards):
            running = reward + self.args.gamma * running
            returns.append(running)
        returns.reverse()
        return returns

    def learn(self, batch):
        """Perform one policy (and, with a baseline, one value) update from an episode.

        Args:
            batch: sequence of `(state, action, reward, log_prob)` tuples in time
                order, where `state` is a 1-D tensor and `action` an int. The stored
                `log_prob` is ignored: it was produced without autograd history, so
                the log-probabilities are recomputed here from the current policy.

        Raises:
            NotImplementedError: if `args.type` is not a known variant.
        """
        if not batch:
            return

        device = self.args.device
        batch_state, batch_action, batch_reward, _ = zip(*batch)
        states = torch.stack(batch_state, dim=0).float().to(device)
        actions = torch.tensor(batch_action, dtype=torch.long, device=device).unsqueeze(1)

        G = torch.tensor(self._discounted_returns(batch_reward), dtype=torch.float32, device=device)
        if G.numel() > 1:
            # The sample std of a single element is NaN, so one-step episodes are
            # left un-normalised.
            G = (G - G.mean()) / (G.std() + 1e-8)
        G = G.unsqueeze(1)  # (T, 1), matching the value output and `log_probs`

        # Differentiable log pi(a_t | s_t); this is what lets the policy loss reach
        # the policy parameters.
        log_probs = F.log_softmax(self.policy(states), dim=-1).gather(1, actions)

        self.value.train()
        if self.args.type == 'wbaseline':
            value = self.value(states)
            loss_value = F.mse_loss(value, G)
            self.optimizer_value.zero_grad()
            loss_value.backward()
            self.optimizer_value.step()

            # Detach so the policy loss does not back-propagate into the baseline.
            deltas = G - value.detach()
        elif self.args.type == 'wobaseline':
            deltas = G
        else:
            raise NotImplementedError(f'Unknown type: {self.args.type}')
        self.value.eval()

        loss_policy = -(log_probs * deltas).sum()
        self.optimizer_policy.zero_grad()
        loss_policy.backward()
        self.optimizer_policy.step()

    def _run_episode(self):
        """Roll out one episode with the current policy.

        Returns:
            `(total_reward, trajectory)` where `trajectory` is the list of
            `(state, action, reward, log_prob)` tuples consumed by `learn`.
        """
        state = self._reset_env()
        total_reward = 0
        trajectory = []

        for t in count():
            state = torch.tensor(state, dtype=torch.float32).to(self.args.device).unsqueeze(0)
            action, log_prob = self.policy.select_action(state)
            next_state, reward, done = self._step_env(action)

            total_reward += reward
            trajectory.append((state.flatten(), action, reward, log_prob.detach().flatten()))

            # `t` is zero-based, so this caps the episode at exactly `max_steps` steps.
            if done or t + 1 >= self.args.max_steps:
                break
            state = next_state

        return total_reward, trajectory

    def train(self):
        """Train for `args.num_runs` independent runs.

        Each run re-initialises both networks and their optimizers, then plays up
        to `args.num_episodes` episodes, updating after every episode. A run ends
        early once the mean reward over all of its episodes exceeds
        `args.threshold`.

        Returns:
            A list with one list of per-episode total rewards per run. Runs that
            stop early are shorter than the others.

        Raises:
            NotImplementedError: if `args.type` is not a known variant.
        """
        run_rewards = []
        for _ in tqdm.trange(self.args.num_runs, desc='Runs'):
            self.policy.init_weights()
            self.value.init_weights()
            # Fresh weights need fresh Adam statistics as well.
            self._build_optimizers()

            self.policy.train()
            if self.args.type == 'wbaseline':
                self.value.train()
            elif self.args.type == 'wobaseline':
                self.value.eval()
            else:
                raise NotImplementedError(f'Unknown type: {self.args.type}')

            episode_rewards = []
            for _ in tqdm.trange(self.args.num_episodes, desc='Episodes', leave=False):
                total_reward, trajectory = self._run_episode()
                episode_rewards.append(total_reward)

                if np.mean(episode_rewards) > self.args.threshold:
                    break
                self.learn(trajectory)

            run_rewards.append(episode_rewards)
        return run_rewards
            

## Simulation

In [6]:
# CartPole, Type 1: REINFORCE without a baseline
import os

# np.save does not create missing directories; create it before training so a
# finished run cannot be lost to a FileNotFoundError at the very end.
os.makedirs('result', exist_ok=True)

args = Args()
set_seed(args.seed)
args.env = 'CartPole'
agent = Agent(args, 'wobaseline')
run_rewards_type1 = agent.train()
np.save('result/cartpole_mcreinforce_wobaseline.npy', np.array(run_rewards_type1))

Runs:   0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
# CartPole, Type 2: REINFORCE with a learned value baseline
import os

# np.save does not create missing directories; create it before training so a
# finished run cannot be lost to a FileNotFoundError at the very end.
os.makedirs('result', exist_ok=True)

args = Args()
set_seed(args.seed)
args.env = 'CartPole'
agent = Agent(args, 'wbaseline')
run_rewards_type2 = agent.train()
np.save('result/cartpole_mcreinforce_wbaseline.npy', np.array(run_rewards_type2))

In [None]:
# Learning curves for CartPole: per-episode mean across runs, smoothed with a
# length-`window_size` moving average, with a band of +/- the smoothed std.
window_size = 10
box_filter = np.ones(window_size)

# Type 1 (without baseline)
mean_over_runs_type1 = np.mean(run_rewards_type1, axis=0)
std_over_runs_type1 = np.std(run_rewards_type1, axis=0)
moving_avg_type1 = np.convolve(mean_over_runs_type1, box_filter, 'valid') / window_size
moving_std_type1 = np.convolve(std_over_runs_type1, box_filter, 'valid') / window_size
episodes_type1 = range(len(moving_avg_type1))

plt.plot(moving_avg_type1, label='Type 1', color='r')
plt.fill_between(
    episodes_type1,
    moving_avg_type1 - moving_std_type1,
    moving_avg_type1 + moving_std_type1,
    alpha=0.1,
    color='r',
)

# Type 2 (with baseline)
mean_over_runs_type2 = np.mean(run_rewards_type2, axis=0)
std_over_runs_type2 = np.std(run_rewards_type2, axis=0)
moving_avg_type2 = np.convolve(mean_over_runs_type2, box_filter, 'valid') / window_size
moving_std_type2 = np.convolve(std_over_runs_type2, box_filter, 'valid') / window_size
episodes_type2 = range(len(moving_avg_type2))

plt.plot(moving_avg_type2, label='Type 2', color='b')
plt.fill_between(
    episodes_type2,
    moving_avg_type2 - moving_std_type2,
    moving_avg_type2 + moving_std_type2,
    alpha=0.1,
    color='b',
)

plt.xlabel('Episodes')
plt.ylabel('Reward')
plt.legend()

In [None]:
# Acrobot, Type 1: REINFORCE without a baseline
import os

# np.save does not create missing directories; create it before training so a
# finished run cannot be lost to a FileNotFoundError at the very end.
os.makedirs('result', exist_ok=True)

args = Args()
set_seed(args.seed)
args.env = 'Acrobot'
agent = Agent(args, 'wobaseline')
run_rewards_type1 = agent.train()
np.save('result/acrobot_mcreinforce_wobaseline.npy', np.array(run_rewards_type1))



In [None]:
# Acrobot, Type 2: REINFORCE with a learned value baseline
import os

# np.save does not create missing directories; create it before training so a
# finished run cannot be lost to a FileNotFoundError at the very end.
os.makedirs('result', exist_ok=True)

args = Args()
set_seed(args.seed)
args.env = 'Acrobot'
agent = Agent(args, 'wbaseline')
run_rewards_type2 = agent.train()
# File name corrected from 'actobot_...' so it matches the without-baseline file
# ('acrobot_mcreinforce_wobaseline.npy'); update any loader that used the old name.
np.save('result/acrobot_mcreinforce_wbaseline.npy', np.array(run_rewards_type2))

In [None]:
# Learning curves for Acrobot: per-episode mean across runs, smoothed with a
# length-`window_size` moving average, with a band of +/- the smoothed std.
window_size = 10
box_filter = np.ones(window_size)

# Type 1 (without baseline)
mean_over_runs_type1 = np.mean(run_rewards_type1, axis=0)
std_over_runs_type1 = np.std(run_rewards_type1, axis=0)
moving_avg_type1 = np.convolve(mean_over_runs_type1, box_filter, 'valid') / window_size
moving_std_type1 = np.convolve(std_over_runs_type1, box_filter, 'valid') / window_size
episodes_type1 = range(len(moving_avg_type1))

plt.plot(moving_avg_type1, label='Type 1', color='r')
plt.fill_between(
    episodes_type1,
    moving_avg_type1 - moving_std_type1,
    moving_avg_type1 + moving_std_type1,
    alpha=0.1,
    color='r',
)

# Type 2 (with baseline)
mean_over_runs_type2 = np.mean(run_rewards_type2, axis=0)
std_over_runs_type2 = np.std(run_rewards_type2, axis=0)
moving_avg_type2 = np.convolve(mean_over_runs_type2, box_filter, 'valid') / window_size
moving_std_type2 = np.convolve(std_over_runs_type2, box_filter, 'valid') / window_size
episodes_type2 = range(len(moving_avg_type2))

plt.plot(moving_avg_type2, label='Type 2', color='b')
plt.fill_between(
    episodes_type2,
    moving_avg_type2 - moving_std_type2,
    moving_avg_type2 + moving_std_type2,
    alpha=0.1,
    color='b',
)

plt.xlabel('Episodes')
plt.ylabel('Reward')
plt.legend()