# RND reference [https://github.com/simoninithomas/Deep_reinforcement_learning_Course/blob/master/RND%20Montezuma's%20revenge%20PyTorch/agents.py](https://github.com/simoninithomas/Deep_reinforcement_learning_Course/blob/master/RND%20Montezuma's%20revenge%20PyTorch/agents.py)

In [0]:
!pip install gym
!apt install swig cmake libopenmpi-dev zlib1g-dev
!pip install stable-baselines==2.5.1 box2d box2d-kengz
!pip install torchcontrib

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init
# import torchcontrib
# from torchcontrib.optim import SWA
from tqdm.notebook import tqdm
import numpy as np
import gym
from IPython import display as ipythondisplay
import matplotlib.pyplot as plt

In [0]:
class RNDBaseNetwork(nn.Module):
    def __init__(self):
        super(RNDBaseNetwork, self).__init__()
        self.conv1 = nn.Conv2d(1, 128, 3)
        self.conv2 = nn.Conv2d(128, 256, 3)
        self.pool = nn.MaxPool2d(2)
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.head = nn.Linear(256, 256)

        for p in self.modules():
            if isinstance(p, nn.Conv2d):
                init.kaiming_normal_(p.weight, init.calculate_gain('leaky_relu'))
                p.bias.data.zero_()

            if isinstance(p, nn.Linear):
                init.kaiming_normal_(p.weight, init.calculate_gain('leaky_relu'))
                p.bias.data.zero_()

    def forward(self, x):
        x = self.pool(F.leaky_relu(self.conv1(x)))
        x = self.pool(F.leaky_relu(self.conv2(x)))
        x = self.avg_pool(x).reshape(x.shape[0], -1)
        x = self.head(x)
        return x

In [0]:
class RNDNetwork(nn.Module):
    def __init__(self):
        super(RNDNetwork, self).__init__()
        self.target = RNDBaseNetwork()
        self.predictor =  RNDBaseNetwork()

        for param in self.target.parameters():
            param.requires_grad = False
    
    def forward(self, x):
        predicted, target = self.predictor(x), self.target(x)
        return predicted, target
    
    def get_intrinsic_reward(self, state):
      predicted, target = self.predictor(state), self.target(state)
      intrinsic_reward = (predicted - target).pow(2).sum() / 2
      return intrinsic_reward.clamp(-5, 5).item()

In [0]:
class PolicyNetwork(nn.Module):
  def __init__(self, state_size, action_size, hidden_size = 256):

    super(PolicyNetwork, self).__init__()
    self.conv1 = nn.Conv2d(1, 64, 3)
    self.conv2 = nn.Conv2d(64, hidden_size, 3)
    self.pool = nn.MaxPool2d(2)
    self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))

    self.fc1 = nn.Linear(hidden_size, hidden_size)
    self.fc2 = nn.Linear(hidden_size, hidden_size)
    self.actor_network = nn.Linear(hidden_size, action_size)
    self.critic_network_extrinsic = nn.Linear(hidden_size, 1)
    self.critic_network_intrinsic = nn.Linear(hidden_size, 1)
    self.num_actions = action_size
    self.rnd_network = RNDNetwork()

    for p in self.modules():
        if isinstance(p, nn.Conv2d):
            init.kaiming_normal_(p.weight, init.calculate_gain('leaky_relu'))
            p.bias.data.zero_()

        if isinstance(p, nn.Linear):
            init.kaiming_normal_(p.weight, init.calculate_gain('leaky_relu'))
            p.bias.data.zero_()

  def get_intrinsic_reward(self, state):
      target_features, predictor_features = self.rnd_network(state)
      intrinsic_reward = (target_next_feature - predict_next_feature).pow(2).sum() / 2
      return intrinsic_reward.item()

  def forward(self, state_representation):
    x = state_representation
    x = self.pool(F.leaky_relu(self.conv1(x)))
    x = self.pool(F.leaky_relu(self.conv2(x)))
    x = self.avg_pool(x).reshape(x.shape[0], -1)
    # print(x.shape)
    x = F.leaky_relu(self.fc1(x))
    x = F.leaky_relu(self.fc2(x))
    policy = self.actor_network(x)
    policy = F.softmax(policy, dim = 1)
    # x2 = F.leaky_relu(self.fc2(state_representation))
    value_extrinsic = self.critic_network_extrinsic(x)
    value_intrinsic = self.critic_network_intrinsic(x)
    return value_extrinsic, value_intrinsic, policy

  def get_action(self, state_representation):
    _, _, policy = self.forward(state_representation)
    # Get an action based on the probabilities returned from network
    weighted_probability_chosen_action = np.random.choice(self.num_actions, p=np.squeeze(policy.cpu().detach().numpy()))
    return weighted_probability_chosen_action

  def get_probability_of_actions(self, states, actions):
      values_extrinsic, values_intrinsic, policies = self.forward(states)
      action_probs = policies.gather(1, actions.unsqueeze(1))
      # Add small epsilon to prevent nans?
      # https://www.reddit.com/r/reinforcementlearning/comments/8k54mz/my_ppo_agent_collapse_to_a_single_value/
      entropy = -(policies * torch.log(policies + 1e-20)).sum(1)
      return values_extrinsic.squeeze(), values_intrinsic.squeeze(), action_probs.squeeze(), entropy

In [0]:
# Based on https://github.com/astooke/rlpyt/blob/master/rlpyt/algos/utils.py#L138
def get_advantages_and_returns(rewards, values, bootstrap_value, done_list):
    GAMMA = 0.99
    LAMBDA = 1

    advantages = torch.zeros(len(rewards)).cuda()
    returns = torch.zeros(len(rewards)).cuda()
    not_done = 1 - done_list
    advantages[-1] = rewards[-1] + GAMMA * bootstrap_value * not_done[-1] - values[-1]

    for i in reversed(range(len(rewards) - 1)):
        delta = rewards[i] + GAMMA * values[i + 1] * not_done[i] - values[i]
        advantages[i] = delta + GAMMA * LAMBDA * advantages[i + 1] * not_done[i]

    # advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-20)

    returns[:] = advantages + values

    return advantages, returns

In [0]:
def get_discounted_rewards(rewards, bootstrap_value, done_list):
    GAMMA = 0.99

    discounted_rewards = torch.Tensor(len(rewards)).float()
    not_done = 1 - done_list

    discounted_rewards[-1] = rewards[-1] + GAMMA * bootstrap_value * not_done[-1]
    for i in reversed(range(len(rewards) - 1)):
        discounted_rewards[i] = rewards[i] + GAMMA * discounted_rewards[i+1] * not_done[i]

    discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-20)

    return discounted_rewards

In [0]:
done_list = np.array([False, False, False, True, False]).astype(int)
rewards = torch.tensor([10, 20, 20, 0, 0]).float().cuda()
bootstrap = 10
values = torch.tensor([10, 20, 20, 0, 0]).float().cuda()
get_advantages_and_returns(rewards, values, bootstrap, done_list)

(tensor([39.4020, 19.8000,  0.0000,  0.0000,  9.9000], device='cuda:0'),
 tensor([49.4020, 39.8000, 20.0000,  0.0000,  9.9000], device='cuda:0'))

In [0]:
def make_ppo_update(states, next_states, actions, rewards_extrinsic, rewards_intrinsic, done_list, bootstrap_state, old_model, new_model, rnd_model, policy_optimizer, rnd_optimizer):
    NUM_PPO_UPDATES = 5
    RATIO_CLIP = 0.3

    _, _, old_action_probs, _ = old_model.get_probability_of_actions(states, actions)
    old_action_probs = old_action_probs.detach()

    for _ in range(NUM_PPO_UPDATES):
        boostrap_extrinsic_new_value, boostrap_intrinsic_new_value, _ = new_model(bootstrap_state)
        boostrap_extrinsic_new_value = boostrap_extrinsic_new_value.squeeze()
        boostrap_intrinsic_new_value = boostrap_intrinsic_new_value.squeeze()

        new_values_extrinsic, new_values_intrinsic, new_action_probs, new_entropy = new_model.get_probability_of_actions(states, actions)

        advantages_extrinsic, _ = get_advantages_and_returns(rewards_extrinsic, new_values_extrinsic, boostrap_extrinsic_new_value, done_list)
        advantages_intrinsic, _ = get_advantages_and_returns(rewards_intrinsic, new_values_intrinsic, boostrap_intrinsic_new_value, done_list)

        advantages = advantages_extrinsic + advantages_intrinsic

        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-20)

        ratios = new_action_probs / (old_action_probs + 1e-20)

        surr1 = ratios * advantages.detach()
        surr2 = torch.clamp(ratios, 1-RATIO_CLIP, 1+RATIO_CLIP) * advantages.detach()
        actor_loss = -torch.min(surr1, surr2)
        critic_loss = 0.5 * advantages**2
        loss = actor_loss + 0.5*critic_loss - 0.01 * new_entropy
        policy_optimizer.zero_grad()
        loss.mean().backward()
        torch.nn.utils.clip_grad_norm_(new_model.parameters(), 1)
        policy_optimizer.step()

        predicted_features, target_features = rnd_model(next_states)
        rnd_loss = (predicted_features - target_features).pow(2).sum(1) / 2

        rnd_optimizer.zero_grad()
        # print(rnd_loss.mean().item())
        rnd_loss.mean().backward()
        # torch.nn.utils.clip_grad_norm_(new_model.parameters(), 1)
        rnd_optimizer.step()

    old_model.load_state_dict(new_model.state_dict())

In [0]:
class Memory:
    def __init__(self):
        self.clear()

    def clear(self):
        self.states = []
        self.next_states = []
        self.rewards_extrinsic = []
        self.rewards_intrinsic = []
        self.actions = []
        self.done_list = []

In [0]:
def rgb2gray(rgb):
    return np.dot(rgb[...,:3], [0.2989, 0.5870, 0.1140])

# 1. Add Clipping to gradients
# 2. Increase the timesteps before making an update
# 3. Tensorboard support?
# 4. Add stack of frames to represent movement
# 5. Add reward / observation normalization
# 6. Add input normalization (is this needed after observation normalization?)
# 7. Figure out NaN issue. Possibly due to high entropy loss coefficient. 

In [0]:
NUM_EPISODES = 3000
MAX_EPISODE_STEPS = 15000
UPDATE_TIME_STEP = 2000

env = gym.make('Freeway-v0')
NUM_ACTIONS = env.action_space.n
NUM_STATE_DIMENSIONS = env.observation_space.shape[0]

torch.manual_seed(2048)
np.random.seed(2048)

episodic_rewards = []

iteration_memory = Memory()

old_policy_net = PolicyNetwork(NUM_STATE_DIMENSIONS, NUM_ACTIONS).cuda()
new_policy_net = PolicyNetwork(NUM_STATE_DIMENSIONS, NUM_ACTIONS).cuda()
rnd_network = RNDNetwork().cuda()

old_policy_net.load_state_dict(new_policy_net.state_dict())

policy_optimizer = torch.optim.Adam(new_policy_net.parameters(), lr=1e-3)
rnd_optimizer = torch.optim.Adam(rnd_network.parameters(), lr=1e-3)

iterations = 1

for episode in tqdm(range(NUM_EPISODES)):
    current_state = env.reset()
    current_state = torch.from_numpy(rgb2gray(current_state) / 255.).float().unsqueeze(0).unsqueeze(0).cuda()
    reward_list = []
    for iteration in range(MAX_EPISODE_STEPS):
        # print(_)
        chosen_action = old_policy_net.get_action(current_state)
        next_state, reward_extrinsic, done, _ = env.step(chosen_action)
        next_state = torch.from_numpy(rgb2gray(next_state)).float().unsqueeze(0).unsqueeze(0).cuda()
        
        reward_intrinsic = rnd_network.get_intrinsic_reward(next_state)
        # print(reward_intrinsic.shape)
        reward_list.append(reward_extrinsic)

        iteration_memory.states.append(current_state)
        iteration_memory.next_states.append(next_state)
        iteration_memory.actions.append(chosen_action)
        iteration_memory.rewards_extrinsic.append(reward_extrinsic)
        iteration_memory.rewards_intrinsic.append(reward_intrinsic)
        iteration_memory.done_list.append(done)

        bootstrap_state = next_state
        current_state = next_state

        if len(iteration_memory.states) == 32:
            states_tensor = torch.cat(iteration_memory.states, dim=0)
            # print(iteration_memory.rewards_intrinsic)
            next_states_tensor = torch.cat(iteration_memory.next_states, dim=0)
            action_tensor = torch.Tensor(iteration_memory.actions).long().cuda()
            rewards_intrinsic = iteration_memory.rewards_intrinsic
            rewards_extrinsic = iteration_memory.rewards_extrinsic

            done_list = np.array(iteration_memory.done_list).astype(int)
            make_ppo_update(states_tensor, next_states_tensor, action_tensor, rewards_extrinsic, rewards_intrinsic, done_list, bootstrap_state, old_policy_net, new_policy_net, rnd_network, policy_optimizer, rnd_optimizer)
            iteration_memory.clear()

        if done:
            break

    episodic_rewards.append(sum(reward_list))
    print("Episode length - {} with reward - {}".format(iteration, episodic_rewards[-1]))
    if episode % 100 == 0:
        episodic_rewards_smoothed = np.convolve(episodic_rewards, np.ones((100,))/100, mode='valid')
        print("reward at step - {} - {}".format(episode, episodic_rewards_smoothed[-1]))
        if episodic_rewards_smoothed[-1] > 200:
            break

HBox(children=(IntProgress(value=0, max=3000), HTML(value='')))

Episode length - 2728 with reward - 20.0
reward at step - 0 - 0.2
Episode length - 2728 with reward - 0.0
Episode length - 2694 with reward - 20.0
Episode length - 2744 with reward - 21.0
Episode length - 2736 with reward - 21.0
Episode length - 2714 with reward - 21.0
Episode length - 2742 with reward - 21.0
Episode length - 2729 with reward - 21.0
Episode length - 2743 with reward - 21.0
Episode length - 2719 with reward - 21.0
Episode length - 2744 with reward - 24.0
Episode length - 2734 with reward - 21.0
Episode length - 2722 with reward - 21.0
Episode length - 2715 with reward - 23.0
Episode length - 2717 with reward - 21.0
Episode length - 2743 with reward - 23.0
Episode length - 2733 with reward - 21.0
Episode length - 2707 with reward - 21.0
Episode length - 2745 with reward - 21.0
Episode length - 2733 with reward - 21.0
Episode length - 2692 with reward - 21.0
Episode length - 2715 with reward - 21.0
Episode length - 2733 with reward - 21.0
Episode length - 2732 with reward

In [0]:
%debug

In [0]:
episodic_rewards

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0]