<a href="https://colab.research.google.com/github/SebastianLarssonDTU/02456-Reinforcement-Learning-Project/blob/remove_fixed_values/getting_started_ppo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Getting started with PPO and ProcGen

Here's a bit of code that should help you get started on your projects.

The cell below installs `procgen` and downloads a small `utils.py` script that contains some utility functions. You may want to inspect the file for more details.

In [2]:
!pip install procgen
!wget https://raw.githubusercontent.com/nicklashansen/ppo-procgen-utils/main/utils.py

Collecting procgen
[?25l  Downloading https://files.pythonhosted.org/packages/d6/34/0ae32b01ec623cd822752e567962cfa16ae9c6d6ba2208f3445c017a121b/procgen-0.10.4-cp36-cp36m-manylinux2010_x86_64.whl (39.9MB)
[K     |████████████████████████████████| 39.9MB 76kB/s 
Collecting gym3<1.0.0,>=0.3.3
[?25l  Downloading https://files.pythonhosted.org/packages/89/8c/83da801207f50acfd262041e7974f3b42a0e5edd410149d8a70fd4ad2e70/gym3-0.3.3-py3-none-any.whl (50kB)
[K     |████████████████████████████████| 51kB 7.4MB/s 
Collecting imageio<3.0.0,>=2.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/6e/57/5d899fae74c1752f52869b613a8210a2480e1a69688e65df6cb26117d45d/imageio-2.9.0-py3-none-any.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 38.2MB/s 
[?25hCollecting moderngl<6.0.0,>=5.5.4
[?25l  Downloading https://files.pythonhosted.org/packages/56/ab/5f72a1b7c5bdbb17160c85e8ba855d48925c74ff93c1e1027d5ad40bf33c/moderngl-5.6.2-cp36-cp36m-manylinux1_x86_64.whl (664kB)
[K   

# Helper functions and imports

In [20]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

import imageio

In [4]:
#Copied from https://lab-ml.com/labml_nn/rl/ppo/

def ClippedPPOLoss(log_pi: torch.Tensor, sampled_log_pi: torch.Tensor, advantage: torch.Tensor, clip: float) -> torch.Tensor:
  """Clipped surrogate policy loss used by PPO.

  Args:
    log_pi: log-probabilities of the taken actions under the current policy.
    sampled_log_pi: log-probabilities of the same actions recorded when the
      data was sampled.
    advantage: advantage estimates, broadcastable against the log-probabilities.
    clip: the probability ratio is limited to [1 - clip, 1 + clip].

  Returns:
    Scalar tensor: the negated mean of the element-wise minimum of the
    unclipped and the clipped surrogate objective.
  """
  # Probability ratio new/old, computed from the difference of log-probabilities.
  importance = (log_pi - sampled_log_pi).exp()
  bounded = torch.clamp(importance, min=1.0 - clip, max=1.0 + clip)
  surrogate = torch.min(importance * advantage, bounded * advantage)
  # Negated so that minimising the loss maximises the surrogate objective.
  return -surrogate.mean()

def ClippedValueFunctionLoss(value: torch.Tensor, sampled_value: torch.Tensor, sampled_return: torch.Tensor, clip: float):
  """Clipped squared-error loss for the value head.

  Args:
    value: value predictions of the current network.
    sampled_value: value predictions recorded when the data was sampled.
    sampled_return: regression targets for the value predictions.
    clip: the new prediction may move at most `clip` away from `sampled_value`
      before the clipped error term takes over.

  Returns:
    Scalar tensor: half the mean of the element-wise maximum of the unclipped
    and the clipped squared error.
  """
  delta = torch.clamp(value - sampled_value, min=-clip, max=clip)
  clipped_value = sampled_value + delta
  unclipped_err = (value - sampled_return).pow(2)
  clipped_err = (clipped_value - sampled_return).pow(2)
  # Taking the larger of the two errors makes the objective pessimistic.
  return 0.5 * torch.max(unclipped_err, clipped_err).mean()

# Mounting Drive

In [10]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
#check folders/files in drive
!ls drive/'My Drive'/'02456-Deep-Learning-Project'
data_path = 'drive/My Drive/02456-Deep-Learning-Project/Data'

Data


In [19]:
def test_ability_to_create_file_on_drive():
  """Smoke-test that the folder at the global `data_path` is writable.

  Writes a 100-row dummy log to `<data_path>/dummy_test.csv` (overwriting any
  existing file) and then appends one extra line to the same file. Any I/O
  failure propagates to the caller as an exception.
  """
  csv_path = data_path+'/dummy_test.csv'

  # Check we can create a log file
  columns = ['Step', 'Mean reward']
  dummy_df = pd.DataFrame(np.random.randn(100, 2), columns=columns)
  # Using , as separator so the file can be opened in Google Sheets to verify the data
  dummy_df.to_csv(csv_path, index=None, sep=',', mode='w')

  # Check we can also append; `with` closes the handle even if the write fails.
  with open(csv_path, "a") as f:
    f.write("\n I can also append")

test_ability_to_create_file_on_drive()

# Hyperparameters
Hyperparameters. These values should be a good starting point. You can modify them later once you have a working implementation.

In [6]:
# Hyperparameters (module-level globals read by train_network / policy_evaluation)
total_steps = 8e6  # the training loop runs while the collected step count is below this
num_envs = 32  # passed to make_env and Storage; presumably the number of parallel environments
num_levels = 10  # passed to make_env as num_levels (and as start_level for evaluation)
num_steps = 256  # policy steps collected per training iteration
num_epochs = 3  # optimisation passes over the collected data per iteration
batch_size = 512  # passed to storage.get_generator
eps = .2  # clip range used for both the policy loss and the value loss
grad_eps = .5  # maximum gradient norm given to clip_grad_norm_
value_coef = .5  # weight of the value loss in the total loss
entropy_coef = .01  # weight of the entropy term subtracted from the total loss

feature_dim= 512    # <- The only thing we chose ourselves; output size of the Encoder

#Fixed values
in_channels = 3 #RGB
num_actions = 15  #Number of actions in the Procgen environment



# Network definitions 
We have defined a policy network for you in advance. It uses the popular `NatureDQN` encoder architecture (see below), while policy and value functions are linear projections from the encodings. There is plenty of opportunity to experiment with architectures, so feel free to do that! Perhaps implement the `Impala` encoder from [this paper](https://arxiv.org/pdf/1802.01561.pdf) (perhaps minus the LSTM).

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from utils import make_env, Storage, orthogonal_init


class Flatten(nn.Module):
    """Collapse every dimension after the first into a single one."""

    def forward(self, x):
        # Keep the leading (batch) dimension, merge the rest.
        batch = x.shape[0]
        return x.view(batch, -1)


class Encoder(nn.Module):
  """Convolutional encoder: three Conv2d+ReLU stages, then a linear projection.

  Args:
    in_channels: number of channels of the input images.
    feature_dim: size of the feature vector produced per input.

  The linear layer expects 1024 flattened features (64 channels on a 4x4 map),
  which is what this conv stack produces for 64x64 inputs.
  """

  def __init__(self, in_channels, feature_dim):
    super().__init__()
    stack = [
        nn.Conv2d(in_channels, 32, kernel_size=8, stride=4),
        nn.ReLU(),
        nn.Conv2d(32, 64, kernel_size=4, stride=2),
        nn.ReLU(),
        nn.Conv2d(64, 64, kernel_size=3, stride=1),
        nn.ReLU(),
        Flatten(),
        nn.Linear(1024, feature_dim),
        nn.ReLU(),
    ]
    self.layers = nn.Sequential(*stack)
    # Run the shared initialiser over every sub-module.
    self.apply(orthogonal_init)

  def forward(self, x):
    features = self.layers(x)
    return features


class Policy(nn.Module):
  """Actor-critic network: a shared encoder with linear policy and value heads.

  Args:
    encoder: module mapping a batch of observations to `feature_dim` features.
    feature_dim: size of the encoder output.
    num_actions: number of discrete actions (size of the policy head output).
  """

  def __init__(self, encoder, feature_dim, num_actions):
    super().__init__()
    self.encoder = encoder
    self.policy = orthogonal_init(nn.Linear(feature_dim, num_actions), gain=.01)
    self.value = orthogonal_init(nn.Linear(feature_dim, 1), gain=1.)

  def act(self, x):
    """Sample actions for a batch of observations without tracking gradients.

    Args:
      x: batch of observations; it is moved to the device the network lives on.

    Returns:
      Tuple `(action, log_prob, value)`, all moved to the CPU.
    """
    with torch.no_grad():
      # Follow the module's own device instead of assuming CUDA, so the policy
      # also works on CPU-only runtimes.
      device = next(self.parameters()).device
      x = x.to(device).contiguous()
      dist, value = self.forward(x)
      action = dist.sample()
      log_prob = dist.log_prob(action)

    return action.cpu(), log_prob.cpu(), value.cpu()

  def forward(self, x):
    """Return the action distribution and value estimate for observations `x`.

    Returns:
      Tuple `(dist, value)`: a `Categorical` distribution built from the policy
      head's logits, and the value head's output with dimension 1 squeezed out.
    """
    x = self.encoder(x)
    logits = self.policy(x)
    value = self.value(x).squeeze(1)
    # Categorical accepts either probs or logits; the head produces logits.
    dist = torch.distributions.Categorical(logits=logits)
    return dist, value

# Training

In [9]:
"""
  TODO: 
    1. Maybe update this to take hyperparams as input 
    2. Update to do data logging in files instead of as prints
"""
def train_network(print_output=False):
  """Train a PPO policy and save its weights to `checkpoint.pt`.

  Reads the module-level hyperparameters (`total_steps`, `num_envs`,
  `num_levels`, `num_steps`, `num_epochs`, `batch_size`, `eps`, `grad_eps`,
  `value_coef`, `entropy_coef`, `feature_dim`, `in_channels`, `num_actions`).

  Args:
    print_output: when True, print the environment spaces, the mean reward
      after every iteration and a completion message.

  Returns:
    The trained `Policy` (in train mode, on the GPU), so that later cells can
    evaluate it without relying on a global variable.
  """
  # Define environment
  # check the utils.py file for info on arguments
  env = make_env(num_envs, num_levels=num_levels)
  if print_output:
    print('Observation space:', env.observation_space)
    print('Action space:', env.action_space.n)

  encoder = Encoder(in_channels = in_channels, feature_dim = feature_dim)
  policy = Policy(encoder = encoder, feature_dim = feature_dim, num_actions = num_actions)
  policy.cuda()

  # Define optimizer
  # these are reasonable values but probably not optimal
  optimizer = torch.optim.Adam(policy.parameters(), lr=5e-4, eps=1e-5)

  # Define temporary storage
  # we use this to collect transitions during each iteration
  storage = Storage(
      env.observation_space.shape,
      num_steps,
      num_envs
  )

  # Run training
  obs = env.reset()
  step = 0
  while step < total_steps:

    # Use policy to collect data for num_steps steps
    policy.eval()
    for _ in range(num_steps):
      # Use policy
      action, log_prob, value = policy.act(obs)

      # Take step in environment
      next_obs, reward, done, info = env.step(action)

      # Store data
      storage.store(obs, action, reward, done, info, log_prob, value)

      # Update current observation
      obs = next_obs

    # Add the last observation to collected data
    _, _, value = policy.act(obs)
    storage.store_last(obs, value)

    # Compute return and advantage
    storage.compute_return_advantage()

    # Optimize policy
    policy.train()
    for epoch in range(num_epochs):

      # Iterate over batches of transitions
      generator = storage.get_generator(batch_size)
      for batch in generator:
        b_obs, b_action, b_log_prob, b_value, b_returns, b_advantage = batch

        # Get current policy outputs
        new_dist, new_value = policy(b_obs)
        new_log_prob = new_dist.log_prob(b_action)

        # Clipped policy objective
        pi_loss = ClippedPPOLoss(log_pi= new_log_prob,
                                sampled_log_pi=b_log_prob,
                                advantage=b_advantage,
                                clip=eps)

        # Clipped value function objective
        # NOTE(review): the original author flagged `sampled_return` as
        # possibly wrong; it is used as the regression target of the value
        # loss - confirm that Storage's returns are what is intended here.
        value_loss = ClippedValueFunctionLoss(value= new_value,
                                              sampled_value=b_value,
                                              sampled_return= b_returns,
                                              clip=eps)

        # Entropy bonus (subtracted below, so higher entropy lowers the loss)
        entropy_loss = new_dist.entropy().mean()

        # Backpropagate losses
        loss = pi_loss + value_coef * value_loss - entropy_coef*entropy_loss
        loss.backward()

        # Clip gradients
        torch.nn.utils.clip_grad_norm_(policy.parameters(), grad_eps)

        # Update policy
        optimizer.step()
        optimizer.zero_grad()

    # Update stats
    step += num_envs * num_steps
    if print_output:
      print(f'Step: {step}\tMean reward: {storage.get_reward()}')

  if print_output:
    print('Completed training!')
  # state_dict must be *called*: passing the bound method would pickle the
  # method object rather than the parameter dictionary.
  torch.save(policy.state_dict(), 'checkpoint.pt')
  return policy

# Post training processing 
The cell below can be used for policy evaluation; it also saves an episode to an mp4 file for you to view.

In [None]:
def policy_evaluation(video_name='vid', print_output=False, model=None):
  """Run the policy in evaluation environments and save the frames as a video.

  Args:
    video_name: output file name without extension; `.mp4` is appended.
    print_output: when True, print the average return.
    model: the policy to evaluate. When omitted, the global variable `policy`
      is used (the original behaviour), which must then exist.

  Returns:
    The reward summed over the 512 evaluation steps, averaged over the
    environments.
  """
  # Resolve the policy explicitly so the function does not silently depend on
  # hidden notebook state when a model is passed in.
  agent = policy if model is None else model

  # Make evaluation environment
  eval_env = make_env(num_envs, start_level=num_levels, num_levels=num_levels)
  obs = eval_env.reset()

  frames = []
  total_reward = []

  # Evaluate policy
  agent.eval()
  for _ in range(512):

    # Use policy (log-probabilities and values are not needed here)
    action, _, _ = agent.act(obs)

    # Take step in environment
    obs, reward, done, info = eval_env.step(action)
    total_reward.append(torch.Tensor(reward))

    # Render environment and store
    frame = (torch.Tensor(eval_env.render(mode='rgb_array'))*255.).byte()
    frames.append(frame)

  # Calculate average return
  total_reward = torch.stack(total_reward).sum(0).mean(0)
  if print_output:
    print('Average return:', total_reward)

  # Save frames as video
  frames = torch.stack(frames)
  imageio.mimsave(video_name+'.mp4', frames, fps=25)

  return total_reward