<a href="https://colab.research.google.com/github/SebastianLarssonDTU/02456-Reinforcement-Learning-Project/blob/timed_training/getting_started_ppo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

import imageio

from datetime import datetime
from pytz import timezone 
import time

import glob

# Getting started with PPO and ProcGen

Here's a bit of code that should help you get started on your projects.

The cell below installs `procgen` and downloads a small `utils.py` script that contains some utility functions. You may want to inspect the file for more details.

In [None]:
!pip install procgen
!wget https://raw.githubusercontent.com/nicklashansen/ppo-procgen-utils/main/utils.py

# Mounting Drive

In [None]:
# Mount Google Drive at /content/drive (google.colab is only available on Colab).
# The relative DATA_PATH / MODEL_PATH used by this notebook start with 'drive/',
# so they presumably rely on the working directory being /content -- TODO confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#check folders/files in drive
!ls drive/'My Drive'/'02456-Deep-Learning-Project'
# Directory prefixes (relative to the current working directory) used when
# writing training logs (DATA_PATH) and saved models (MODEL_PATH).
# Both end with a trailing slash, so file names are appended directly.
DATA_PATH = 'drive/My Drive/02456-Deep-Learning-Project/Data/'
MODEL_PATH = 'drive/My Drive/02456-Deep-Learning-Project/Models/'

In [None]:
def test_ability_to_create_file_on_drive():
  """Smoke-test that a file can be created in, and appended to, DATA_PATH.

  Writes a 100x2 frame of random numbers to ``dummy_test.csv`` under
  DATA_PATH and then appends one extra line of text to the same file.
  """
  #Check we can create a log file
  columns = ['Step', 'Mean reward']
  dummy_df = pd.DataFrame(np.random.randn(100, 2), columns=columns)
  target = DATA_PATH+'/dummy_test.csv'
  #Using , as separator so the file can be opened in Google Sheets to verify the data
  dummy_df.to_csv(target, index=None, sep=',', mode='w')

  # The context manager closes the handle even if the write raises
  with open(target, "a") as f:
    f.write("\n I can also append")

#test_ability_to_create_file_on_drive()

# Helper functions

In [None]:
# #Copied from https://lab-ml.com/labml_nn/rl/ppo/

# def ClippedPPOLoss(log_pi: torch.Tensor, sampled_log_pi: torch.Tensor, advantage: torch.Tensor, clip: float) -> torch.Tensor:
#   ratio = torch.exp(log_pi - sampled_log_pi)
#   clipped_ratio = ratio.clamp(min=1.0 - clip, max=1.0 + clip)
#   policy_reward = torch.min(ratio * advantage, clipped_ratio * advantage)
#   #clip_fraction = (abs((ratio - 1.0)) > clip).to(torch.float).mean()
#   return -policy_reward.mean()

# def ClippedValueFunctionLoss(value: torch.Tensor, sampled_value: torch.Tensor, sampled_return: torch.Tensor, clip: float):
#   clipped_value = sampled_value + (value - sampled_value).clamp(min=-clip, max=clip)
#   vf_loss = torch.max((value - sampled_return) ** 2, (clipped_value - sampled_return) ** 2)
#   return 0.5 * vf_loss.mean()


def ClippedPPOLoss(advantage, log_pi, log_old_pi, eps):
  """Mean clipped surrogate objective of PPO.

  The probability ratio exp(log_pi - log_old_pi) is clamped to
  [1 - eps, 1 + eps]; the element-wise minimum of the clipped and unclipped
  ratio-times-advantage terms is averaged.

  Note: the value is returned WITHOUT a sign flip, i.e. it is the quantity to
  maximise; callers are responsible for negating it when building a loss.
  """
  prob_ratio = (log_pi - log_old_pi).exp()
  unclipped_term = prob_ratio * advantage
  clipped_term = prob_ratio.clamp(1 - eps, 1 + eps) * advantage
  return torch.minimum(unclipped_term, clipped_term).mean()

def ValueFunctionLoss(new_value, old_value):
  """Mean of the squared element-wise difference between the two tensors."""
  squared_error = (new_value - old_value).pow(2)
  return torch.mean(squared_error)
  

In [None]:
def extract_data_from_csv(file_name):
  df = pd.read_csv(DATA_PATH+file_name)
  return df

In [None]:
def create_index_table_from_txt_files():
  """Collect every ``*.txt`` file in DATA_PATH into one table.

  Each file is read as CSV, indexed by its 'Parameter name' column and
  transposed, so the parameter names become columns and every file
  contributes its remaining column(s) as row(s).

  Returns:
    A DataFrame with the rows of all files stacked; an empty DataFrame when
    no ``*.txt`` file is found.
  """
  frames = []
  for path in glob.glob(DATA_PATH +'*.txt'):
    params = pd.read_csv(path).set_index('Parameter name')
    frames.append(params.transpose())

  # pd.concat raises on an empty list, so keep returning an empty frame in that case
  if not frames:
    return pd.DataFrame()
  # One concat at the end: DataFrame.append was removed in pandas 2.0 and
  # appending inside the loop copied the accumulated frame on every iteration.
  return pd.concat(frames)

#TODO: Is this the right result?
def update_index_file_with_result(df):
  """Add a 'Last Mean Reward' column holding the final value logged for each run.

  For every row, the (stripped) 'file_name' entry is used to open
  ``DATA_PATH + name + '.csv'``; the text after the last comma on the file's
  last line is stored, whitespace-stripped, in 'Last Mean Reward'. The value
  is kept as a string and is not validated, so a log that only contains the
  header row yields the header text. Rows whose log file is empty keep "".

  Args:
    df: table with a 'file_name' column; it is modified in place.

  Returns:
    The same ``df`` object.

  Raises:
    ValueError: if the last line of a log file contains no comma.
  """
  df['Last Mean Reward'] = ""
  name_col = df.columns.get_loc('file_name')
  reward_col = df.columns.get_loc('Last Mean Reward')

  for i in range(len(df)):
    # Positional access: the row labels are not guaranteed to be 0..n-1 (or unique)
    name = df.iloc[i, name_col].strip()

    #read csv file at DATA_PATH with current filename and keep only its last line
    last_line = None   # reset per file so an empty log cannot reuse the previous run's line
    with open(DATA_PATH + name +'.csv', "r") as f:
      for last_line in f:
        pass
    if last_line is None:
      continue

    _, reward = last_line.rsplit(",", 1)
    #add to table (single .iloc write instead of chained df[col][i] assignment)
    df.iloc[i, reward_col] = reward.strip()
  return df
  

# Baseline Hyper Params
Hyperparameters. These values should be a good starting point. You can modify them later once you have a working implementation.

In [None]:
# Fixed values
in_channels = 3     # RGB
num_actions = 15    # Number of actions in the Procgen environment

# Size of the encoder output -- the only value here that we chose ourselves
feature_dim = 512

In [None]:
#Hyperparameters from Notebook (Given by Nicklas)
def set_hyperparameters_to_baseline_Nicklas():
  """Publish the notebook's original (Nicklas) hyperparameters as module globals."""
  baseline = dict(
      total_steps=8e6,
      num_envs=32,
      num_levels=10,
      num_steps=256,
      num_epochs=3,
      batch_size=512,
      eps=.2,
      grad_eps=.5,
      value_coef=.5,
      entropy_coef=.01,
      gamma=0.99,
      lmbda=0.95,
      lr=5e-4,
      version='Baseline(Nicklas)',
  )
  # Same effect as declaring each name `global` and assigning it
  globals().update(baseline)



In [None]:
#Hyperparameters inspired by PPO Paper ( without variable learning rate )
def set_hyperparameters_to_baseline_PPO():
  """Publish the PPO-paper-inspired hyperparameters as module globals."""
  baseline = dict(
      total_steps=8e6,
      num_envs=32,
      num_levels=10,
      num_steps=128,
      num_epochs=3,
      batch_size=256,
      eps=.1,
      grad_eps=.5,
      value_coef=1,
      entropy_coef=.01,
      gamma=0.99,
      lmbda=0.95,
      lr=2.5e-4,
      version='Baseline(PPO)',
  )
  # Same effect as declaring each name `global` and assigning it
  globals().update(baseline)

In [None]:
#Hyperparameters inspired by Procgen Paper (With 32 instead of 64 environments because of memory)
def set_hyperparameters_to_baseline_Procgen():
  """Publish the Procgen-paper-inspired hyperparameters as module globals."""
  baseline = dict(
      total_steps=8e6,
      num_envs=32,
      num_levels=10,
      num_steps=256,
      num_epochs=3,
      batch_size=512,
      eps=.2,
      grad_eps=.5,
      value_coef=.5,
      entropy_coef=.01,
      gamma=0.999,
      lmbda=0.95,
      lr=5e-4,
      version='Baseline(Procgen)',
  )
  # Same effect as declaring each name `global` and assigning it
  globals().update(baseline)

In [None]:
def set_hyperparameters(baseline='Procgen'):
  """Set the global hyperparameters to one of the predefined baselines.

  Args:
    baseline: name of the baseline to load; one of 'Procgen', 'PPO' or
      'Nicklas'.

  Raises:
    NotImplementedError: if ``baseline`` is not a known baseline name.
  """
  implemented_baselines = {
      'Procgen': set_hyperparameters_to_baseline_Procgen,
      'PPO': set_hyperparameters_to_baseline_PPO,
      # This setter exists in the notebook but was previously unreachable from here
      'Nicklas': set_hyperparameters_to_baseline_Nicklas,
  }

  if baseline not in implemented_baselines:
    raise NotImplementedError("The implemented baselines are: {}".format(implemented_baselines.keys()))
  implemented_baselines[baseline]()

# Network definitions 
We have defined a policy network for you in advance. It uses the popular `NatureDQN` encoder architecture (see below), while policy and value functions are linear projections from the encodings. There is plenty of opportunity to experiment with architectures, so feel free to do that! Perhaps implement the `Impala` encoder from [this paper](https://arxiv.org/pdf/1802.01561.pdf) (perhaps minus the LSTM).

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from utils import make_env, Storage, orthogonal_init


class Flatten(nn.Module):
    """Module that collapses every dimension after the first into a single one."""

    def forward(self, x):
        leading = x.size(0)
        return x.view(leading, -1)


class Encoder(nn.Module):
  """Convolutional encoder: three Conv2d+ReLU stages, a flatten, then Linear+ReLU.

  The final linear layer expects 1024 flattened features, which is what the
  three convolutions produce for 64x64 inputs. `orthogonal_init` (from utils)
  is applied to the module tree after construction.
  """

  def __init__(self, in_channels, feature_dim):
    super().__init__()
    # (in_channels, out_channels, kernel_size, stride) for each conv stage
    conv_specs = [
        (in_channels, 32, 8, 4),
        (32, 64, 4, 2),
        (64, 64, 3, 1),
    ]
    stack = []
    for c_in, c_out, kernel, stride in conv_specs:
      stack.append(nn.Conv2d(in_channels=c_in, out_channels=c_out, kernel_size=kernel, stride=stride))
      stack.append(nn.ReLU())
    stack.append(Flatten())
    stack.append(nn.Linear(in_features=1024, out_features=feature_dim))
    stack.append(nn.ReLU())

    self.layers = nn.Sequential(*stack)
    self.apply(orthogonal_init)

  def forward(self, x):
    features = self.layers(x)
    return features


class Policy(nn.Module):
  """Actor-critic head on top of a shared encoder.

  Two linear layers map the encoder's features to action logits and to a
  scalar state value.

  Args:
    encoder: module mapping observations to feature vectors.
    feature_dim: size of the encoder's output feature vector.
    num_actions: number of discrete actions (size of the logits).
  """

  def __init__(self, encoder, feature_dim, num_actions):
    super().__init__()
    self.encoder = encoder
    self.policy = orthogonal_init(nn.Linear(feature_dim, num_actions), gain=.01)
    self.value = orthogonal_init(nn.Linear(feature_dim, 1), gain=1.)

  def act(self, x):
    """Sample actions for a batch of observations without tracking gradients.

    The input is moved to whichever device the policy's parameters live on
    (instead of unconditionally to CUDA), so this works on CPU-only runtimes
    and is unchanged when the policy has been moved to the GPU.

    Returns:
      Tuple ``(action, log_prob, value)``, all moved back to the CPU.
    """
    device = next(self.parameters()).device
    with torch.no_grad():
      x = x.to(device).contiguous()
      dist, value = self.forward(x)
      action = dist.sample()
      log_prob = dist.log_prob(action)

    return action.cpu(), log_prob.cpu(), value.cpu()

  def forward(self, x):
    """Return the action distribution and the value estimate for ``x``.

    Returns:
      Tuple ``(dist, value)``: a Categorical distribution built from the
      logits, and the value head's output with its size-1 dim 1 squeezed away.
    """
    x = self.encoder(x)
    logits = self.policy(x)
    value = self.value(x).squeeze(1)
    #Creates a categorical distribution parameterized by either probs or logits
    dist = torch.distributions.Categorical(logits=logits)
    #sample with dist.sample()
    return dist, value

# Training definition

In [None]:
"""
  TODO: 
    1. Maybe update this to take hyperparams as input 
    2. Update to do data logging in files instead of as prints
"""
def train_network(print_output=False, 
                  file_name=None,
                  total_steps = 8e6,
                  num_envs = 32,
                  num_levels = 10,
                  num_steps = 256,
                  num_epochs = 3,
                  batch_size = 512,
                  eps = .2,
                  grad_eps = .5,
                  value_coef = .5,
                  entropy_coef = .01,
                  lr=5e-4,
                  opt_extra = 1e-5,
                  gamma=0.99,
                  lmbda = 0.95, 
                  version = '',
                  optimizer = 'Adam'):    #does not change anything, just for logging purposes
  """Train a PPO policy and log the run under DATA_PATH / MODEL_PATH.

  Side effects:
    * ``DATA_PATH + file_name + '.csv'``: one "step, mean reward" row per
      training iteration.
    * ``DATA_PATH + file_name + '.txt'``: the hyperparameters of the run.
    * ``MODEL_PATH + file_name + '.pt'``: the trained policy's state dict.

  Args:
    print_output: if True, also print environment info and per-iteration stats.
    file_name: base name of the output files; defaults to
      ``version + '_Run_' + <timestamp in Europe/Copenhagen>``.
    total_steps: training stops once at least this many environment steps
      (``num_envs * num_steps`` per iteration) have been collected.
    num_envs, num_levels: forwarded to ``make_env``.
    num_steps: rollout length per environment and iteration.
    num_epochs: optimisation passes over each rollout.
    batch_size: mini-batch size requested from the storage generator.
    eps: clipping range of the PPO objective.
    grad_eps: maximum gradient norm.
    value_coef, entropy_coef: weights of the value loss and the entropy bonus.
    lr, opt_extra: learning rate and ``eps`` of the Adam optimizer.
    gamma, lmbda: forwarded to ``Storage``.
    version: free-text tag used in the default file name and the log.
    optimizer: only written to the log; Adam is always used.
  """
  if file_name is None:
    now = datetime.now(timezone('Europe/Copenhagen'))
    file_name = version+'_Run_' + now.strftime("%d%b_%Hh%Mm%Ss")
    
  hyper_params={}
  
  # Snapshot the arguments for the log. This must run before any further local
  # variable is bound, otherwise those locals would end up in the log as well.
  for key, val in reversed(list(locals().items())):
    if key in ['print_output', 'hyper_params', 'now']:
      continue
    hyper_params[key] = val

  csv_path = DATA_PATH+file_name+'.csv'
  txt_path = DATA_PATH+file_name+'.txt'
  model_path = MODEL_PATH + file_name+'.pt'

  #create file
  with open(csv_path, "w") as f:
    f.write("Step, Mean reward")

  #create txt file for hyper params
  with open(txt_path, "w") as f:
    f.write("Parameter name, Value")
    for key, val in hyper_params.items():  
      f.write("\n{}, {}".format(key, val))

  # Define environment
  # check the utils.py file for info on arguments
  env = make_env(num_envs, num_levels=num_levels)
  if print_output:
    print('Observation space:', env.observation_space)
    print('Action space:', env.action_space.n)

  encoder = Encoder(in_channels = in_channels, feature_dim = feature_dim)
  policy = Policy(encoder = encoder, feature_dim = feature_dim, num_actions = num_actions)
  policy.cuda()

  # Define optimizer
  # these are reasonable values but probably not optimal
  # (kept under its own name so the `optimizer` logging argument is not shadowed)
  adam = torch.optim.Adam(policy.parameters(), lr=lr, eps=opt_extra) #OBS: Remember to change dummy in input too for logging purposes

  # Define temporary storage
  # we use this to collect transitions during each iteration
  storage = Storage(
      env.observation_space.shape,
      num_steps,
      num_envs,
      gamma = gamma,
      lmbda = lmbda
  )

  # Run training
  obs = env.reset()
  step = 0
  while step < total_steps:

    # Use policy to collect data for num_steps steps
    policy.eval()
    for _ in range(num_steps):
      # Use policy
      action, log_prob, value = policy.act(obs)
      
      # Take step in environment
      next_obs, reward, done, info = env.step(action)

      # Store data
      storage.store(obs, action, reward, done, info, log_prob, value)
      
      # Update current observation
      obs = next_obs

    # Add the last observation to collected data
    _, _, value = policy.act(obs)
    storage.store_last(obs, value)

    # Compute return and advantage
    storage.compute_return_advantage()

    # Optimize policy
    policy.train()
    for _ in range(num_epochs):

      # Iterate over batches of transitions
      generator = storage.get_generator(batch_size)
      for batch in generator:
        #Results from using old policy on environment
        b_obs, b_action, b_log_prob, b_value, b_returns, b_advantage = batch

        # Get current policy outputs
        new_dist, new_value = policy(b_obs)
        new_log_prob = new_dist.log_prob(b_action)

        # Clipped policy objective
        pi_loss = ClippedPPOLoss(advantage=b_advantage, 
                                 log_pi=new_log_prob, 
                                 log_old_pi=b_log_prob, 
                                 eps=eps)

        # Value function objective: regress the new value estimate on the
        # computed returns. Regressing on the old estimate (b_value) would only
        # teach the critic to reproduce its own previous predictions.
        value_loss= ValueFunctionLoss(new_value=new_value, 
                                      old_value= b_returns)

        # Entropy bonus
        entropy_loss = new_dist.entropy().mean()

        # Backpropagate losses (maximise objective + entropy bonus, minimise value loss)
        loss = -(pi_loss - value_coef * value_loss + entropy_coef*entropy_loss)
        loss.backward()

        # Clip gradients
        torch.nn.utils.clip_grad_norm_(policy.parameters(), grad_eps)

        # Update policy
        adam.step()
        adam.zero_grad()

    # Update stats
    step += num_envs * num_steps
    mean_reward = storage.get_reward()
    if print_output:
      print(f'Step: {step}\tMean reward: {mean_reward}')
    with open(csv_path, "a") as f:
      f.write("\n{}, {}".format(step, mean_reward))

  if print_output:
    print('Completed training!')
  # state_dict must be CALLED: saving the bound method would pickle the whole
  # module instead of the parameter dictionary that load_state_dict expects.
  torch.save(policy.state_dict(), model_path)
  

# Timed Training

In [None]:
def timed_training(print_output=False, 
                  file_name=None,
                  total_steps = 8e6,
                  num_envs = 32,
                  num_levels = 10,
                  num_steps = 256,
                  num_epochs = 3,
                  batch_size = 512,
                  eps = .2,
                  grad_eps = .5,
                  value_coef = .5,
                  entropy_coef = .01,
                  lr=5e-4,
                  opt_extra = 1e-5,
                  gamma=0.99,
                  lmbda = 0.95, 
                  version = '',
                  optimizer = 'Adam',
                  time_limit_hours = 0,
                  time_limit_minutes = 30,
                  time_limit_seconds = 0):   #They will be added together
  """Train a PPO policy until ``total_steps`` is reached or a time limit expires.

  The time limit is the sum of the three ``time_limit_*`` arguments. It is
  checked once at the start of every training iteration, so a run may exceed
  the limit by up to one iteration.

  Side effects:
    * ``DATA_PATH + file_name + '.csv'``: one "step, mean reward" row per
      training iteration.
    * ``DATA_PATH + file_name + '.txt'``: the hyperparameters, the time limit,
      the time spent, the steps taken and whether the run finished
      ("Done, True") or was cut off by the time limit ("Done, False").
    * ``MODEL_PATH + file_name + '.pt'``: the policy's state dict.

  Args:
    print_output: if True, also print environment info and per-iteration stats.
    file_name: base name of the output files; defaults to
      ``version + '_Run_' + <timestamp in Europe/Copenhagen>``.
    total_steps: training stops once at least this many environment steps
      (``num_envs * num_steps`` per iteration) have been collected.
    num_envs, num_levels: forwarded to ``make_env``.
    num_steps: rollout length per environment and iteration.
    num_epochs: optimisation passes over each rollout.
    batch_size: mini-batch size requested from the storage generator.
    eps: clipping range of the PPO objective.
    grad_eps: maximum gradient norm.
    value_coef, entropy_coef: weights of the value loss and the entropy bonus.
    lr, opt_extra: learning rate and ``eps`` of the Adam optimizer.
    gamma, lmbda: forwarded to ``Storage``.
    version: free-text tag used in the default file name and the log.
    optimizer: only written to the log; Adam is always used.
    time_limit_hours, time_limit_minutes, time_limit_seconds: parts of the
      wall-clock budget; they are added together.
  """
  if file_name is None:
    now = datetime.now(timezone('Europe/Copenhagen'))
    file_name = version+'_Run_' + now.strftime("%d%b_%Hh%Mm%Ss")
    
  hyper_params={}
  
  # Snapshot the arguments for the log. This must run before any further local
  # variable is bound, otherwise those locals would end up in the log as well.
  for key, val in reversed(list(locals().items())):
    if key in ['print_output', 'hyper_params', 'now', 'time_limit_hours', 'time_limit_seconds', 'time_limit_minutes']:
      continue
    hyper_params[key] = val

  csv_path = DATA_PATH+file_name+'.csv'
  txt_path = DATA_PATH+file_name+'.txt'
  model_path = MODEL_PATH + file_name+'.pt'

  #create file
  with open(csv_path, "w") as f:
    f.write("Step, Mean reward")

  #create txt file for hyper params
  with open(txt_path, "w") as f:
    f.write("Parameter name, Value")
    for key, val in hyper_params.items():  
      f.write("\n{}, {}".format(key, val))

  #start time
  start_time = time.time()

  # Define environment
  # check the utils.py file for info on arguments
  env = make_env(num_envs, num_levels=num_levels)
  if print_output:
    print('Observation space:', env.observation_space)
    print('Action space:', env.action_space.n)

  encoder = Encoder(in_channels = in_channels, feature_dim = feature_dim)
  policy = Policy(encoder = encoder, feature_dim = feature_dim, num_actions = num_actions)
  policy.cuda()

  # Define optimizer
  # these are reasonable values but probably not optimal
  # (kept under its own name so the `optimizer` logging argument is not shadowed)
  adam = torch.optim.Adam(policy.parameters(), lr=lr, eps=opt_extra) #OBS: Remember to change dummy in input too for logging purposes

  # Define temporary storage
  # we use this to collect transitions during each iteration
  storage = Storage(
      env.observation_space.shape,
      num_steps,
      num_envs,
      gamma = gamma,
      lmbda = lmbda
  )

  # Run training
  obs = env.reset()
  step = 0

  #calculate time limit in seconds
  time_limit = 60*60*time_limit_hours+60*time_limit_minutes + time_limit_seconds
  with open(txt_path, "a") as f:
    f.write("\n Time limit, {:.0f}:{:.0f}:{:.0f}".format(time_limit_hours, time_limit_minutes, time_limit_seconds))

  while step < total_steps:

    #Test time_limit
    check_time = time.time()
    if check_time-start_time > time_limit:
      end_time = check_time
      finished = False
      break

    # Use policy to collect data for num_steps steps
    policy.eval()
    for _ in range(num_steps):
      # Use policy
      action, log_prob, value = policy.act(obs)
      
      # Take step in environment
      next_obs, reward, done, info = env.step(action)

      # Store data
      storage.store(obs, action, reward, done, info, log_prob, value)
      
      # Update current observation
      obs = next_obs

    # Add the last observation to collected data
    _, _, value = policy.act(obs)
    storage.store_last(obs, value)

    # Compute return and advantage
    storage.compute_return_advantage()

    # Optimize policy
    policy.train()
    for _ in range(num_epochs):

      # Iterate over batches of transitions
      generator = storage.get_generator(batch_size)
      for batch in generator:
        #Results from using old policy on environment
        b_obs, b_action, b_log_prob, b_value, b_returns, b_advantage = batch

        # Get current policy outputs
        new_dist, new_value = policy(b_obs)
        new_log_prob = new_dist.log_prob(b_action)

        # Clipped policy objective
        pi_loss = ClippedPPOLoss(advantage=b_advantage, 
                                 log_pi=new_log_prob, 
                                 log_old_pi=b_log_prob, 
                                 eps=eps)

        # Value function objective: regress the new value estimate on the
        # computed returns. Regressing on the old estimate (b_value) would only
        # teach the critic to reproduce its own previous predictions.
        value_loss= ValueFunctionLoss(new_value=new_value, 
                                      old_value= b_returns)

        # Entropy bonus
        entropy_loss = new_dist.entropy().mean()

        # Backpropagate losses (maximise objective + entropy bonus, minimise value loss)
        loss = -(pi_loss - value_coef * value_loss + entropy_coef*entropy_loss)
        loss.backward()

        # Clip gradients
        torch.nn.utils.clip_grad_norm_(policy.parameters(), grad_eps)

        # Update policy
        adam.step()
        adam.zero_grad()

    # Update stats
    step += num_envs * num_steps
    mean_reward = storage.get_reward()
    if print_output:
      print(f'Step: {step}\tMean reward: {mean_reward}')
    with open(csv_path, "a") as f:
      f.write("\n{}, {}".format(step, mean_reward))

  else:
    # Only reached when the loop ended because total_steps was hit (no break)
    if print_output:
      print('Completed training!')
    end_time = time.time()
    finished = True

  #Add to log file (shared by the timed-out and the completed path)
  with open(txt_path, "a") as f:
    f.write("\n Time spent (in seconds), {:.2f}".format(end_time-start_time))
    f.write("\n Steps taken, {}".format(step))
    f.write("\n Done, {}".format(finished))
  # state_dict must be CALLED: saving the bound method would pickle the whole
  # module instead of the parameter dictionary that load_state_dict expects.
  torch.save(policy.state_dict(), model_path)
    

# Do Training

In [None]:
# Short test run: stops after 4e4 environment steps or 30 seconds, whichever comes first
timed_training(
    version='test',
    total_steps=4e4,
    time_limit_minutes=0,
    time_limit_seconds=30,
    print_output=True,
)

# set_hyperparameters(baseline='Procgen')
# train_network(total_steps = total_steps,
#               num_envs = num_envs,
#               num_levels = num_levels,
#               num_steps = num_steps,
#               num_epochs = num_epochs,
#               batch_size = batch_size,
#               eps = eps,
#               grad_eps = grad_eps,
#               value_coef = value_coef,
#               entropy_coef = entropy_coef,
#               gamma = gamma,
#               lmbda = lmbda,
#               lr= lr,
#               version = version,
#               print_output=True)

In [None]:
# Build the table of logged runs from the .txt files in DATA_PATH ...
df = create_index_table_from_txt_files()
# ... and add each run's last logged reward. The call modifies df in place and
# returns it; as the last expression of the cell, the result is displayed.
update_index_file_with_result(df)
# df[["file_name", "Last Mean Reward"]]

# Post training processing 
The cell below can be used for policy evaluation; it also saves an episode as an mp4 for you to view.

In [None]:
def policy_evaluation(video_name='vid', print_output=False, policy=None, num_envs=None, num_levels=None):
  """Run a policy for 512 steps in an evaluation environment and save a video.

  The evaluation environment is created with ``start_level=num_levels``.

  Args:
    video_name: output path without extension; ``.mp4`` is appended.
    print_output: if True, print the average return.
    policy: policy to evaluate (must provide ``eval()`` and ``act(obs)``).
    num_envs: number of parallel environments passed to ``make_env``.
    num_levels: passed to ``make_env`` as both ``start_level`` and ``num_levels``.

    ``policy``, ``num_envs`` and ``num_levels`` fall back to the module-level
    variable of the same name when left as None, which keeps the old
    zero-argument call working when those globals exist.

  Returns:
    The rewards summed over the 512 steps and averaged over the environments.

  Raises:
    NameError: if one of the three values is neither passed nor defined globally.
  """
  # Resolve explicit arguments first, then globals, failing with a clear message
  requested = {'policy': policy, 'num_envs': num_envs, 'num_levels': num_levels}
  resolved = {}
  for name, value in requested.items():
    if value is None:
      if name not in globals():
        raise NameError("policy_evaluation needs '{}': pass it as an argument or define it globally".format(name))
      value = globals()[name]
    resolved[name] = value
  policy = resolved['policy']
  num_envs = resolved['num_envs']
  num_levels = resolved['num_levels']

  # Make evaluation environment
  eval_env = make_env(num_envs, start_level=num_levels, num_levels=num_levels)
  obs = eval_env.reset()

  frames = []
  total_reward = []

  # Evaluate policy
  policy.eval()
  for _ in range(512):

    # Use policy
    action, log_prob, value = policy.act(obs)

    # Take step in environment
    obs, reward, done, info = eval_env.step(action)
    total_reward.append(torch.Tensor(reward))

    # Render environment and store
    frame = (torch.Tensor(eval_env.render(mode='rgb_array'))*255.).byte()
    frames.append(frame)

  # Calculate average return
  total_reward = torch.stack(total_reward).sum(0).mean(0)
  if print_output:
    print('Average return:', total_reward)

  # Save frames as video
  frames = torch.stack(frames)
  imageio.mimsave(video_name+'.mp4', frames, fps=25)
  
  return total_reward

# Grid search ?
