## Run Policy Gradients

In [1]:
#@title imports

import os
import time

from cs285.infrastructure.rl_trainer import RL_Trainer
from cs285.agents.pg_agent import PGAgent

%load_ext autoreload
%autoreload 2

In [2]:
#@title runtime arguments

class Args:
  """Run configuration, readable as attributes or via dict-style indexing.

  Every setting is a class attribute so that it can be edited in place
  (the trailing `#@param` / `#@markdown` comments are notebook form
  annotations and must stay on the same line as the value they describe).
  The `__getitem__` / `__setitem__` / `__contains__` shims let code that
  expects a `params[...]` mapping use an instance of this class directly.
  """

  def __getitem__(self, key):
    """Return the attribute named `key`.

    Delegates to `getattr`, so a missing key raises AttributeError rather
    than KeyError.
    """
    return getattr(self, key)

  def __setitem__(self, key, val):
    """Store `val` as the attribute named `key` on this instance."""
    setattr(self, key, val)

  def __contains__(self, key):
    """Return True when an attribute named `key` exists (class- or instance-level)."""
    return hasattr(self, key)

  env_name = 'LunarLanderContinuous-v2' #@param
  # NOTE(review): the name appears to encode batch_size and learning_rate;
  # keep it in sync with the values below when changing them.
  exp_name = 'q3_b40000_r0.005' #@param

  #@markdown main parameters of interest
  n_iter = 100 #@param {type: "integer"}

  ## PDF will tell you how to set ep_len
  ## and discount for each environment
  ep_len = 200 #@param {type: "integer"}
  # Forwarded to the agent under the key 'gamma' by PG_Trainer.
  discount = 0.99 #@param {type: "number"}

  reward_to_go = True #@param {type: "boolean"}
  nn_baseline = True #@param {type: "boolean"}
  gae_lambda = None #@param {type: "number"}
  # Negated by PG_Trainer before being passed on as 'standardize_advantages'.
  dont_standardize_advantages = False #@param {type: "boolean"}

  #@markdown batches and steps
  # Also copied to 'train_batch_size' and 'batch_size_initial' later in the notebook.
  batch_size = 40000 #@param {type: "integer"}
  eval_batch_size = 400 #@param {type: "integer"}

  num_agent_train_steps_per_iter = 1 #@param {type: "integer"}
  learning_rate =  5e-3 #@param {type: "number"}

  #@markdown MLP parameters
  n_layers = 2 #@param {type: "integer"}
  size = 64 #@param {type: "integer"}

  #@markdown system
  save_params = False #@param {type: "boolean"}
  no_gpu = False #@param {type: "boolean"}
  which_gpu = 0 #@param {type: "integer"}
  seed = 1 #@param {type: "integer"}

  action_noise_std = 0 #@param {type: "number"}

  #@markdown logging
  ## default is to not log video so
  ## that logs are small enough to be
  ## uploaded to gradescope
  # A value > 0 triggers the "eventfiles too large" warning right after
  # the config is instantiated.
  video_log_freq =  -1#@param {type: "integer"}
  scalar_log_freq =  1#@param {type: "integer"}


# Instantiate the run configuration declared above.
args = Args()

# Mirror the batch size under the key name used by the hw1 code,
# so that code written against hw1 keeps working.
args.train_batch_size = args.batch_size

# Video logging inflates the event files; warn before any run starts.
if args.video_log_freq > 0:
  import warnings
  warnings.warn(
      '\nLogging videos will make eventfiles too'
      '\nlarge for the autograder. Set video_log_freq = -1'
      '\nfor the runs you intend to submit.'
  )

In [3]:
#@title create directory for logging

# Root directory under which every run gets its own sub-directory.
# NOTE(review): this is a machine-specific absolute Windows path; point it
# at a suitable location before running the notebook on another machine.
data_path = r'D:\Code\RL-homework\hw2\data'

# exist_ok=True makes the call idempotent and removes the racy
# "check os.path.exists, then create" pattern.
os.makedirs(data_path, exist_ok=True)

# Per-run directory: <exp_name>_<env_name>_<dd-mm-YYYY_HH-MM-SS>, so that
# repeated runs of the same experiment never overwrite each other.
logdir = args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
logdir = os.path.join(data_path, logdir)

# Expose the directory to the trainer through the shared config object.
args['logdir'] = logdir
os.makedirs(logdir, exist_ok=True)

In [4]:
## define policy gradient trainer

class PG_Trainer:
    """Configure an ``RL_Trainer`` to train a ``PGAgent`` and expose its loop.

    The constructor augments the supplied ``params`` mapping in place with
    the agent class, the agent hyper-parameters and the initial batch size,
    then hands that same mapping to ``RL_Trainer``.
    """

    def __init__(self, params):
        """Assemble agent parameters from ``params`` and build the trainer.

        Args:
            params: mapping-like object supporting ``params[key]`` reads and
                ``params[key] = value`` writes. It is mutated: the keys
                'agent_class', 'agent_params' and 'batch_size_initial'
                are added.
        """
        agent_params = {
            # --- network / optimizer settings ---
            'n_layers': params['n_layers'],
            'size': params['size'],
            'learning_rate': params['learning_rate'],
            # --- advantage-estimation settings ---
            'gamma': params['discount'],
            # The config stores the negated flag; flip it for the agent.
            'standardize_advantages': not params['dont_standardize_advantages'],
            'reward_to_go': params['reward_to_go'],
            'nn_baseline': params['nn_baseline'],
            'gae_lambda': params['gae_lambda'],
            # --- training-loop settings ---
            'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'],
        }

        # Keep a reference to the caller's object (not a copy) and extend it.
        self.params = params
        shared = self.params
        shared['agent_class'] = PGAgent
        shared['agent_params'] = agent_params
        shared['batch_size_initial'] = shared['batch_size']

        self.rl_trainer = RL_Trainer(shared)

    def run_training_loop(self):
        """Run ``params['n_iter']`` iterations with the agent's actor.

        The same actor is used both to collect training data and to
        evaluate.
        """
        policy = self.rl_trainer.agent.actor
        self.rl_trainer.run_training_loop(
            self.params['n_iter'],
            collect_policy=policy,
            eval_policy=policy,
        )

In [5]:
## run training

# Echo the log directory first so the run can be located on disk
# even if training is interrupted.
print(args['logdir'])

# Build the trainer from the shared config, then start training.
trainer = PG_Trainer(args)
trainer.run_training_loop()

D:\Code\RL-homework\hw2\data\q3_b40000_r0.005_LunarLanderContinuous-v2_09-09-2022_10-15-11
########################
logging outputs to  D:\Code\RL-homework\hw2\data\q3_b40000_r0.005_LunarLanderContinuous-v2_09-09-2022_10-15-11
########################
Using GPU id 0


********** Iteration 0 ************

Collecting data to be used for training...

Training agent using sampled data from replay buffer...

Beginning logging procedure...

Collecting data for eval...
Eval_AverageReturn : -256.7511291503906
Eval_StdReturn : 126.41618347167969
Eval_MaxReturn : -71.04524993896484
Eval_MinReturn : -397.1037902832031
Eval_AverageEpLen : 117.5
Train_AverageReturn : -325.2880554199219
Train_StdReturn : 158.91587829589844
Train_MaxReturn : -7.089210510253906
Train_MinReturn : -774.9459228515625
Train_AverageEpLen : 107.64247311827957
Train_EnvstepsSoFar : 40043
TimeSinceStart : 66.8566222190857
Training Loss : -0.0015695743495598435
Initial_DataCollection_AverageReturn : -325.2880554199219
Done log

In [6]:
#@markdown You can visualize your runs with tensorboard from within the notebook

## requires tensorflow==2.3.0
# %load_ext tensorboard
# %tensorboard --logdir D:/Code/RL-homework/hw2/data

In [7]:
# %reload_ext tensorboard
