In [1]:
### Resources:

# Conjugate Gradient: https://en.wikipedia.org/wiki/Conjugate_gradient_method

# TRPO: https://spinningup.openai.com/en/latest/algorithms/trpo.html#id9

# TRPO Paper: https://arxiv.org/abs/1502.05477

In [2]:
import os
import gym
import torch
from torch import nn
import numpy as np
from spinup import trpo_tf1 as trpo
from spinup.utils.run_utils import setup_logger_kwargs

from models import MLPActorCritic, MLPActorDiscrete
from train import TRPOTrainer

In [3]:
### Test policy copying
#
# Checks that flattening one actor's parameters and loading them into a second
# actor (a) makes both networks produce the same output and (b) leaves the
# second actor untouched when the first one is modified afterwards.

obs_dim, act_dim = 3, 2
hidden_sizes, hidden_acts = [64, 64], nn.Tanh

# Two separately constructed actors with the same architecture.
actor_1 = MLPActorDiscrete(obs_dim, act_dim, hidden_sizes, hidden_acts)
actor_2 = MLPActorDiscrete(obs_dim, act_dim, hidden_sizes, hidden_acts)

# Test that they're different initially
# (the probe is called `obs_sample`, not `input`, so the builtin `input()` is
# not shadowed for the rest of the kernel session)
obs_sample = torch.randn(obs_dim)
print('Before copying:\n')
print(f'Output 1 = {actor_1.net(obs_sample)}\t', f'Output 2 = {actor_2.net(obs_sample)}\n\n')

# Copy the new policy into the prev one
params_flat = nn.utils.parameters_to_vector(actor_1.net.parameters())
nn.utils.vector_to_parameters(params_flat, actor_2.net.parameters())
out_1, out_2 = actor_1.net(obs_sample), actor_2.net(obs_sample)
print('After policy copy:\n')
print(f'Output 1 = {out_1}\t', f'Output 2 = {out_2}\n\n')
assert torch.allclose(out_1, out_2), 'actor_2 must reproduce actor_1 after the copy'
# Snapshot of the copied policy's output, used below to prove it stays unchanged.
out_copied = out_2.detach().clone()

# Update weights of new policy after deep copy
params_new = torch.randn_like(params_flat)
nn.utils.vector_to_parameters(params_new, actor_1.net.parameters())
out_1, out_2 = actor_1.net(obs_sample), actor_2.net(obs_sample)
print('After weight update for new policy:\n')
print(f'Output 1 = {out_1}\t', f'Output 2 = {out_2}\n\n')
assert torch.allclose(out_2, out_copied), 'copied policy must not follow later updates of actor_1'
assert not torch.allclose(out_1, out_2), 'actor_1 should differ after receiving new random weights'

Before copying:

Output 1 = tensor([-0.0695,  0.3696], grad_fn=<AddBackward0>)	 Output 2 = tensor([ 0.0169, -0.1111], grad_fn=<AddBackward0>)


After policy copy:

Output 1 = tensor([-0.0695,  0.3696], grad_fn=<AddBackward0>)	 Output 2 = tensor([-0.0695,  0.3696], grad_fn=<AddBackward0>)


After weight update for new policy:

Output 1 = tensor([-0.9688, -4.7047], grad_fn=<AddBackward0>)	 Output 2 = tensor([-0.0695,  0.3696], grad_fn=<AddBackward0>)




In [4]:
### Test automatic gradient calculation using autograd

# Initialize actor
obs_dim, act_dim = 3, 2
hidden_sizes, hidden_acts = [64, 64], nn.Tanh
actor = MLPActorDiscrete(obs_dim, act_dim, hidden_sizes, hidden_acts)

# Calculate the KL-Divergence
# The batch width is taken from obs_dim so it cannot drift from the actor's input size.
obs = np.random.randn(400, obs_dim)
obs_t = torch.as_tensor(obs, dtype=torch.float32)  # converted once, reused for every call below
act = actor.forward(obs_t)
# `logp` is not read again in this cell; the call is kept because log_prob_grad
# may populate state on the actor that kl_divergence_grad relies on — TODO confirm.
logp = actor.log_prob_grad(obs_t, torch.as_tensor(act, dtype=torch.float32))
kl = actor.kl_divergence_grad(obs_t)
print(kl, kl.requires_grad, '\n\n')

# Calculate the gradient of KL w.r.t. model parameters
print('\nFlattening gradients:\n')
# create_graph=True keeps the gradient differentiable so it can be differentiated again later.
grad = torch.autograd.grad(kl, actor.net.parameters(), create_graph=True)
grad = torch.cat([g.flatten() for g in grad])
x_vec = torch.ones_like(grad)

def get_grad_2(grad_flat: torch.Tensor, x: torch.Tensor, actor: MLPActorDiscrete, retain=False):
    """Differentiate the scalar ``dot(grad_flat, x)`` w.r.t. the actor's network parameters.

    Args:
        grad_flat: 1-D tensor that is still attached to an autograd graph
            reaching ``actor.net``'s parameters.
        x: 1-D tensor of the same length, dotted with ``grad_flat``.
        actor: object exposing ``net.parameters()``.
        retain: forwarded as ``retain_graph``; pass a truthy value when the
            graph behind ``grad_flat`` will be differentiated again.

    Returns:
        The tuple produced by ``torch.autograd.grad`` — one tensor per
        parameter of ``actor.net``.
    """
    directional = grad_flat.dot(x)
    return torch.autograd.grad(directional, actor.net.parameters(), retain_graph=retain)

# Evaluate the same second-order product three times. The graph is retained on
# every pass except the last one, so the repeated backward passes stay legal.
num_passes = 3
for pass_idx in range(num_passes):
    keep_graph = pass_idx != num_passes - 1
    grad_2 = get_grad_2(grad, x_vec, actor, retain=keep_graph)
    print(len(grad_2), grad_2[0][:6], '\n\n')


tensor(0., grad_fn=<MeanBackward0>) True 



Flattening gradients:

6 tensor([[-2.3952e-04,  7.7479e-03,  6.5625e-03],
        [ 3.2518e-05, -1.0357e-03, -6.2778e-04],
        [-1.6083e-04, -9.1544e-04, -4.8676e-04],
        [ 1.0323e-03,  6.2841e-03,  5.8161e-03],
        [ 3.0933e-05,  5.0234e-03,  3.2694e-03],
        [ 2.2753e-04, -4.3374e-03, -3.6200e-03]]) 


6 tensor([[-2.3952e-04,  7.7479e-03,  6.5625e-03],
        [ 3.2518e-05, -1.0357e-03, -6.2778e-04],
        [-1.6083e-04, -9.1544e-04, -4.8676e-04],
        [ 1.0323e-03,  6.2841e-03,  5.8161e-03],
        [ 3.0933e-05,  5.0234e-03,  3.2694e-03],
        [ 2.2753e-04, -4.3374e-03, -3.6200e-03]]) 


6 tensor([[-2.3952e-04,  7.7479e-03,  6.5625e-03],
        [ 3.2518e-05, -1.0357e-03, -6.2778e-04],
        [-1.6083e-04, -9.1544e-04, -4.8676e-04],
        [ 1.0323e-03,  6.2841e-03,  5.8161e-03],
        [ 3.0933e-05,  5.0234e-03,  3.2694e-03],
        [ 2.2753e-04, -4.3374e-03, -3.6200e-03]]) 




In [14]:
### Test entropy calculation for Categorical distribution

# Initialize actor
obs_dim, act_dim = 3, 2
hidden_sizes, hidden_acts = [64, 64], nn.Tanh
actor = MLPActorDiscrete(obs_dim, act_dim, hidden_sizes, hidden_acts)

# Calculate the Entropy of the policy
# Batch shape is derived from named sizes so it stays in sync with the actor.
n_obs = 40
obs = np.random.randn(n_obs, obs_dim)
obs_t = torch.as_tensor(obs, dtype=torch.float32)  # converted once, reused below
act = actor.forward(obs_t)
logp = actor.log_prob_grad(obs_t, torch.as_tensor(act, dtype=torch.float32))
# `actor.pi` is read right after log_prob_grad; presumably that call sets the
# distribution for the current batch — TODO confirm against the actor class.
entropy = actor.pi.entropy()
assert entropy.shape == (n_obs,), 'expected one entropy value per observation'

print(f'First 5 elements of entropy {entropy[:5]}\n', f'Entropy shape {entropy.shape}\n', f'Entropy mean {entropy.mean()}')

First 5 elements of entropy tensor([0.6818, 0.6860, 0.6663, 0.6868, 0.6928], grad_fn=<SliceBackward>)
 Entropy shape torch.Size([40])
 Entropy mean 0.6889864206314087


In [5]:
### Test manual parameter updates for NNs

mlp_net = nn.Sequential(nn.Linear(2, 6), nn.ReLU(), nn.Linear(6, 1))
# Detached snapshot of the starting weights. It gets its own name so the
# post-update check below still has a baseline to compare against.
params_initial = nn.utils.parameters_to_vector(mlp_net.parameters()).detach()
print(f'Initial parameters:\n{params_initial}\n\n')

# Add offset to parameters and update them
offset_flat = torch.ones_like(params_initial)
nn.utils.vector_to_parameters(params_initial + offset_flat, mlp_net.parameters())
params_flat = nn.utils.parameters_to_vector(mlp_net.parameters())
print(f'Parameters after update:\n{params_flat.detach()}\n\n')

# The point of the cell: every weight must have moved by exactly the offset.
assert torch.allclose(params_flat.detach(), params_initial + offset_flat), \
    'parameters were not shifted by the requested offset'

Initial parameters:
tensor([-0.2241, -0.4701, -0.0195,  0.2474,  0.2873, -0.3175,  0.4951,  0.1695,
        -0.0775,  0.5563, -0.3664,  0.1876,  0.5165, -0.0506, -0.1947,  0.0188,
        -0.2806, -0.5027, -0.1864, -0.2142, -0.3837,  0.2707,  0.3780, -0.3621,
        -0.1718])


Parameters after update:
tensor([0.7759, 0.5299, 0.9805, 1.2474, 1.2873, 0.6825, 1.4951, 1.1695, 0.9225,
        1.5563, 0.6336, 1.1876, 1.5165, 0.9494, 0.8053, 1.0188, 0.7194, 0.4973,
        0.8136, 0.7858, 0.6163, 1.2707, 1.3780, 0.6379, 0.8282])




In [6]:
### Define parameters needed for training

# Zero-argument factory; the environment is only created when a trainer calls it.
env_fn = lambda: gym.make('CartPole-v1')

# AC module parameters
hidden_sizes_actor, hidden_sizes_critic = [64, 64], [64, 64]
hidden_acts_actor, hidden_acts_critic = nn.Tanh, nn.Tanh
ac_kwargs = dict(hidden_sizes_actor=hidden_sizes_actor,
                 hidden_sizes_critic=hidden_sizes_critic,
                 hidden_acts_actor=hidden_acts_actor,
                 hidden_acts_critic=hidden_acts_critic)

# Training parameters (shared by both training runs further down)
seed = 0
steps_per_epoch, max_ep_len, epochs = 4000, 1000, 50
gamma, lam = 0.99, 0.97
# surr_obj_min is only passed to the custom trainer — its meaning is defined there.
delta, surr_obj_min = 0.01, 0.003
vf_lr, train_v_iters = 1e-3, 80
damping_coeff, cg_iters = 0.1, 10
backtrack_iters, backtrack_coeff = 10, 0.8
save_freq = 10

# Experiment parameters
# The output root can be redirected with the TRPO_DATA_DIR environment variable,
# so the notebook also runs on other machines. The fallback is the original
# location, which keeps existing runs unaffected.
data_dir = os.environ.get('TRPO_DATA_DIR', '/home/sherif/user/python/DRL/data/trpo')
exp_name_spinup = 'trpo_spinup_discrete'
exp_name_custom = 'trpo_custom_discrete'

In [7]:
### Perform training experiment with the Spinup TRPO implementation

logger_kwargs = setup_logger_kwargs(exp_name=exp_name_spinup, data_dir=data_dir)

# Keyword arguments for the reference implementation, gathered in one place.
# Only the actor's hidden sizes are passed through as the network configuration.
spinup_kwargs = dict(
    ac_kwargs=dict(hidden_sizes=hidden_sizes_actor),
    seed=seed,
    steps_per_epoch=steps_per_epoch,
    epochs=epochs,
    gamma=gamma,
    delta=delta,
    vf_lr=vf_lr,
    train_v_iters=train_v_iters,
    damping_coeff=damping_coeff,
    cg_iters=cg_iters,
    backtrack_iters=backtrack_iters,
    backtrack_coeff=backtrack_coeff,
    lam=lam,
    max_ep_len=max_ep_len,
    logger_kwargs=logger_kwargs,
    save_freq=save_freq,
)
trpo(env_fn, **spinup_kwargs)

[32;1mLogging data to /home/sherif/user/python/DRL/data/trpo/trpo_spinup_discrete/progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            64,
            64
        ]
    },
    "actor_critic":	"mlp_actor_critic",
    "algo":	"trpo",
    "backtrack_coeff":	0.8,
    "backtrack_iters":	10,
    "cg_iters":	10,
    "damping_coeff":	0.1,
    "delta":	0.01,
    "env_fn":	"<function <lambda> at 0x748f1a9ae730>",
    "epochs":	50,
    "exp_name":	"trpo_spinup_discrete",
    "gamma":	0.99,
    "lam":	0.97,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x748f1a99e198>":	{
            "epoch_dict":	{},
            "exp_name":	"trpo_spinup_discrete",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"/home/sherif/user/python/DRL/data/trpo/trpo_spinup_discrete",
            "output_file":	{
                "<_io.TextIOWrapper name='/home/sherif/user/python/DRL



[32;1mAccepting new params at step 0 of line search.[0m
---------------------------------------
|             Epoch |               0 |
|      AverageEpRet |            26.1 |
|          StdEpRet |            15.1 |
|          MaxEpRet |              91 |
|          MinEpRet |              10 |
|             EpLen |            26.1 |
|      AverageVVals |         -0.0227 |
|          StdVVals |           0.446 |
|          MaxVVals |            1.51 |
|          MinVVals |            -1.4 |
| TotalEnvInteracts |           4e+03 |
|            LossPi |        9.78e-09 |
|             LossV |             375 |
|       DeltaLossPi |         -0.0223 |
|        DeltaLossV |            -214 |
|                KL |         0.00906 |
|    BacktrackIters |               0 |
|              Time |            1.23 |
---------------------------------------
[32;1mAccepting new params at step 0 of line search.[0m
---------------------------------------
|             Epoch |               1 |
|   

In [7]:
### Start the training process using the custom TRPO implementation

logger_kwargs = setup_logger_kwargs(exp_name=exp_name_custom, data_dir=data_dir)

# Keyword arguments for the custom trainer, gathered in one place.
custom_kwargs = dict(
    ac=MLPActorCritic,
    ac_kwargs=ac_kwargs,
    seed=seed,
    steps_per_epoch=steps_per_epoch,
    epochs=epochs,
    gamma=gamma,
    delta=delta,
    surr_obj_min=surr_obj_min,
    vf_lr=vf_lr,
    train_v_iters=train_v_iters,
    damping_coeff=damping_coeff,
    cg_iters=cg_iters,
    backtrack_iters=backtrack_iters,
    backtrack_coeff=backtrack_coeff,
    lam=lam,
    max_ep_len=max_ep_len,
    logger_kwargs=logger_kwargs,
    save_freq=save_freq,
)

trainer = TRPOTrainer()
trainer.train_mod(env_fn, **custom_kwargs)



[32;1mLogging data to /home/sherif/user/python/DRL/data/trpo/trpo_custom_discrete/progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac":	"MLPActorCritic",
    "ac_kwargs":	{
        "hidden_acts_actor":	"Tanh",
        "hidden_acts_critic":	"Tanh",
        "hidden_sizes_actor":	[
            64,
            64
        ],
        "hidden_sizes_critic":	[
            64,
            64
        ]
    },
    "ac_mod":	{
        "MLPActorCritic(\n  (actor): MLPActorDiscrete(\n    (net): Sequential(\n      (hidden_1): Linear(in_features=4, out_features=64, bias=True)\n      (activation_1): Tanh()\n      (hidden_2): Linear(in_features=64, out_features=64, bias=True)\n      (activation_2): Tanh()\n      (output): Linear(in_features=64, out_features=2, bias=True)\n    )\n  )\n  (critic): MLPCritic(\n    (net): Sequential(\n      (hidden_1): Linear(in_features=4, out_features=64, bias=True)\n      (activation_1): Tanh()\n      (hidden_2): Linear(in_features=64, out_features=64, bias=True)\n   