In [1]:
%matplotlib inline
%config InlineBackend.figure_format = {'svg', 'png'}[0]

In [2]:
import os, copy, time, pickle, uuid
import numpy as np

import matplotlib.pyplot as plt
from matplotlib.ticker import AutoMinorLocator

import torch
from torch.utils.tensorboard import SummaryWriter

import gymnasium as gym

from reinforce_agent import ReinforceAgent
from surrogate import *

# NumPy array printing: 3 fixed decimals, no scientific notation, 150-character lines.
np.set_printoptions(
    linewidth=150,
    floatmode='fixed',
    suppress=True,
    precision=3,
)

## CartPole Demo

In [None]:
# Train a ReinforceAgent (with_baseline=True) on CartPole-v1: roll out one full episode,
# update the agent on that episode, log to TensorBoard, and checkpoint periodically.

# hyper-params
env_name = 'CartPole-v1'
hidden_dims = [256, 256]
gamma = 0.99
v_lr = 1e-2
pi_lr = 1e-3
max_episodes = int(1e4)    # hard cap on the number of training episodes
model_save_interval = 100  # checkpoint every N episodes
return_threshold = 500     # stop once the moving-average return reaches this value
avg_window = 50            # number of most recent episodes in the moving average
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # fall back to CPU when no GPU is available

# env
env = gym.make(env_name)

# agent
assert isinstance(env.action_space, gym.spaces.Discrete), "Only support DISCRETE action space yet."
agent = ReinforceAgent(env.observation_space.shape[0], env.action_space.n, hidden_dims, gamma, v_lr, pi_lr, with_baseline=True, device=device)

# logger
uid = str(uuid.uuid1()).split('-')[0]  # short run id shared by the log and checkpoint paths
logger = SummaryWriter(log_dir=f"./tensorboard/{env_name}/{agent.name}/{uid}")

# training
steps, episodes, returns = 0, 0, []
try:
    while episodes < max_episodes:  # strict '<' so exactly max_episodes episodes are run
        buffer = []
        episode_len = 0
        episode_rew = 0

        obs, _ = env.reset()
        while True:  # one rollout
            act = agent.get_action(obs, deterministic=False)
            next_obs, rew, ter, tru, _ = env.step(act)

            buffer.append((obs, act, rew, next_obs, ter, tru))
            obs = next_obs

            steps += 1
            episode_len += 1
            episode_rew += rew

            if ter or tru:  # terminated or truncated: the episode is over
                break

        episodes += 1

        # transpose the list of transitions into one stacked array per field
        loss_log = agent.update(map(np.stack, zip(*buffer)))

        returns.append(episode_rew)
        # moving average over the latest `avg_window` episodes, INCLUDING the one just finished
        # (the slice simply yields the whole list while fewer episodes are available)
        average_return = np.mean(returns[-avg_window:])

        # verbose
        print(f"{uid} | Steps: {steps} | Episodes: {episodes} | Episode Length: {episode_len} | Episode Reward: {round(episode_rew, 5)} | Average Return: {round(average_return, 5)}")

        # logging
        logger.add_scalar('episodic/return', episode_rew, episodes)
        logger.add_scalar('episodic/length', episode_len, episodes)
        logger.add_scalar('episodic/return(average)', average_return, episodes)
        if loss_log is not None:
            for key, value in loss_log.items():
                logger.add_scalar(key, value, episodes)

        # success requires more than one full window of episodes and an average at the threshold
        solved = len(returns) > avg_window and average_return >= return_threshold

        # save model (periodically, and always for the final successful agent)
        if episodes % model_save_interval == 0 or solved:
            agent.save_model(f'./checkpoints/{uid}/{episodes}/')

        if solved:
            print("Training SUCCESSFUL!")
            break
finally:
    # flush pending TensorBoard events even when the run is interrupted (e.g. KeyboardInterrupt)
    logger.close()

## Position Control Task

In [None]:
# Train a ReinforceAgent (with_baseline=True) on PositionControl-v0: roll out one full episode,
# update the agent on that episode, log to TensorBoard, and checkpoint periodically.

# hyper-params
env_name = 'PositionControl-v0'
hidden_dims = [256, 256]
gamma = 0.99
v_lr = 1e-2
pi_lr = 1e-3
max_episodes = int(1e4)    # hard cap on the number of training episodes
model_save_interval = 100  # checkpoint every N episodes
return_threshold = 0.99    # stop once the moving-average return reaches this value
avg_window = 50            # number of most recent episodes in the moving average
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # fall back to CPU when no GPU is available

# env
env = gym.make(env_name)

# agent
assert isinstance(env.action_space, gym.spaces.Discrete), "Only support DISCRETE action space yet."
agent = ReinforceAgent(env.observation_space.shape[0], env.action_space.n, hidden_dims, gamma, v_lr, pi_lr, with_baseline=True, device=device)

# logger
uid = str(uuid.uuid1()).split('-')[0]  # short run id shared by the log and checkpoint paths
logger = SummaryWriter(log_dir=f"./tensorboard/{env_name}/{agent.name}/{uid}")

# training
steps, episodes, returns = 0, 0, []
try:
    while episodes < max_episodes:  # strict '<' so exactly max_episodes episodes are run
        buffer = []
        episode_len = 0
        episode_rew = 0

        obs, _ = env.reset()
        while True:  # one rollout
            act = agent.get_action(obs, deterministic=False)
            next_obs, rew, ter, tru, _ = env.step(act)

            buffer.append((obs, act, rew, next_obs, ter, tru))
            obs = next_obs

            steps += 1
            episode_len += 1
            episode_rew += rew

            if ter or tru:  # terminated or truncated: the episode is over
                break

        episodes += 1

        # transpose the list of transitions into one stacked array per field
        loss_log = agent.update(map(np.stack, zip(*buffer)))

        returns.append(episode_rew)
        # moving average over the latest `avg_window` episodes, INCLUDING the one just finished
        # (the slice simply yields the whole list while fewer episodes are available)
        average_return = np.mean(returns[-avg_window:])

        # verbose
        print(f"{uid} | Steps: {steps} | Episodes: {episodes} | Episode Length: {episode_len} | Episode Reward: {round(episode_rew, 5)} | Average Return: {round(average_return, 5)}")

        # logging
        logger.add_scalar('episodic/return', episode_rew, episodes)
        logger.add_scalar('episodic/length', episode_len, episodes)
        logger.add_scalar('episodic/return(average)', average_return, episodes)
        if loss_log is not None:
            for key, value in loss_log.items():
                logger.add_scalar(key, value, episodes)

        # success requires more than one full window of episodes and an average at the threshold
        solved = len(returns) > avg_window and average_return >= return_threshold

        # save model (periodically, and always for the final successful agent)
        if episodes % model_save_interval == 0 or solved:
            agent.save_model(f'./checkpoints/{uid}/{episodes}/')

        if solved:
            print("Training SUCCESSFUL!")
            break
finally:
    # flush pending TensorBoard events even when the run is interrupted (e.g. KeyboardInterrupt)
    logger.close()

## Path Following Task

In [None]:
# Train a ReinforceAgent (with_baseline=True) on a path-following environment: roll out one
# full episode, update the agent on that episode, log to TensorBoard, and checkpoint periodically.

# hyper-params
env_name = ['PathFollowingLine-v0', 'PathFollowingEight-v0'][1]  # change the index to switch tasks
hidden_dims = [256, 256]
gamma = 0.9
v_lr = 1e-2
pi_lr = 1e-3
max_episodes = int(1e4)    # hard cap on the number of training episodes
model_save_interval = 100  # checkpoint every N episodes
return_threshold = 95      # stop once the moving-average return reaches this value
avg_window = 50            # number of most recent episodes in the moving average
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # fall back to CPU when no GPU is available

# env
env = gym.make(env_name)

# agent
assert isinstance(env.action_space, gym.spaces.Discrete), "Only support DISCRETE action space yet."
agent = ReinforceAgent(env.observation_space.shape[0], env.action_space.n, hidden_dims, gamma, v_lr, pi_lr, with_baseline=True, device=device)

# logger
uid = str(uuid.uuid1()).split('-')[0]  # short run id shared by the log and checkpoint paths
logger = SummaryWriter(log_dir=f"./tensorboard/{env_name}/{agent.name}/{uid}")

# training
steps, episodes, returns = 0, 0, []
try:
    while episodes < max_episodes:  # strict '<' so exactly max_episodes episodes are run
        buffer = []
        episode_len = 0
        episode_rew = 0

        obs, _ = env.reset()
        while True:  # one rollout
            act = agent.get_action(obs, deterministic=False)
            next_obs, rew, ter, tru, _ = env.step(act)

            buffer.append((obs, act, rew, next_obs, ter, tru))
            obs = next_obs

            steps += 1
            episode_len += 1
            episode_rew += rew

            if ter or tru:  # terminated or truncated: the episode is over
                break

        episodes += 1

        # transpose the list of transitions into one stacked array per field
        loss_log = agent.update(map(np.stack, zip(*buffer)))

        returns.append(episode_rew)
        # moving average over the latest `avg_window` episodes, INCLUDING the one just finished
        # (the slice simply yields the whole list while fewer episodes are available)
        average_return = np.mean(returns[-avg_window:])

        # verbose
        print(f"{uid} | Steps: {steps} | Episodes: {episodes} | Episode Length: {episode_len} | Episode Reward: {round(episode_rew, 5)} | Average Return: {round(average_return, 5)}")

        # logging
        logger.add_scalar('episodic/return', episode_rew, episodes)
        logger.add_scalar('episodic/length', episode_len, episodes)
        logger.add_scalar('episodic/return(average)', average_return, episodes)
        if loss_log is not None:
            for key, value in loss_log.items():
                logger.add_scalar(key, value, episodes)

        # success requires more than one full window of episodes and an average at the threshold
        solved = len(returns) > avg_window and average_return >= return_threshold

        # save model (periodically, and always for the final successful agent)
        if episodes % model_save_interval == 0 or solved:
            agent.save_model(f'./checkpoints/{uid}/{episodes}/')

        if solved:
            print("Training SUCCESSFUL!")
            break
finally:
    # flush pending TensorBoard events even when the run is interrupted (e.g. KeyboardInterrupt)
    logger.close()

## Pose Regulation (Easy-Level) Task

In [8]:
# Train a ReinforceAgent (with_baseline=True) on PoseRegulationEasy-v0: roll out one full episode,
# update the agent on that episode, log to TensorBoard, and checkpoint periodically.
# Progress is tracked as the fraction of recent episodes whose total reward is non-negative.

# hyper-params
env_name = 'PoseRegulationEasy-v0'
hidden_dims = [32]
gamma = 0.99
v_lr = 1e-2
pi_lr = 1e-3
max_episodes = int(5e4)    # hard cap on the number of training episodes
model_save_interval = 100  # checkpoint every N episodes
return_threshold = 0.95    # stop once the moving-average indicator reaches this value
avg_window = 100           # number of most recent episodes in the moving average
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # fall back to CPU when no GPU is available

# env
env = gym.make(env_name)

# agent
assert isinstance(env.action_space, gym.spaces.Discrete), "Only support DISCRETE action space yet."
agent = ReinforceAgent(env.observation_space.shape[0], env.action_space.n, hidden_dims, gamma, v_lr, pi_lr, with_baseline=True, device=device)

# logger
uid = str(uuid.uuid1()).split('-')[0]  # short run id shared by the log and checkpoint paths
logger = SummaryWriter(log_dir=f"./tensorboard/{env_name}/{agent.name}/{uid}")

# training
steps, episodes, returns = 0, 0, []
try:
    while episodes < max_episodes:  # strict '<' so exactly max_episodes episodes are run
        buffer = []
        episode_len = 0
        episode_rew = 0

        obs, _ = env.reset()
        while True:  # one rollout
            act = agent.get_action(obs, deterministic=False)  # if episodes > 100 else env.action_space.sample()
            next_obs, rew, ter, tru, _ = env.step(act)

            buffer.append((obs, act, rew, next_obs, ter, tru))
            obs = next_obs

            steps += 1
            episode_len += 1
            episode_rew += rew

            if ter or tru:  # terminated or truncated: the episode is over
                break

        episodes += 1

        # transpose the list of transitions into one stacked array per field
        loss_log = agent.update(map(np.stack, zip(*buffer)))

        # store a 0/1 indicator (1 when the episode's total reward is non-negative)
        # rather than the raw reward, so the moving average below is a rate in [0, 1]
        returns.append(1 if episode_rew >= 0 else 0)
        # moving average over the latest `avg_window` episodes, INCLUDING the one just finished
        # (the slice simply yields the whole list while fewer episodes are available)
        average_return = np.mean(returns[-avg_window:])

        # verbose
        print(f"{uid} | Steps: {steps} | Episodes: {episodes} | Episode Length: {episode_len} | Episode Reward: {round(episode_rew, 5)} | Average Return: {round(average_return, 5)}")

        # logging
        logger.add_scalar('episodic/return', episode_rew, episodes)
        logger.add_scalar('episodic/length', episode_len, episodes)
        logger.add_scalar('episodic/return(average)', average_return, episodes)
        if loss_log is not None:
            for key, value in loss_log.items():
                logger.add_scalar(key, value, episodes)

        # success requires more than one full window of episodes and an average at the threshold
        solved = len(returns) > avg_window and average_return >= return_threshold

        # save model (periodically, and always for the final successful agent)
        if episodes % model_save_interval == 0 or solved:
            agent.save_model(f'./checkpoints/{uid}/{episodes}/')

        if solved:
            print("Training SUCCESSFUL!")
            break
finally:
    # flush pending TensorBoard events even when the run is interrupted (e.g. KeyboardInterrupt)
    logger.close()

c67b1ae8 | Steps: 30 | Episodes: 1 | Episode Length: 30 | Episode Reward: -0.03 | Average Return: 0.0
c67b1ae8 | Steps: 60 | Episodes: 2 | Episode Length: 30 | Episode Reward: -0.03 | Average Return: 0.0
c67b1ae8 | Steps: 90 | Episodes: 3 | Episode Length: 30 | Episode Reward: -0.03 | Average Return: 0.0
c67b1ae8 | Steps: 120 | Episodes: 4 | Episode Length: 30 | Episode Reward: -0.03 | Average Return: 0.0
c67b1ae8 | Steps: 150 | Episodes: 5 | Episode Length: 30 | Episode Reward: -0.03 | Average Return: 0.0
c67b1ae8 | Steps: 180 | Episodes: 6 | Episode Length: 30 | Episode Reward: -0.03 | Average Return: 0.0
c67b1ae8 | Steps: 210 | Episodes: 7 | Episode Length: 30 | Episode Reward: -0.03 | Average Return: 0.0
c67b1ae8 | Steps: 240 | Episodes: 8 | Episode Length: 30 | Episode Reward: -0.03 | Average Return: 0.0
c67b1ae8 | Steps: 270 | Episodes: 9 | Episode Length: 30 | Episode Reward: -0.03 | Average Return: 0.0
c67b1ae8 | Steps: 300 | Episodes: 10 | Episode Length: 30 | Episode Reward: 

KeyboardInterrupt: 

## Pose Regulation (Middle-Level) Task

In [3]:
# Continue training on PoseRegulationMiddle-v0 from a checkpoint of the easy-level run.
# The step and episode counters resume from the checkpoint's values, so logs and checkpoint
# directories continue the earlier numbering.

# inherited from easy-level
chkpts_dir = './checkpoints/c67b1ae8/18400/'
chkpts_episodes = 18400
chkpts_steps = 513286

# hyper-params
env_name = 'PoseRegulationMiddle-v0'
hidden_dims = [32]
gamma = 0.99
v_lr = 1e-2
pi_lr = 1e-3
# budget of 5e4 additional episodes on top of the inherited EPISODE count
# (the limit is compared against `episodes`, so the step count must not be used here)
max_episodes = int(5e4) + chkpts_episodes
model_save_interval = 100  # checkpoint every N episodes
return_threshold = 0.95    # stop once the moving-average indicator reaches this value
avg_window = 100           # number of most recent episodes in the moving average
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # fall back to CPU when no GPU is available

# env
env = gym.make(env_name)

# agent
assert isinstance(env.action_space, gym.spaces.Discrete), "Only support DISCRETE action space yet."
agent = ReinforceAgent(env.observation_space.shape[0], env.action_space.n, hidden_dims, gamma, v_lr, pi_lr, with_baseline=True, device=device)
agent.load_model(chkpts_dir)  # agent.pi_net.load_state_dict(torch.load(os.path.join(chkpts_dir, 'pi.pth')))

# logger
uid = str(uuid.uuid1()).split('-')[0]  # short run id shared by the log and checkpoint paths
logger = SummaryWriter(log_dir=f"./tensorboard/{env_name}/{agent.name}/{uid}")

# training
steps, episodes, returns = chkpts_steps, chkpts_episodes, []
try:
    while episodes < max_episodes:  # strict '<' so the episode budget is not exceeded by one
        buffer = []
        episode_len = 0
        episode_rew = 0

        obs, _ = env.reset()
        while True:  # one rollout
            act = agent.get_action(obs, deterministic=False)  # if episodes > 500 else env.action_space.sample()
            next_obs, rew, ter, tru, _ = env.step(act)

            buffer.append((obs, act, rew, next_obs, ter, tru))
            obs = next_obs

            steps += 1
            episode_len += 1
            episode_rew += rew

            if ter or tru:  # terminated or truncated: the episode is over
                break

        episodes += 1

        # transpose the list of transitions into one stacked array per field
        loss_log = agent.update(map(np.stack, zip(*buffer)))

        # store a 0/1 indicator (1 when the episode's total reward is non-negative)
        # rather than the raw reward, so the moving average below is a rate in [0, 1]
        returns.append(1 if episode_rew >= 0 else 0)
        # moving average over the latest `avg_window` episodes, INCLUDING the one just finished
        # (the slice simply yields the whole list while fewer episodes are available)
        average_return = np.mean(returns[-avg_window:])

        # verbose
        print(f"{uid} | Steps: {steps} | Episodes: {episodes} | Episode Length: {episode_len} | Episode Reward: {round(episode_rew, 5)} | Average Return: {round(average_return, 5)}")

        # logging
        logger.add_scalar('episodic/return', episode_rew, episodes)
        logger.add_scalar('episodic/length', episode_len, episodes)
        logger.add_scalar('episodic/return(average)', average_return, episodes)
        if loss_log is not None:
            for key, value in loss_log.items():
                logger.add_scalar(key, value, episodes)

        # success requires more than one full window of episodes and an average at the threshold
        solved = len(returns) > avg_window and average_return >= return_threshold

        # save model (periodically, and always for the final successful agent)
        if episodes % model_save_interval == 0 or solved:
            agent.save_model(f'./checkpoints/{uid}/{episodes}/')

        if solved:
            print("Training SUCCESSFUL!")
            break
finally:
    # flush pending TensorBoard events even when the run is interrupted (e.g. KeyboardInterrupt)
    logger.close()

d4aa9738 | Steps: 513311 | Episodes: 18401 | Episode Length: 25 | Episode Reward: 0.976 | Average Return: 1.0
d4aa9738 | Steps: 513336 | Episodes: 18402 | Episode Length: 25 | Episode Reward: 0.976 | Average Return: 1.0
d4aa9738 | Steps: 513396 | Episodes: 18403 | Episode Length: 60 | Episode Reward: -0.06 | Average Return: 0.66667
d4aa9738 | Steps: 513456 | Episodes: 18404 | Episode Length: 60 | Episode Reward: -0.06 | Average Return: 0.5
d4aa9738 | Steps: 513516 | Episodes: 18405 | Episode Length: 60 | Episode Reward: -0.06 | Average Return: 0.4
d4aa9738 | Steps: 513576 | Episodes: 18406 | Episode Length: 60 | Episode Reward: -0.06 | Average Return: 0.33333
d4aa9738 | Steps: 513636 | Episodes: 18407 | Episode Length: 60 | Episode Reward: -0.06 | Average Return: 0.28571
d4aa9738 | Steps: 513663 | Episodes: 18408 | Episode Length: 27 | Episode Reward: 0.974 | Average Return: 0.375
d4aa9738 | Steps: 513694 | Episodes: 18409 | Episode Length: 31 | Episode Reward: 0.97 | Average Return: 0

KeyboardInterrupt: 

## Pose Regulation (Hard-Level) Task

In [5]:
# Continue training on PoseRegulationHard-v0 from a checkpoint of the middle-level run.
# The step and episode counters resume from the checkpoint's values, so logs and checkpoint
# directories continue the earlier numbering.

# inherited from middle-level
chkpts_dir = './checkpoints/d4aa9738/28200/'
chkpts_episodes = 28200
chkpts_steps = 942985

# hyper-params
env_name = 'PoseRegulationHard-v0'
hidden_dims = [32]
gamma = 0.99
v_lr = 1e-2
pi_lr = 1e-3
# budget of 5e4 additional episodes on top of the inherited EPISODE count
# (the limit is compared against `episodes`, so the step count must not be used here)
max_episodes = int(5e4) + chkpts_episodes
model_save_interval = 100  # checkpoint every N episodes
return_threshold = 0.95    # stop once the moving-average indicator reaches this value
avg_window = 100           # number of most recent episodes in the moving average
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # fall back to CPU when no GPU is available

# env
env = gym.make(env_name)

# agent
assert isinstance(env.action_space, gym.spaces.Discrete), "Only support DISCRETE action space yet."
agent = ReinforceAgent(env.observation_space.shape[0], env.action_space.n, hidden_dims, gamma, v_lr, pi_lr, with_baseline=True, device=device)
agent.load_model(chkpts_dir)  # agent.pi_net.load_state_dict(torch.load(os.path.join(chkpts_dir, 'pi.pth')))

# logger
uid = str(uuid.uuid1()).split('-')[0]  # short run id shared by the log and checkpoint paths
logger = SummaryWriter(log_dir=f"./tensorboard/{env_name}/{agent.name}/{uid}")

# training
steps, episodes, returns = chkpts_steps, chkpts_episodes, []
try:
    while episodes < max_episodes:  # strict '<' so the episode budget is not exceeded by one
        buffer = []
        episode_len = 0
        episode_rew = 0

        obs, _ = env.reset()
        while True:  # one rollout
            act = agent.get_action(obs, deterministic=False)  # if episodes > 500 else env.action_space.sample()
            next_obs, rew, ter, tru, _ = env.step(act)

            buffer.append((obs, act, rew, next_obs, ter, tru))
            obs = next_obs

            steps += 1
            episode_len += 1
            episode_rew += rew

            if ter or tru:  # terminated or truncated: the episode is over
                break

        episodes += 1

        # transpose the list of transitions into one stacked array per field
        loss_log = agent.update(map(np.stack, zip(*buffer)))

        # store a 0/1 indicator (1 when the episode's total reward is non-negative)
        # rather than the raw reward, so the moving average below is a rate in [0, 1]
        returns.append(1 if episode_rew >= 0 else 0)
        # moving average over the latest `avg_window` episodes, INCLUDING the one just finished
        # (the slice simply yields the whole list while fewer episodes are available)
        average_return = np.mean(returns[-avg_window:])

        # verbose
        print(f"{uid} | Steps: {steps} | Episodes: {episodes} | Episode Length: {episode_len} | Episode Reward: {round(episode_rew, 5)} | Average Return: {round(average_return, 5)}")

        # logging
        logger.add_scalar('episodic/return', episode_rew, episodes)
        logger.add_scalar('episodic/length', episode_len, episodes)
        logger.add_scalar('episodic/return(average)', average_return, episodes)
        if loss_log is not None:
            for key, value in loss_log.items():
                logger.add_scalar(key, value, episodes)

        # success requires more than one full window of episodes and an average at the threshold
        solved = len(returns) > avg_window and average_return >= return_threshold

        # save model (periodically, and always for the final successful agent)
        if episodes % model_save_interval == 0 or solved:
            agent.save_model(f'./checkpoints/{uid}/{episodes}/')

        if solved:
            print("Training SUCCESSFUL!")
            break
finally:
    # flush pending TensorBoard events even when the run is interrupted (e.g. KeyboardInterrupt)
    logger.close()

5bb24efe | Steps: 943105 | Episodes: 28201 | Episode Length: 120 | Episode Reward: -0.12 | Average Return: 0.0
5bb24efe | Steps: 943119 | Episodes: 28202 | Episode Length: 14 | Episode Reward: 0.987 | Average Return: 0.5
5bb24efe | Steps: 943120 | Episodes: 28203 | Episode Length: 1 | Episode Reward: 1 | Average Return: 0.66667
5bb24efe | Steps: 943201 | Episodes: 28204 | Episode Length: 81 | Episode Reward: 0.92 | Average Return: 0.75
5bb24efe | Steps: 943321 | Episodes: 28205 | Episode Length: 120 | Episode Reward: -0.12 | Average Return: 0.6
5bb24efe | Steps: 943441 | Episodes: 28206 | Episode Length: 120 | Episode Reward: -0.12 | Average Return: 0.5
5bb24efe | Steps: 943486 | Episodes: 28207 | Episode Length: 45 | Episode Reward: 0.956 | Average Return: 0.57143
5bb24efe | Steps: 943606 | Episodes: 28208 | Episode Length: 120 | Episode Reward: -0.12 | Average Return: 0.5
5bb24efe | Steps: 943726 | Episodes: 28209 | Episode Length: 120 | Episode Reward: -0.12 | Average Return: 0.4444

KeyboardInterrupt: 