### 0. Set-up

In [1]:
# imports
# bridge env (Gabriel's)
import copy
import time
import os
import wandb
import numpy as np
import pickle
from discrete_blocks import discrete_block as Block
from relative_single_agent import SACSupervisorSparse,generous_reward,punitive_reward,modular_reward
from discrete_simulator import DiscreteSimulator as Sim, Transition
import discrete_graphics as gr

# # rlhf
# import random
# from imitation.algorithms import preference_comparisons
# from imitation.rewards.reward_nets import BasicRewardNet
# from imitation.util.networks import RunningNorm
# from imitation.util.util import make_vec_env
# from imitation.policies.base import FeedForward32Policy, NormalizeFeaturesExtractor
# import gymnasium as gym
# from stable_baselines3 import PPO
# import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# constants
USE_WANDB = False
%env "WANDB_NOTEBOOK_NAME" "rlhf_main.ipynb"

env: "WANDB_NOTEBOOK_NAME"="rlhf_main.ipynb"


In [3]:
# blocks
hexagon = Block([[1,0,0],[1,1,1],[1,1,0],[0,2,1],[0,1,0],[0,1,1]],muc=0.5)
linkr = Block([[0,0,0],[0,1,1],[1,0,0],[1,0,1],[1,1,1],[0,1,0]],muc=0.5) 
linkl = Block([[0,0,0],[0,1,1],[1,0,0],[0,1,0],[0,0,1],[-1,1,1]],muc=0.5) 
linkh = Block([[0,0,0],[0,1,1],[1,0,0],[-1,2,1],[0,1,0],[0,2,1]],muc=0.5)
#target = Block([[0,0,1],[1,0,1]])
target = Block([[0,0,1]])

In [4]:
# config
config = {'train_n_episodes':100,
            'train_l_buffer':200,
            'ep_batch_size':32,
            'ep_use_mask':True,
            'agent_discount_f':0.1, # 1-gamma
            'agent_last_only':True,
            'reward': 'modular',
            'torch_device':'cpu',
            'SEnc_n_channels':64,
            'SEnc_n_internal_layer':2,
            'SEnc_stride':1,
            'SEnc_order_insensitive':True,
            'SAC_n_fc_layer':3,
            'SAC_n_neurons':128,
            'SAC_batch_norm':True,
            'Q_duel':True,
            'opt_lr':1e-4,
            'opt_pol_over_val': 1,
            'opt_tau': 5e-4,
            'opt_weight_decay':0.0001,
            'opt_exploration_factor':0.001,
            'agent_exp_strat':'softmax',
            'agent_epsilon':0.05, # not needed in sac
            'opt_max_norm': 2,
            'opt_target_entropy':1.8,
            'opt_value_clip':False,
            'opt_entropy_penalty':False,
            'opt_Q_reduction': 'min',
            'V_optimistic':False,
            'reward_failure':-1,
            'reward_action':{'Ph': -0.2, 'L':-0.1},
            'reward_closer':0.4,
            'reward_nsides': 0.1,
            'reward_success':1,
            'reward_opposite_sides':0,
            'opt_lower_bound_Vt':-2,
            'gap_range':[2,6]
            }

In [5]:
# Create Gym (env + agent)
from single_agent_gym import ReplayDiscreteGymSupervisor

gym = ReplayDiscreteGymSupervisor(config,
              agent_type=SACSupervisorSparse,
              use_wandb=USE_WANDB,
              actions= ['Ph'], # place-hold only necessary action
              block_type=[hexagon],
              random_targets='random_gap', 
              targets_loc=[[2,0],[6,0]], 
              n_robots=2, 
              max_blocks = 10,
              targets=[target]*2,
              max_interfaces = 50,
              log_freq = 5,
              maxs = [9,6]) # grid size

In [6]:
# Create Reward Model
from rlhf_reward_model import RewardLinear

reward_model = RewardLinear()

In [7]:
# Create Fragmenter
from rlhf_fragmenter import RandomFragmenter

fragmenter = RandomFragmenter()

In [9]:
# Create Preference Gatherer (human/synthetic)
from rlhf_preference_gatherer import SyntheticPreferenceGatherer

gatherer = SyntheticPreferenceGatherer(config)

In [10]:
# Create Preference Model
from rlhf_preference_model import PreferenceModel

preference_model = PreferenceModel(reward_model)

ImportError: cannot import name 'Transitions' from 'typing' (/home/sabri/Anaconda3/envs/sycamore/lib/python3.9/typing.py)

In [None]:
# Create Reward Trainer
from rlhf_reward_trainer import LinearRewardTrainer

reward_trainer = LinearRewardTrainer(preference_model)

In [None]:
# Create Preference Comparisons, the main interface
from rlhf_preference_comparisons import PreferenceComparisons

pref_comparisons = PreferenceComparisons(
    gym,
    reward_model,
    num_iterations=5,  # Set to 60 for better performance
    fragmenter=fragmenter,
    preference_gatherer=gatherer,
    reward_trainer=reward_trainer,
    transition_oversampling=1,
    initial_comparison_frac=0.1,
    allow_variable_horizon=False,
    initial_epoch_multiplier=4,
    query_schedule="hyperbolic",
)

In [1]:
# Original code
rng = np.random.default_rng(0)

venv = make_vec_env("Pendulum-v1", rng=rng)

reward_net = BasicRewardNet(
    venv.observation_space, venv.action_space, normalize_input_layer=RunningNorm
)

fragmenter = preference_comparisons.RandomFragmenter(
    warning_threshold=0,
    rng=rng,
)
gatherer = preference_comparisons.SyntheticGatherer(rng=rng)
preference_model = preference_comparisons.PreferenceModel(reward_net)
reward_trainer = preference_comparisons.BasicRewardTrainer(
    preference_model=preference_model,
    loss=preference_comparisons.CrossEntropyRewardLoss(),
    epochs=3,
    rng=rng,
)


# Several hyperparameters (reward_epochs, ppo_clip_range, ppo_ent_coef,
# ppo_gae_lambda, ppo_n_epochs, discount_factor, use_sde, sde_sample_freq,
# ppo_lr, exploration_frac, num_iterations, initial_comparison_frac,
# initial_epoch_multiplier, query_schedule) used in this example have been
# approximately fine-tuned to reach a reasonable level of performance.
agent = PPO(
    policy=FeedForward32Policy,
    policy_kwargs=dict(
        features_extractor_class=NormalizeFeaturesExtractor,
        features_extractor_kwargs=dict(normalize_class=RunningNorm),
    ),
    env=venv,
    seed=0,
    n_steps=2048 // venv.num_envs,
    batch_size=64,
    ent_coef=0.01,
    learning_rate=2e-3,
    clip_range=0.1,
    gae_lambda=0.95,
    gamma=0.97,
    n_epochs=10,
)

trajectory_generator = preference_comparisons.AgentTrainer(
    algorithm=agent,
    reward_fn=reward_net,
    venv=venv,
    exploration_frac=0.05,
    rng=rng,
)

pref_comparisons = preference_comparisons.PreferenceComparisons(
    trajectory_generator,
    reward_net,
    num_iterations=5,  # Set to 60 for better performance
    fragmenter=fragmenter,
    preference_gatherer=gatherer,
    reward_trainer=reward_trainer,
    fragment_length=100,
    transition_oversampling=1,
    initial_comparison_frac=0.1,
    allow_variable_horizon=False,
    initial_epoch_multiplier=4,
    query_schedule="hyperbolic",
)

NameError: name 'np' is not defined

### 1. Reward Model Training

In [2]:
pref_comparisons.train(
    total_timesteps=5_000,
    total_comparisons=200,
)

Query schedule: [20, 51, 41, 34, 29, 25]
Collecting 40 fragments (4000 transitions)
Requested 3800 transitions but only 0 in buffer. Sampling 3800 additional transitions.
Sampling 200 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 20 comparisons


Training reward model: 100%|██████████| 12/12 [00:00<00:00, 17.19it/s]

Training agent for 1000 timesteps





---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 200      |
|    agent/rollout/ep_rew_mean         | -1.2e+03 |
|    agent/rollout/ep_rew_wrapped_mean | 29.5     |
|    agent/time/fps                    | 3775     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
------------------------------------------------------
| mean/                                   |          |
|    agent/rollout/ep_len_mean            | 200      |
|    agent/rollout/ep_rew_mean            | -1.2e+03 |
|    agent/rollout/ep_rew_wrapped_mean    | 29.5     |
|    agent/time/fps                       | 3.78e+03 |
|    agent/time/iterations                | 1        |
|    agent/time/time_elapsed              | 0        |
|    agent/time/total_timesteps         

Training reward model: 100%|██████████| 3/3 [00:00<00:00,  3.50it/s]

Training agent for 1000 timesteps





-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 200          |
|    agent/rollout/ep_rew_mean         | -1.14e+03    |
|    agent/rollout/ep_rew_wrapped_mean | 33.5         |
|    agent/time/fps                    | 2438         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 4096         |
|    agent/train/approx_kl             | 0.0011954873 |
|    agent/train/clip_fraction         | 0.0603       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -1.43        |
|    agent/train/explained_variance    | -0.284       |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0554       |
|    agent/train/n_updates             | 10           |
|    agent/train/policy_gradient_loss  | -0.0014

Training reward model: 100%|██████████| 3/3 [00:00<00:00,  3.63it/s]

Training agent for 1000 timesteps





-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 200          |
|    agent/rollout/ep_rew_mean         | -1.17e+03    |
|    agent/rollout/ep_rew_wrapped_mean | 35           |
|    agent/time/fps                    | 3588         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 6144         |
|    agent/train/approx_kl             | 0.0018702098 |
|    agent/train/clip_fraction         | 0.0724       |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -1.43        |
|    agent/train/explained_variance    | 0.67         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.037        |
|    agent/train/n_updates             | 20           |
|    agent/train/policy_gradient_loss  | -0.0030

Training reward model: 100%|██████████| 3/3 [00:01<00:00,  2.62it/s]

Training agent for 1000 timesteps





-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 200          |
|    agent/rollout/ep_rew_mean         | -1.19e+03    |
|    agent/rollout/ep_rew_wrapped_mean | 34.4         |
|    agent/time/fps                    | 2599         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 8192         |
|    agent/train/approx_kl             | 0.0021171272 |
|    agent/train/clip_fraction         | 0.115        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -1.43        |
|    agent/train/explained_variance    | 0.882        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00993      |
|    agent/train/n_updates             | 30           |
|    agent/train/policy_gradient_loss  | -0.0049

Training reward model: 100%|██████████| 3/3 [00:01<00:00,  2.38it/s]

Training agent for 1000 timesteps





-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 200          |
|    agent/rollout/ep_rew_mean         | -1.21e+03    |
|    agent/rollout/ep_rew_wrapped_mean | 33.6         |
|    agent/time/fps                    | 3353         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 0            |
|    agent/time/total_timesteps        | 10240        |
|    agent/train/approx_kl             | 0.0025484716 |
|    agent/train/clip_fraction         | 0.151        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -1.42        |
|    agent/train/explained_variance    | 0.931        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.00995      |
|    agent/train/n_updates             | 40           |
|    agent/train/policy_gradient_loss  | -0.0081

Training reward model: 100%|██████████| 3/3 [00:01<00:00,  2.00it/s]

Training agent for 1000 timesteps





-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 200        |
|    agent/rollout/ep_rew_mean         | -1.2e+03   |
|    agent/rollout/ep_rew_wrapped_mean | 32.3       |
|    agent/time/fps                    | 4051       |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 0          |
|    agent/time/total_timesteps        | 12288      |
|    agent/train/approx_kl             | 0.00394712 |
|    agent/train/clip_fraction         | 0.208      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -1.43      |
|    agent/train/explained_variance    | 0.94       |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | 0.00143    |
|    agent/train/n_updates             | 50         |
|    agent/train/policy_gradient_loss  | -0.0119    |
|    agent/train/std        

{'reward_loss': 0.1524055983339037, 'reward_accuracy': 0.9241071428571429}

In [3]:
from imitation.rewards.reward_wrapper import RewardVecEnvWrapper

learned_reward_venv = RewardVecEnvWrapper(venv, reward_net.predict_processed)

### 2. Agent Training on Learned Reward

In [4]:
learner = PPO(
    seed=0,
    policy=FeedForward32Policy,
    policy_kwargs=dict(
        features_extractor_class=NormalizeFeaturesExtractor,
        features_extractor_kwargs=dict(normalize_class=RunningNorm),
    ),
    env=learned_reward_venv,
    batch_size=64,
    ent_coef=0.01,
    n_epochs=10,
    n_steps=2048 // learned_reward_venv.num_envs,
    clip_range=0.1,
    gae_lambda=0.95,
    gamma=0.97,
    learning_rate=2e-3,
)
learner.learn(1_000)  # Note: set to 100_000 to train a proficient expert

<stable_baselines3.ppo.ppo.PPO at 0x7f59b0460f10>

### 3. Evaluate Agent

In [5]:
from stable_baselines3.common.evaluation import evaluate_policy

n_eval_episodes = 10
reward_mean, reward_std = evaluate_policy(learner.policy, venv, n_eval_episodes)
reward_stderr = reward_std / np.sqrt(n_eval_episodes)
print(f"Reward: {reward_mean:.0f} +/- {reward_stderr:.0f}")

Reward: -1339 +/- 117
