In [3]:
import gym
import random, math
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (16, 10)

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
torch.manual_seed(0)

import base64, io

# For visualization
from gym.wrappers.monitoring import video_recorder
from IPython.display import HTML
from IPython import display
import glob
from scripts.reinforce_PPORLHF import reinforce_rwd2go_PPO_RLHF
from scripts.utils import pref_save, pref_load

from scripts.model import getPolicy
import pickle

# Reload imported modules automatically before each cell executes, so edits to
# the project-local `scripts.*` helpers are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Bare trailing expression: echoes the selected device as the cell output.
device

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


device(type='cpu')

In [None]:
# Seed every random number generator this notebook draws from (PyTorch, NumPy
# and the stdlib `random` module) with the same value.
SEED = 0
for seed_rng in (torch.manual_seed, np.random.seed, random.seed):
    seed_rng(SEED)

# NOTE(review): the saved outputs in this notebook (4-dim observations, 2
# actions, evaluation return of 200) and RewardModel(state_size=4) look like a
# CartPole run rather than Pendulum -- confirm ENV_NAME before re-running.
ENV_NAME = 'Pendulum-v1'

# K is interpolated into the checkpoint / preference-data file names loaded
# in later cells.
K = 1000

env = gym.make(ENV_NAME)

observation space size: (4,)
action space size: 2


  logger.warn(
  deprecation(
  deprecation(


In [25]:
# Restore the two previously trained policies for this (K, ENV_NAME) pair.
# map_location remaps the stored tensors onto the active device, so a
# checkpoint that was saved on a GPU still loads on a CPU-only machine.
policy1 = getPolicy(ENV_NAME=ENV_NAME).to(device)
policy1.load_state_dict(
    torch.load(f"policy1_{K}_{ENV_NAME}.pth", map_location=device)
)
policy2 = getPolicy(ENV_NAME=ENV_NAME).to(device)
policy2.load_state_dict(
    torch.load(f"policy2_{K}_{ENV_NAME}.pth", map_location=device)
)

# Preference records consumed by the reward-model training cell as
# (s0, tau_plus, tau_minus) triples.
# NOTE(review): pref_load presumably unpickles this file -- only load
# preference data that comes from a trusted source.
pref_data = pref_load(f"pref_data_{K}_{ENV_NAME}.pickle")

In [26]:
class RewardModel(nn.Module):
    """Two-layer MLP that scores a (state, action) pair with a scalar reward.

    The network consumes the state and the action concatenated along the
    feature axis, so its input width is ``state_size + action_size``.

    Args:
        state_size: Number of features in one state vector.
        action_size: Number of features in one action (1 for a scalar action).
        hidden_size: Width of the single hidden layer.
    """

    def __init__(self, state_size=4, action_size=1, hidden_size=32):
        super(RewardModel, self).__init__()
        self.fc1 = nn.Linear(state_size + action_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 1)

    def forward(self, state):
        """Map concatenated state-action features to a reward.

        Args:
            state: Despite its name (kept so existing callers keep working),
                this is the already-concatenated state-action tensor of shape
                ``(batch, state_size + action_size)``.

        Returns:
            Tensor of shape ``(batch, 1)`` holding one reward per row.
        """
        hidden = F.relu(self.fc1(state))
        return self.fc2(hidden)

    def predict_reward(self, state, action):
        """Score one state-action pair (or a batch of them).

        Both inputs are converted to the dtype and device of the model's own
        weights before they are concatenated, so Python numbers, NumPy
        scalars/arrays and tensors living on another device are all accepted.

        Args:
            state: State features of shape ``(batch, state_size)``; a 1-D
                vector is treated as a batch of one.
            action: Action(s) matching ``state``: a scalar for a single state,
                or anything reshapeable to ``(batch, action_size)``.

        Returns:
            CPU tensor of shape ``(batch, 1)``. Gradients still flow back to
            the model parameters through the returned tensor.
        """
        weight = self.fc1.weight
        # as_tensor avoids the copy-construct warning that torch.tensor()
        # raises when it is handed an existing tensor.
        state = torch.as_tensor(state, dtype=weight.dtype, device=weight.device)
        if state.dim() == 1:
            state = state.unsqueeze(0)

        action = torch.as_tensor(action, dtype=weight.dtype, device=weight.device)
        # One row per state; a lone scalar action becomes shape (1, 1).
        action = action.reshape(state.shape[0], -1)

        state_action = torch.cat((state, action), dim=1)
        reward = self.forward(state_action).cpu()
        return reward

In [27]:
import torch
import torch.nn.functional as F
import copy

# Hyperparameters for fitting the reward model.
lr = 3e-3      # Adam step size
epochs = 10    # number of passes over the preference data

# Reward network sized for 4 state features and a 1-dimensional action.
reward_model = RewardModel(state_size=4, action_size=1)

# Adam over every parameter of the reward network.
optimizer = torch.optim.Adam(params=reward_model.parameters(), lr=lr)

def trajectory_reward(reward_model, states, actions):
    """Sum the predicted reward over every step of one trajectory.

    The accumulator lives on the device of the reward model's own parameters
    (CPU if it has none) instead of a notebook-level global, and each
    per-step reward is moved there before it is added, so the function works
    wherever the model lives.

    Args:
        reward_model: Module exposing ``predict_reward(state, action)`` that
            returns a single-element tensor for a batch of one state.
        states: Sequence of per-step state vectors.
        actions: Sequence of per-step actions, aligned with ``states``.

    Returns:
        0-dim tensor with the summed reward (zero for an empty trajectory).
        It stays attached to the autograd graph of ``reward_model``.
    """
    first_param = next(iter(reward_model.parameters()), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    total_reward = torch.zeros((), device=model_device)
    for s, a in zip(states, actions):
        s_t = torch.as_tensor(s, dtype=torch.float32, device=model_device)
        reward = reward_model.predict_reward(s_t.unsqueeze(0), a)
        # predict_reward may hand back a tensor on another device (e.g. CPU);
        # bring it to the accumulator's device and collapse it to a scalar.
        total_reward = total_reward + reward.to(model_device).reshape(())
    return total_reward

# Fit the reward model on pairwise preferences: for every record the summed
# reward of tau_plus is pushed above that of tau_minus. One optimizer step is
# taken per epoch on the loss summed over all records (full-batch training).
if len(pref_data) == 0:
    raise ValueError("pref_data is empty; cannot train the reward model")

for epoch in range(1, epochs+1):
    pair_losses = []

    for s0, tau_plus, tau_minus in pref_data:
        reward_plus = trajectory_reward(reward_model, tau_plus["states"], tau_plus["actions"])
        reward_minus = trajectory_reward(reward_model, tau_minus["states"], tau_minus["actions"])
        # -log(exp(r+) / (exp(r-) + exp(r+))) == -logsigmoid(r+ - r-).
        # The logsigmoid form never exponentiates the raw summed rewards, so
        # large trajectory rewards cannot overflow into inf/NaN.
        pair_losses.append(-F.logsigmoid(reward_plus - reward_minus))

    total_loss = torch.stack(pair_losses).sum()

    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()

    # .item() yields a plain float, so the logged value does not keep the
    # autograd graph alive.
    avg_loss = total_loss.item() / len(pref_data)

    print(f"Epoch {epoch}/{epochs} — avg loss: {avg_loss:.4f}")

Epoch 1/10 — avg loss: 11.9466
Epoch 2/10 — avg loss: 8.5590
Epoch 3/10 — avg loss: 5.3510
Epoch 4/10 — avg loss: 2.8022
Epoch 5/10 — avg loss: 1.2287
Epoch 6/10 — avg loss: 0.4961
Epoch 7/10 — avg loss: 0.2259
Epoch 8/10 — avg loss: 0.1191
Epoch 9/10 — avg loss: 0.0682
Epoch 10/10 — avg loss: 0.0396


In [28]:
# Start from a fresh copy of the saved policy2 checkpoint so that re-running
# this cell always fine-tunes from the same weights. map_location lets a
# checkpoint saved on a GPU load on a CPU-only machine.
policy2 = getPolicy(ENV_NAME=ENV_NAME).to(device)
policy2.load_state_dict(
    torch.load(f"policy2_{K}_{ENV_NAME}.pth", map_location=device)
)
opt1    = optim.Adam(policy2.parameters(), lr=1e-3)
# Put the reward model in inference mode while it is used to score rollouts.
reward_model.eval()
# Fine-tune policy2 with the project's PPO-RLHF routine, passing the learned
# reward model; the trailing semicolon suppresses the return-value echo.
reinforce_rwd2go_PPO_RLHF(env, policy2, opt1, reward_model, n_episodes=2000);

  if not isinstance(terminated, (bool, np.bool8)):


Ep 100	avg100: 71.72
Ep 200	avg100: 75.56
Ep 300	avg100: 80.45
Ep 400	avg100: 81.71
Ep 500	avg100: 89.27
Ep 600	avg100: 124.07
Ep 700	avg100: 152.19
Ep 800	avg100: 172.60
Ep 900	avg100: 169.34
Ep 1000	avg100: 176.95
Ep 1100	avg100: 178.01
Ep 1200	avg100: 160.78
Ep 1300	avg100: 189.67
Ep 1400	avg100: 183.10
Ep 1500	avg100: 175.11
Ep 1600	avg100: 179.82
Ep 1700	avg100: 175.63
Ep 1800	avg100: 186.14
Ep 1900	avg100: 188.72


In [29]:
# Roll out policy2 for a fixed number of episodes, always taking the
# highest-scoring action, and report the mean undiscounted return.
returns = []
eval_episodes = 100
for ep in range(eval_episodes):
    state = env.reset()
    done = False
    total_r = 0.0
    while not done:
        # Greedy action selection: argmax over the policy output, no sampling.
        with torch.no_grad():
            s_t = torch.tensor(state, dtype=torch.float32, device=device)
            probs = policy2(s_t.unsqueeze(0)).squeeze(0)
            action = probs.argmax().item()
        state, r, done, _ = env.step(action)
        total_r = total_r + r
    returns.append(total_r)

mean_return = sum(returns) / len(returns)
print(f"Evaluation over {eval_episodes} episodes: mean return = {mean_return:.2f}")

Evaluation over 100 episodes: mean return = 200.00
