In [59]:
import gym
import random, math
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (16, 10)

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
torch.manual_seed(0)

import base64, io

# For visualization
from gym.wrappers.monitoring import video_recorder
from IPython.display import HTML
from IPython import display
import glob
from reinforce_rwd2go import reinforce_rwd2go, rollout, make_pref_dataset
from utils import pref_save, pref_load

from model import getPolicy
import pickle

%load_ext autoreload
%autoreload 2

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


device(type='cpu')

In [None]:
# Seed every RNG the notebook draws from (torch, numpy, stdlib random).
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# NOTE(review): the output saved with this cell (observation shape (4,), 2
# actions) and the 195 reward threshold used by the training cells look like
# they come from a CartPole run, not Pendulum — confirm the intended environment.
ENV_NAME = 'Pendulum-v1'

env = gym.make(ENV_NAME)

# Number of preference pairs passed to make_pref_dataset further down.
K = 1000

print('observation space size:', env.observation_space.shape)
# Only discrete action spaces expose `.n`; continuous (Box) spaces such as
# Pendulum's do not, so fall back to the space's shape instead of raising
# AttributeError.
action_space = env.action_space
print('action space size:',
      action_space.n if hasattr(action_space, 'n') else action_space.shape)

observation space size: (4,)
action space size: 2


In [61]:
# Build the first policy π₁ and its Adam optimiser.
policy1 = getPolicy(ENV_NAME=ENV_NAME)
policy1 = policy1.to(device)
opt1 = optim.Adam(policy1.parameters(), lr=1e-2)

print("=== Training π₁ ===")
# No target_reward is given for π₁; the same `env` object is passed for both
# environment arguments of reinforce_rwd2go.
scores1 = reinforce_rwd2go(
    env,
    policy1,
    opt1,
    env,
    n_episodes=2000,
    print_every=100,
    target_reward=None,
    near_max_reward=195,
    gamma=0.95,
)

# Mean score over the last (up to) 100 recorded episodes.
last_scores1 = scores1[-100:]
avg1 = np.mean(last_scores1)
print(f"Final π₁ avg100: {avg1:.2f}\n")

=== Training π₁ ===


  if not isinstance(terminated, (bool, np.bool8)):


Ep 100	avg100: 45.39
Ep 200	avg100: 125.62
Ep 300	avg100: 94.05
Solved 195 at ep 397 (avg=195.3)
Final π₁ avg100: 195.31



In [62]:
# Build the second policy π₂ and its Adam optimiser.
policy2 = getPolicy(ENV_NAME=ENV_NAME)
policy2 = policy2.to(device)
opt2 = optim.Adam(policy2.parameters(), lr=1e-2)

# π₂'s target is half of π₁'s final 100-episode average (avg1 comes from the
# π₁ training cell). Presumably training stops once the target is reached —
# verify against reinforce_rwd2go.
half_target = avg1 / 2.0

print("=== Training π₂ ===")
# gamma is not passed here, so reinforce_rwd2go's default applies.
scores2 = reinforce_rwd2go(
    env,
    policy2,
    opt2,
    env,
    n_episodes=2000,
    print_every=100,
    target_reward=half_target,
    near_max_reward=195,
)

# Mean score over the last (up to) 100 recorded episodes.
last_scores2 = scores2[-100:]
avg2 = np.mean(last_scores2)
print(f"Final π₂ avg100: {avg2:.2f}\n")

=== Training π₂ ===
Ep 100	avg100: 45.97
Reached target 97.7 at ep 104 (avg=48.1)
Final π₂ avg100: 48.11



In [63]:

# Build K preference pairs from the two trained policies and persist them.
print(f"Generating {K} preference pairs …")
pref_data = make_pref_dataset(policy1, policy2, env, K)
pref_path = f"pref_data_{K}_{ENV_NAME}.pickle"
pref_save(pref_data, pref_path)
print("Finished")

# Save both policies' weights; K and ENV_NAME are embedded in the file names.
policy1_weights = policy1.state_dict()
torch.save(policy1_weights, f"policy1_{K}_{ENV_NAME}.pth")
policy2_weights = policy2.state_dict()
torch.save(policy2_weights, f"policy2_{K}_{ENV_NAME}.pth")

Generating 1000 preference pairs …
Mean p1 199.793
Mean p2 72.225
Finished
