In [1]:
import gym
import random, math
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (16, 10)

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
torch.manual_seed(0)

import base64, io

# For visualization
from gym.wrappers.monitoring import video_recorder
from IPython.display import HTML
from IPython import display
import glob
from reinforce_rwd2go import reinforce_rwd2go, rollout, make_pref_dataset
from utils import pref_save, pref_load

from model import Policy
import pickle

%load_ext autoreload
%autoreload 2

# Run on the first CUDA GPU when PyTorch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
# Bare expression so the notebook displays the chosen device.
device

device(type='cpu')

In [2]:
# Seed every random number generator the experiment touches so that
# Restart Kernel -> Run All gives repeatable results.
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

env = gym.make('CartPole-v0')

# The environment keeps its own random generators, independent of
# torch / numpy / random, so it has to be seeded explicitly as well;
# otherwise initial states differ from run to run despite SEED.
env.action_space.seed(SEED)
try:
    env.reset(seed=SEED)   # Gym releases whose reset() accepts a seed
except TypeError:
    env.seed(SEED)         # older Gym releases that only expose Env.seed()

print('observation space:', env.observation_space)
print('action space:', env.action_space)

observation space: Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)
action space: Discrete(2)


  logger.warn(
  deprecation(
  deprecation(


In [15]:
# Train the first policy (π₁) with no target_reward set.
# NOTE(review): near_max_reward=195 presumably ends training once that score
# is reached (the recorded run printed "Solved 195 at ep 426") — confirm in
# reinforce_rwd2go.
policy1 = Policy().to(device)
opt1 = optim.Adam(policy1.parameters(), lr=1e-2)

# Keyword settings forwarded unchanged to the training routine.
train_kwargs1 = dict(
    n_episodes=2000,
    print_every=100,
    target_reward=None,
    near_max_reward=195,
)

print("=== Training π₁ ===")
scores1 = reinforce_rwd2go(env, policy1, opt1, env, **train_kwargs1)

# Mean of the last (up to) 100 returned scores; reused by later cells.
avg1 = np.mean(scores1[-100:])
print(f"Final π₁ avg100: {avg1:.2f}\n")

=== Training π₁ ===
Ep 100	avg100: 47.94
Ep 200	avg100: 169.07
Ep 300	avg100: 179.38
Ep 400	avg100: 190.14
Solved 195 at ep 426 (avg=195.0)
Final π₁ avg100: 195.04



In [16]:
# Train a second policy (π₂) whose target_reward is half of π₁'s final
# 100-episode average, so the two policies differ in quality.
# NOTE(review): in the run recorded with this notebook the helper printed
# "Reached target 97.5" while the reported avg was 41.2, so the stopping test
# inside reinforce_rwd2go is apparently not based on this avg100 figure —
# confirm which metric it compares against target_reward.
half_target = avg1 / 2.0

policy2 = Policy().to(device)
opt2 = optim.Adam(policy2.parameters(), lr=1e-2)

# Same settings as the π₁ run except for the explicit target_reward.
train_kwargs2 = dict(
    n_episodes=2000,
    print_every=100,
    target_reward=half_target,
    near_max_reward=195,
)

print("=== Training π₂ ===")
scores2 = reinforce_rwd2go(env, policy2, opt2, env, **train_kwargs2)

# Mean of the last (up to) 100 returned scores.
avg2 = np.mean(scores2[-100:])
print(f"Final π₂ avg100: {avg2:.2f}\n")

=== Training π₂ ===
Ep 100	avg100: 39.44
Reached target 97.5 at ep 104 (avg=41.2)
Final π₂ avg100: 41.22



In [18]:
# Number of preference pairs requested from make_pref_dataset; it is also
# embedded in every output filename below.
K = 100

print(f"Generating {K} preference pairs …")
pref_data = make_pref_dataset(policy1, policy2, env, K)
pref_save(pref_data, f"pref_data_{K}.pickle")
print("Finished")

# Save the weights of both policies next to the dataset, so the files that
# belong to the same K can be matched up by name.
checkpoints = (
    (policy1, f"policy1_{K}.pth"),
    (policy2, f"policy2_{K}.pth"),
)
for net, ckpt_path in checkpoints:
    torch.save(net.state_dict(), ckpt_path)

Generating 100 preference pairs …
Mean p1 199.23
Mean p2 81.87
Finished
