In [None]:
import gym
import random, math
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
# Default size (width, height in inches) for every figure created in this notebook.
plt.rcParams['figure.figsize'] = (16, 10)

import torch
import torch.optim as optim
# Seed PyTorch's global RNG as soon as torch is imported.
# NOTE(review): the configuration cell seeds torch again from SEED (also 0),
# so this call is redundant and could be dropped.
torch.manual_seed(0)

from scripts.reinforce_rwd2go import make_pref_dataset
from scripts.utils import pref_save

from scripts.model import getPolicy
from scripts.reinforce_rwd2go import reinforce_rwd2go_2

# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `scripts` package) are picked up before each cell
# runs, without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Prefer the first CUDA GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device  # bare last expression so the notebook displays the selected device

In [None]:
# Single seed shared by every random number generator used in this notebook.
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

ENV_NAME = 'CartPole-v0'

env = gym.make(ENV_NAME)
# The environment owns RNGs that the three calls above do not reach, so seed
# them too; otherwise rollouts differ between runs despite SEED being fixed.
# Older Gym releases expose Env.seed(); newer ones removed it in favour of
# reset(seed=...), hence the guard.
if callable(getattr(env, "seed", None)):
    env.seed(SEED)
env.action_space.seed(SEED)

# Number of preference pairs generated in the last cell of the notebook.
K = 10

In [None]:
# Fresh policy for ENV_NAME, moved to the selected device, with an Adam optimizer.
policy = getPolicy(ENV_NAME=ENV_NAME).to(device)
opt = optim.Adam(policy.parameters(), lr=1e-2)

print("=== Training π₁ to get π₁ and π₂ ===")
# The trainer returns the per-episode scores followed by four step markers,
# which are unpacked in the order it returns them.
# NOTE(review): presumably the steps at which each policy checkpoint was saved —
# confirm against scripts.reinforce_rwd2go.
(
    scores,
    step_policy2_point,
    step_policy2_mean,
    step_policy2_both,
    step_policy1,
) = reinforce_rwd2go_2(
    env,
    policy,
    opt,
    n_episodes=2000,
    print_every=100,
    near_max_reward=195,
)

In [None]:
import matplotlib.pyplot as plt

def plot_rewards(scores, policy1_step=None, policy2_step_mean=None, policy2_step_point=None, policy2_step_both=None, save_path=None):
    """Plot the reward curve and mark where each policy was saved.

    Parameters
    ----------
    scores : sequence of numbers
        Total reward of each episode, in order. Episodes are numbered from 1
        on the x-axis.
    policy1_step, policy2_step_mean, policy2_step_point, policy2_step_both : number or None
        x-axis position of the dashed vertical marker for the corresponding
        saved policy. A marker is drawn only when its value is not None.
    save_path : str or path-like, optional
        File the figure is written to. When omitted, the figure is saved as
        ``PoliciesGeneration_{ENV_NAME}.jpg`` using the notebook-level
        ``ENV_NAME``, which is the original behaviour.

    Returns
    -------
    None
        The figure is saved to disk and shown.
    """
    episodes = list(range(1, len(scores) + 1))

    fig, ax = plt.subplots()
    ax.plot(episodes, scores)

    # One row per optional marker: (x position, line colour, legend label).
    markers = (
        (policy1_step, 'red', "Policy 1 saved"),
        (policy2_step_mean, 'green', "Policy 2 with mean saved"),
        (policy2_step_point, 'blue', "Policy 2 with current saved"),
        (policy2_step_both, 'orange', "Policy 2 with both saved"),
    )
    for step, color, label in markers:
        if step is not None:
            ax.axvline(step, color=color, linestyle='--', label=label)

    ax.set_xlabel("Episode")
    ax.set_ylabel("Total Reward")
    ax.set_title("Rewards evolution per episode")

    # Only the markers carry labels; with none drawn, legend() would just emit
    # a "no artists with labels" warning, so it is skipped.
    handles, _ = ax.get_legend_handles_labels()
    if handles:
        ax.legend()

    fig.tight_layout()
    if save_path is None:
        save_path = f"PoliciesGeneration_{ENV_NAME}.jpg"
    fig.savefig(save_path)
    plt.show()

# Draw the training curve with a marker for each step value returned by training.
plot_rewards(
    scores,
    policy1_step=step_policy1,
    policy2_step_mean=step_policy2_mean,
    policy2_step_point=step_policy2_point,
    policy2_step_both=step_policy2_both,
)

In [None]:
# Checkpoint files (state dicts) loaded in the next cell to rebuild the two policies.
# NOTE(review): presumably written by the training run above — confirm the file
# names against scripts.reinforce_rwd2go.
POLICY1_NAME = "policies/policy1.pth"
POLICY2_NAME = "policies/policy2_with_both.pth"

In [None]:
# Rebuild both policies from their checkpoints. map_location=device remaps the
# stored tensors onto the device selected at the top of the notebook, so a
# checkpoint saved on a GPU still loads on a CPU-only machine (and vice versa).
policy1 = getPolicy(ENV_NAME=ENV_NAME).to(device)
policy1.load_state_dict(torch.load(POLICY1_NAME, map_location=device))

policy2 = getPolicy(ENV_NAME=ENV_NAME).to(device)
policy2.load_state_dict(torch.load(POLICY2_NAME, map_location=device))

print(f"Generating {K} preference pairs …")
# Build the preference dataset from the two policies in `env`, then save it
# under a file name that records both K and the environment.
# NOTE(review): assumes the pref_data/ directory exists unless pref_save creates it.
pref_data = make_pref_dataset(policy1, policy2, env, K)
pref_save(pref_data, f"pref_data/pref_data_{K}_{ENV_NAME}.pickle")
print("Finished")