In [None]:
from email import policy
import gym
from platformdirs import user_desktop_dir
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback


def main(env_id='MountainCarContinuous-v0', total_timesteps=20_000, seed=None):
    """Train a PPO agent, checkpointing and evaluating it during training.

    Args:
        env_id: Gym environment id used for both the training and the
            evaluation environment.
        total_timesteps: Number of environment steps passed to ``model.learn``.
        seed: Optional seed forwarded to ``make_vec_env`` and ``PPO``. The
            default ``None`` leaves the run unseeded, exactly as before.

    Returns:
        Tuple ``(checkpoint_callback, eval_callback, model, env)`` where
        ``env`` is the vectorized training environment.

    Side effects:
        Writes checkpoints and the final model under ``./policies/``, the
        best evaluated model under ``./best_model/`` and evaluation logs
        under ``./eval_logs/``.
    """
    env = make_vec_env(env_id, n_envs=1, seed=seed)

    # NOTE(review): batch_size (256) is larger than the 8 transitions gathered
    # per rollout (n_steps * n_envs) — confirm this is intended.
    model = PPO(
        policy='MlpPolicy',
        env=env,
        n_steps=8,
        gae_lambda=0.9,
        gamma=0.9999,
        batch_size=256,
        n_epochs=10,
        ent_coef=0.00429,
        clip_range=0.1,
        max_grad_norm=5,
        vf_coef=0.19,
        use_sde=True,
        seed=seed,
    )

    # save a checkpoint every 10k steps
    checkpoint_callback = CheckpointCallback(
        save_freq=10000,
        save_path='./policies/',
        name_prefix='ppo_mountain_ctn'
    )

    # Separate, non-vectorized environment used only for periodic evaluation.
    eval_env = gym.make(env_id, new_step_api=True)
    eval_callback = EvalCallback(
        eval_env,
        best_model_save_path='./best_model/',
        log_path='./eval_logs/',
        eval_freq=5000,
        deterministic=True,
        render=False
    )

    model.learn(
        total_timesteps=total_timesteps,
        callback=[checkpoint_callback, eval_callback]
    )

    # Final weights are stored next to the intermediate checkpoints.
    model.save('./policies/ppo_mountain_ctn_final')

    return checkpoint_callback, eval_callback, model, env


# Run training when this code executes as the main module and keep the
# callbacks, trained model and training env returned by main() as
# module-level names.
# NOTE: `env` is rebound further down by `env = gym.make(ENV_NAME)`.
if __name__ == '__main__':
    checkpoint_cb, eval_cb, trained_model, env = main()


Eval num_timesteps=5000, episode_reward=77.62 +/- 0.79
Episode length: 299.20 +/- 3.97
New best mean reward!
Eval num_timesteps=10000, episode_reward=-99.90 +/- 0.00
Episode length: 999.00 +/- 0.00
Eval num_timesteps=15000, episode_reward=-99.90 +/- 0.00
Episode length: 999.00 +/- 0.00
Eval num_timesteps=20000, episode_reward=90.82 +/- 0.12
Episode length: 102.20 +/- 1.17
New best mean reward!


In [None]:
# Final model, stored by main() via model.save('./policies/ppo_mountain_ctn_final').
policy1 = PPO.load('./policies/ppo_mountain_ctn_final')

In [None]:
# Intermediate checkpoint; presumably the file written by the
# CheckpointCallback in main() (name_prefix='ppo_mountain_ctn',
# save_freq=10000) — verify the file exists before loading.
policy2 = PPO.load('./policies/ppo_mountain_ctn_10000_steps')

In [23]:
def rollout(policy_model, env, seed, max_t=1000):
    """Run one deterministic episode and record what happened.

    Args:
        policy_model: Object exposing ``predict(obs, deterministic=True)``
            that returns ``(action, state)``.
        env: Environment exposing ``reset(seed=...)`` and ``step(action)``;
            both the 4-tuple and the 5-tuple ``step`` results are accepted.
        seed: Seed handed to ``env.reset``.
        max_t: Upper bound on the number of steps taken.

    Returns:
        Dict with the visited ``"states"``, chosen ``"actions"``, per-step
        ``"rewards"`` and their sum under ``"total_reward"``.
    """
    reset_result = env.reset(seed=seed)
    # reset() may hand back (obs, info); keep only the observation.
    obs = reset_result[0] if isinstance(reset_result, tuple) else reset_result

    states, actions, rewards = [], [], []
    episode_return = 0

    for _step in range(max_t):
        action, _ = policy_model.predict(obs, deterministic=True)
        states.append(obs)
        actions.append(action)

        outcome = env.step(action)
        if len(outcome) == 5:
            # 5-tuple step API: the episode ends on termination or truncation.
            obs, reward, terminated, truncated, info = outcome
            finished = terminated or truncated
        else:
            obs, reward, finished, info = outcome

        rewards.append(reward)
        episode_return += reward
        if finished:
            break

    return {
        "states": states,
        "actions": actions,
        "rewards": rewards,
        "total_reward": episode_return,
    }


In [25]:
def _preference_probability(r1, r2):
    """Return exp(r1) / (exp(r1) + exp(r2)) without overflow or 0/0."""
    # The ratio equals the logistic function of (r1 - r2). Only a
    # non-positive number is ever exponentiated, so math.exp cannot overflow
    # and the denominator is always at least 1.
    diff = r1 - r2
    if diff >= 0:
        return 1.0 / (1.0 + math.exp(-diff))
    scaled = math.exp(diff)
    return scaled / (1.0 + scaled)


def make_pref_dataset(pi1, pi2, env, K=1000, seed=None):
    """Build K preference-labelled trajectory pairs from two policies.

    For each pair both policies are rolled out from the same environment
    seed. The trajectory of ``pi1`` is marked as preferred with probability
    ``exp(r1) / (exp(r1) + exp(r2))``, where ``r1`` and ``r2`` are the total
    rewards of the two rollouts; otherwise the trajectory of ``pi2`` is.

    Args:
        pi1: First policy, passed to ``rollout``.
        pi2: Second policy, passed to ``rollout``.
        env: Environment both policies are rolled out in.
        K: Number of pairs to generate.
        seed: Optional seed for the environment seeds and preference draws.
            ``None`` (default) uses the global ``random`` module as before.

    Returns:
        List of ``(initial_state, preferred_trajectory, other_trajectory)``
        tuples. The mean total reward of each policy is printed as a side
        effect.
    """
    # random.Random offers the same randint()/random() API as the module.
    rng = random if seed is None else random.Random(seed)

    data = []
    rewards_p1 = []
    rewards_p2 = []
    for _ in range(K):
        # Same seed for both rollouts so the pair shares its initial state.
        env_seed = rng.randint(0, 10000000)
        tau_1 = rollout(pi1, env, env_seed)
        tau_2 = rollout(pi2, env, env_seed)
        r1, r2 = tau_1["total_reward"], tau_2["total_reward"]
        rewards_p1.append(r1)
        rewards_p2.append(r2)
        p1 = _preference_probability(r1, r2)

        initial_state = tau_1["states"][0]

        if rng.random() < p1:
            data.append((initial_state, tau_1, tau_2))
        else:
            data.append((initial_state, tau_2, tau_1))

    print("Mean p1", np.mean(rewards_p1))
    print("Mean p2", np.mean(rewards_p2))
    return data

In [3]:
import random, math
import numpy as np
from utils import pref_save, pref_load
import torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
import torch.nn.functional as F

In [103]:
# Number of preference pairs to generate and the environment to roll out in.
K = 100
ENV_NAME = 'MountainCarContinuous-v0'

# Fresh environment for the rollouts; this rebinds the `env` name that the
# training cell assigned from main().
env = gym.make(ENV_NAME)

print(f"Generating {K} preference pairs …")
# Each entry is (initial_state, preferred_trajectory, other_trajectory).
pref_data = make_pref_dataset(policy1, policy2, env, K)
# pref_save comes from the project-local `utils` module; presumably it pickles
# the list to the given path — TODO confirm.
pref_save(pref_data, f"pref_data_{K}_{ENV_NAME}.pickle")
print("Finished")

# Save the state_dict of each loaded model's `.policy` attribute, tagged with
# K and the environment name.
torch.save(policy1.policy.state_dict(), f"policy1_{K}_{ENV_NAME}.pth")
torch.save(policy2.policy.state_dict(), f"policy2_{K}_{ENV_NAME}.pth")

Generating 100 preference pairs …
Mean p1 90.80465787488896
Mean p2 -99.89999999999856
Finished
