In [2]:
!pip install "gymnasium==0.28.1"
!pip install stable_baselines3
!pip install swig
!pip install gymnasium[box2d]

Collecting gymnasium==0.28.1
  Downloading gymnasium-0.28.1-py3-none-any.whl.metadata (9.2 kB)
Collecting jax-jumpy>=1.0.0 (from gymnasium==0.28.1)
  Downloading jax_jumpy-1.0.0-py3-none-any.whl.metadata (15 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium==0.28.1)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-0.28.1-py3-none-any.whl (925 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m925.5/925.5 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Downloading jax_jumpy-1.0.0-py3-none-any.whl (20 kB)
Installing collected packages: farama-notifications, jax-jumpy, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.28.1 jax-jumpy-1.0.0
Collecting stable_baselines3
  Downloading stable_baselines3-2.3.2-py3-none-any.whl.metadata (5.1 kB)
Downloading stable_baselines3-2.3.2-py3-none-any.whl (182 kB)
[2K   [90m━━━━━━━━━━━━

In [3]:
import os
import json

import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt

from stable_baselines3 import TD3
from stable_baselines3.common import results_plotter
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.results_plotter import load_results, ts2xy, plot_results
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.common.callbacks import BaseCallback

In [4]:
import torch

# Pick the compute device: the GPU when PyTorch reports CUDA as available,
# the CPU otherwise. The availability flag is printed for the reader.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(torch.cuda.is_available())

True


  and should_run_async(code)


# TD3 on LunarLanderContinuous-v2 with stable_baselines3

In [5]:
class SaveOnBestTrainingRewardCallback(BaseCallback):
    """
    Callback that saves the model whenever the mean training reward improves
    and that records, at every step, the step reward and the actor/critic
    losses currently held by the logger.

    Every ``check_freq`` calls the results found in ``log_dir`` are reloaded,
    the mean of the last (up to) 100 recorded values is computed, and the
    model is saved to ``<log_dir>/best_model`` if that mean is a new best.

    :param check_freq: Number of callback calls between two checks.
    :param log_dir: Directory holding the training logs; the best model is
        saved inside it.
    :param verbose: Verbosity level; progress is printed when >= 1.
    """
    def __init__(self, check_freq: int, log_dir: str, verbose: int = 1):
        super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, "best_model")
        self.best_mean_reward = -np.inf
        # One entry per callback call (i.e. per training step).
        self.actor_losses = []
        self.critic_losses = []
        self.rewards_per_step = []
        # Running totals for the episode currently in progress.
        self.episode_reward = 0
        self.num_steps_in_episode = 0

    def _init_callback(self) -> None:
        # Only the parent directory has to exist: the model is written as a
        # file at `save_path` (Stable-Baselines3 appends ".zip"), so creating
        # a directory named `save_path` would only leave an empty folder that
        # shadows the un-suffixed path.
        parent_dir = os.path.dirname(self.save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    def _on_step(self) -> bool:
        # Reward of the step that was just taken (first environment only).
        reward = self.locals['rewards'][0]
        self.episode_reward += reward
        self.num_steps_in_episode += 1

        # Record reward per step
        self.rewards_per_step.append(reward)

        if self.n_calls % self.check_freq == 0:
            # Reload the logged training results; `y` presumably holds one
            # reward per finished episode (Monitor log) — the mean below is
            # therefore taken over the last 100 entries, not the last 100 steps.
            x, y = ts2xy(load_results(self.log_dir), "timesteps")
            if len(x) > 0:
                mean_reward = np.mean(y[-100:])
                if self.verbose >= 1:
                    print(f"Num timesteps: {self.num_timesteps}")
                    # NOTE: the label reads "per step" but the value is the mean computed above.
                    print(f"Best mean reward: {self.best_mean_reward:.2f} - Last mean reward per step: {mean_reward:.2f}")

                # Save model if a new best reward is found
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    if self.verbose >= 1:
                        print(f"Saving new best model to {self.save_path}")
                    self.model.save(self.save_path)

        # Read the most recent losses from the logger. `.get` is used instead
        # of indexing so that a missing key is recorded as 0.0 without
        # inserting a spurious entry into the logger's own dictionary.
        logged_values = self.logger.name_to_value
        self.actor_losses.append(logged_values.get('train/actor_loss', 0.0))
        self.critic_losses.append(logged_values.get('train/critic_loss', 0.0))

        # Start the per-episode accumulators afresh once the episode has ended,
        # so they describe the current episode rather than the whole run.
        dones = self.locals.get('dones')
        if dones is not None and dones[0]:
            self.episode_reward = 0
            self.num_steps_in_episode = 0

        return True

In [7]:
# Train one TD3 agent per seed and dump the per-step metrics of each run to JSON.
# `results` is rebound on every iteration, so after the loop it holds the
# metrics of the last seed only.
results = {}

seeds = [1, 2, 3, 4, 5]
callbacks = []
log_dirs = []
models = []

# Training budget (environment steps) for each seed; identical for all runs.
timesteps = 1e5

for seed in seeds:
    # Set log directory
    log_dir = f"tmp{seed}/"
    os.makedirs(log_dir, exist_ok=True)

    # Create and wrap the environment
    env = gym.make("LunarLanderContinuous-v2")
    env.reset(seed=seed)
    env = Monitor(env, log_dir)

    # Add action noise for exploration
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

    # Initialize model. The seed is passed to the algorithm as well, so the run
    # is seeded on the model side and not only through the environment reset.
    # `device` is the torch device selected earlier in the notebook, which
    # avoids a hard failure on machines without CUDA.
    model = TD3(
        "MlpPolicy",
        env,
        action_noise=action_noise,
        verbose=0,
        seed=seed,
        device=device,
    )

    # Create callback to save model and track losses
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)

    callbacks.append(callback)
    log_dirs.append(log_dir)
    models.append(model)

    # Train the model
    model.learn(total_timesteps=int(timesteps), callback=callback)

    # Collect the training metrics for this seed as plain floats (JSON-serialisable)
    results = {
        "actor_losses": [float(loss) for loss in callback.actor_losses],
        "critic_losses": [float(loss) for loss in callback.critic_losses],
        "rewards_per_step": [float(reward) for reward in callback.rewards_per_step],
    }

    # Save the metrics to a JSON file after each seed
    with open(f"training_results_seed{seed}.json", "w") as f:
        json.dump(results, f, indent=4)

    print(f"Les résultats de l'entraînement pour seed={seed} ont été enregistrés dans training_results_seed{seed}.json.")

Num timesteps: 1000
Best mean reward: -inf - Last mean reward per step: -545.48
Saving new best model to tmp1/best_model
Num timesteps: 2000
Best mean reward: -545.48 - Last mean reward per step: -429.08
Saving new best model to tmp1/best_model
Num timesteps: 3000
Best mean reward: -429.08 - Last mean reward per step: -396.68
Saving new best model to tmp1/best_model
Num timesteps: 4000
Best mean reward: -396.68 - Last mean reward per step: -377.72
Saving new best model to tmp1/best_model
Num timesteps: 5000
Best mean reward: -377.72 - Last mean reward per step: -371.27
Saving new best model to tmp1/best_model
Num timesteps: 6000
Best mean reward: -371.27 - Last mean reward per step: -358.07
Saving new best model to tmp1/best_model
Num timesteps: 7000
Best mean reward: -358.07 - Last mean reward per step: -347.85
Saving new best model to tmp1/best_model
Num timesteps: 8000
Best mean reward: -347.85 - Last mean reward per step: -336.57
Saving new best model to tmp1/best_model
Num timeste

In [10]:
# Second batch of runs: train one TD3 agent per seed and dump the per-step
# metrics of each run to JSON. `results_part2` collects the metrics of every
# seed of this batch, keyed by seed.
results_part2 = {}

seeds = [6, 7, 8, 9, 10]
callbacks = []
log_dirs = []
models = []

# Training budget (environment steps) for each seed; identical for all runs.
timesteps = 1e5

for seed in seeds:
    # Set log directory
    log_dir = f"tmp{seed}/"
    os.makedirs(log_dir, exist_ok=True)

    # Create and wrap the environment
    env = gym.make("LunarLanderContinuous-v2")
    env.reset(seed=seed)
    env = Monitor(env, log_dir)

    # Add action noise for exploration
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

    # Initialize model. The seed is passed to the algorithm as well, so the run
    # is seeded on the model side and not only through the environment reset.
    # `device` is the torch device selected earlier in the notebook, which
    # avoids a hard failure on machines without CUDA.
    model = TD3(
        "MlpPolicy",
        env,
        action_noise=action_noise,
        verbose=0,
        seed=seed,
        device=device,
    )

    # Create callback to save model and track losses
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)

    callbacks.append(callback)
    log_dirs.append(log_dir)
    models.append(model)

    # Train the model
    model.learn(total_timesteps=int(timesteps), callback=callback)

    # Collect the training metrics for this seed as plain floats (JSON-serialisable)
    results = {
        "actor_losses": [float(loss) for loss in callback.actor_losses],
        "critic_losses": [float(loss) for loss in callback.critic_losses],
        "rewards_per_step": [float(reward) for reward in callback.rewards_per_step],
    }
    # Keep this seed's metrics in the batch-level dictionary, which was
    # previously declared but never filled.
    results_part2[seed] = results

    # Save the metrics to a JSON file after each seed
    with open(f"training_results_seed{seed}.json", "w") as f:
        json.dump(results, f, indent=4)

    print(f"Les résultats de l'entraînement pour seed={seed} ont été enregistrés dans training_results_seed{seed}.json.")

Num timesteps: 1000
Best mean reward: -inf - Last mean reward per step: -783.99
Saving new best model to tmp10/best_model
Num timesteps: 2000
Best mean reward: -783.99 - Last mean reward per step: -688.45
Saving new best model to tmp10/best_model
Num timesteps: 3000
Best mean reward: -688.45 - Last mean reward per step: -599.44
Saving new best model to tmp10/best_model
Num timesteps: 4000
Best mean reward: -599.44 - Last mean reward per step: -585.61
Saving new best model to tmp10/best_model
Num timesteps: 5000
Best mean reward: -585.61 - Last mean reward per step: -541.33
Saving new best model to tmp10/best_model
Num timesteps: 6000
Best mean reward: -541.33 - Last mean reward per step: -490.16
Saving new best model to tmp10/best_model
Num timesteps: 7000
Best mean reward: -490.16 - Last mean reward per step: -458.31
Saving new best model to tmp10/best_model
Num timesteps: 8000
Best mean reward: -458.31 - Last mean reward per step: -441.28
Saving new best model to tmp10/best_model
Num