In [1]:
%pip install imageio imageio-ffmpeg pygame numerize pathlib casadi stable-baselines3 tensorboard "stable-baselines3[extra]"  pyvirtualdisplay ipywidgets --quiet
%pip install "gymnasium[other]" --quiet

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import copy
import gymnasium as gym
import stable_baselines3
from SimulationConfigLoader import SimulationLoader
from Simulation import MapEntity, Map, ArticulatedVehicle, Simulation
from ParkingEnv import ParkingEnv
import random
import Visualization
from casadi import cos, sin, tan
from typing import Any, SupportsFloat
from stable_baselines3 import PPO, SAC
import torch
import os
from IPython.display import HTML, display
from numerize import numerize
from stable_baselines3.common.vec_env import SubprocVecEnv, DummyVecEnv
import platform
from IPython.display import clear_output

  from pkg_resources import resource_stream, resource_exists


In [3]:
# Directory passed to the Monitor wrapper and used as tensorboard_log; clean_logs() wipes and recreates it.
log_dir = "logs"

#### Funções de treinamento e avaliação

In [4]:
def evaluate_model(model: PPO, iterations: int = 10):
    """Run several evaluation episodes and collect their total rewards.

    Args:
        model: Trained model handed to ``run_episode`` for every rollout.
        iterations: Number of episodes to run.

    Returns:
        A list with one total reward per episode, in execution order.
    """
    # Each episode receives its own pseudo-random integer seed in [0, 1000).
    return [run_episode(model, int(random.random() * 1000)) for _ in range(iterations)]
    
def run_episode_and_save_video(model):
    """Run one deterministic episode and record every rendered frame to ``simulation.mp4``.

    Args:
        model: Object exposing ``predict(observation, deterministic=True)``.

    Returns:
        The sum of the rewards collected during the episode, as a float.
    """
    video_recorder = Visualization.VideoRecorder("simulation.mp4", fps=10)
    env = None
    try:
        env = ParkingEnv()
        observation, info = env.reset()
        total_reward = 0.0

        while True:
            action, _ = model.predict(observation, deterministic=True)
            observation, reward, terminated, truncated, info = env.step(action)
            total_reward += float(reward)
            video_recorder.append(env.render())
            if terminated or truncated:
                break
    finally:
        # Release the recorder and the environment even when the rollout fails,
        # keeping the original close order (recorder first, then environment).
        try:
            video_recorder.close()
        finally:
            if env is not None:
                env.close()

    return total_reward

def run_episode(model, seed = None):
    """Run one deterministic episode in a fresh ``ParkingEnv`` and return its total reward.

    Args:
        model: Object exposing ``predict(observation, deterministic=True)``.
        seed: Forwarded as the only positional argument to ``ParkingEnv``.
            NOTE(review): presumably seeds scenario generation - confirm against ParkingEnv.

    Returns:
        The sum of the rewards collected until the episode terminates or is truncated.
    """
    env = ParkingEnv(seed)
    try:
        observation, info = env.reset()
        total_reward = 0.0

        while True:
            action, _ = model.predict(observation, deterministic=True)
            observation, reward, terminated, truncated, info = env.step(action)
            total_reward += float(reward)
            if terminated or truncated:
                break
    finally:
        # Always release the environment, even if predict/step raises.
        env.close()

    return total_reward


from stable_baselines3.common.vec_env import SubprocVecEnv, DummyVecEnv
import platform

# Default number of environment copies created by make_vector_env().
N_ENVS = 8

def make_env(seed: int = 0):
    """Build a zero-argument factory that creates one monitored ``ParkingEnv``.

    Args:
        seed: Seed passed to the first ``reset`` of the environment. It also
            names the environment's monitor file, so each copy should get a
            distinct value.

    Returns:
        A callable that builds, wraps, resets and returns the environment.
    """
    def _init():
        env = ParkingEnv()
        # Give every environment its own monitor file (<log_dir>/<seed>.monitor.csv).
        # Passing the bare directory makes all copies resolve to the same
        # monitor.csv, so parallel workers overwrite each other's episode logs.
        monitor_path = os.path.join(log_dir, str(seed))
        env = stable_baselines3.common.monitor.Monitor(env, monitor_path)
        env.reset(seed=seed)
        return env
    return _init

def make_vector_env(n_envs: int = N_ENVS, use_subproc: bool | None = None):
    """Build a vectorized environment made of ``n_envs`` monitored copies.

    Each copy is created by ``make_env`` with its index as seed. When
    ``use_subproc`` is ``None`` the backend is chosen from the platform:
    ``SubprocVecEnv`` everywhere except Windows, where it can be unstable
    (BrokenPipeError/EOFError, especially inside notebooks) and
    ``DummyVecEnv`` is used instead. If creating the ``SubprocVecEnv``
    fails, the function falls back to ``DummyVecEnv``.
    """
    factories = []
    for rank in range(n_envs):
        factories.append(make_env(seed=rank))

    # Resolve the automatic backend choice.
    wants_subproc = use_subproc
    if wants_subproc is None:
        wants_subproc = platform.system() != "Windows"

    if wants_subproc:
        try:
            return SubprocVecEnv(factories)
        except Exception as e:
            print(f"SubprocVecEnv failed ({e}), falling back to DummyVecEnv.")

    return DummyVecEnv(factories)




def train_sac(model: SAC | None = None, total_timesteps: int = 10000, save_every: int | None = None, save_path: str | None = None, save_name: str | None = None):
    """
        Train a SAC (Soft Actor-Critic) model on the vectorized parking environment.

        Args:
            model: SAC model to keep training. If None, a new model is created.
            total_timesteps: Number of timesteps to train for in this call.
            save_every: Number of timesteps between checkpoints. If None, training
                runs as a single ``learn`` call.
            save_path: Directory where checkpoints are written.
            save_name: Checkpoint file name, also used as the TensorBoard run name.
                Checkpoints are only written when both save_path and save_name are given.
        Returns:
            The trained SAC model.
        Raises:
            ValueError: If save_every is given but is not positive.
    """
    # Validate before building the environment so no worker is started for a bad call.
    if save_every is not None and save_every <= 0:
        raise ValueError(f"save_every must be positive, got {save_every}")

    # Create environment
    env = make_vector_env()

    # Neural network hyperparameters.
    # According to the project README the observation has 19 dimensions
    # (3 angles, 14 raycasts, 2 goal values).
    # SAC is an actor-critic method: 'pi' is the policy (actor) network and
    # 'qf' the Q-function (critic) network. The actor uses two hidden layers
    # of 128 units and the critic two hidden layers of 256 units.
    policy_kwargs = dict(activation_fn=torch.nn.ReLU,
                         net_arch=dict(pi=[128, 128], qf=[256, 256]))

    # Instantiate the SAC model with the hyperparameters below
    if model is None:
        model = SAC(
            policy="MlpPolicy",          # policy architecture (MLP for vector observations)
            env=env,                     # Gymnasium-compatible environment
            policy_kwargs=policy_kwargs, # custom network architecture
            verbose=0,                   # logging verbosity
            tensorboard_log=log_dir,     # directory for TensorBoard logs
            learning_rate=0.00005,       # optimizer learning rate
            buffer_size=500000,          # replay buffer size (SAC is off-policy)
            batch_size=256,              # size of the batch sampled from the buffer
            gamma=0.999,                 # discount factor (kept high: parking is a long-horizon task)
            ent_coef="auto",             # entropy coefficient ('auto' learns it during training)
            tau=0.005,                   # soft-update coefficient for the target networks
            learning_starts=100,         # steps of experience collected before training starts
            device="auto",               # use GPU if available
            use_sde=True,

        )
    else:
        model.set_env(env)

    # Only checkpoint when a complete destination was provided.
    checkpoint_path = None
    if save_path is not None and save_name is not None:
        checkpoint_path = os.path.join(save_path, save_name)
    # The TensorBoard run name must be a string; fall back to the algorithm name.
    run_name = save_name if save_name is not None else "SAC"

    # Train in chunks of save_every timesteps (one chunk when save_every is None).
    # The last chunk is shortened so the total matches total_timesteps exactly.
    chunk_size = total_timesteps if save_every is None else save_every
    remaining = total_timesteps
    while remaining > 0:
        steps = min(chunk_size, remaining)
        model.learn(total_timesteps=steps, progress_bar=True, tb_log_name=run_name, reset_num_timesteps=False)
        remaining -= steps
        if checkpoint_path is not None:
            model.save(checkpoint_path)
        clear_output(wait=True)

    return model

def train_PPO(model: PPO | None = None, total_timesteps: int = 10000, save_every: int | None = None, save_path: str | None = None, save_name: str | None = None):
    """
        Train a PPO (Proximal Policy Optimization) model on the vectorized parking environment.

        Args:
            model: PPO model to keep training. If None, a new model is created.
            total_timesteps: Number of timesteps to train for in this call.
            save_every: Number of timesteps between checkpoints. If None, training
                runs as a single ``learn`` call.
            save_path: Directory where checkpoints are written.
            save_name: Checkpoint file name, also used as the TensorBoard run name.
                Checkpoints are only written when both save_path and save_name are given.
        Returns:
            The trained PPO model.
        Raises:
            ValueError: If save_every is given but is not positive.
    """
    # Validate before building the environment so no worker is started for a bad call.
    if save_every is not None and save_every <= 0:
        raise ValueError(f"save_every must be positive, got {save_every}")

    # Create environment
    env = make_vector_env()
    # neural network hyperparameters
    # net_arch is a list of number of neurons per hidden layer, e.g. [16,20] means
    # two hidden layers with 16 and 20 neurons, respectively
    policy_kwargs = dict(activation_fn=torch.nn.ReLU,
                     net_arch=dict(pi=[128, 128], vf=[128, 128]))

    # instantiates the model using the defined hyperparameters
    if model is None:
        model = PPO(
            policy="MlpPolicy",           # neural network policy architecture (MLP for vector observations)
            env=env,                      # gymnasium-compatible environment to train on
            policy_kwargs=policy_kwargs,  # custom network architecture and activation
            verbose=0,                    # logging verbosity: 0(silent),1(info),2(debug)
            tensorboard_log=log_dir,      # directory for TensorBoard logs
            learning_rate=0.0001,         # optimizer learning rate
            n_steps=2048,                 # rollout steps per environment update
            batch_size=64,                # minibatch size for optimization
            gamma=0.999,                  # discount factor
            gae_lambda=0.95,              # GAE lambda for bias-variance tradeoff
            ent_coef=0.0,                 # entropy coefficient (encourages exploration)
            clip_range=0.2,               # PPO clipping parameter
            n_epochs=10,                  # number of optimization epochs per update
            device="auto"                 # use GPU if available, else CPU
        )
    else:
        model.set_env(env)

    # Only checkpoint when a complete destination was provided.
    checkpoint_path = None
    if save_path is not None and save_name is not None:
        checkpoint_path = os.path.join(save_path, save_name)
    # The TensorBoard run name must be a string; fall back to the algorithm name.
    run_name = save_name if save_name is not None else "PPO"

    # Train in chunks of save_every timesteps (one chunk when save_every is None).
    # The last chunk is shortened so the requested total is not silently truncated.
    chunk_size = total_timesteps if save_every is None else save_every
    remaining = total_timesteps
    while remaining > 0:
        steps = min(chunk_size, remaining)
        model.learn(total_timesteps=steps, progress_bar=True, tb_log_name=run_name, reset_num_timesteps=False)
        remaining -= steps
        if checkpoint_path is not None:
            model.save(checkpoint_path)
        clear_output(wait=True)

    return model

#### Funções auxiliares para visualização

In [5]:
# @title Play Video function
from IPython.display import HTML
from base64 import b64encode
import platform

# pyvirtualdisplay is only imported off Windows; on Windows no virtual display is used.
if platform.system() != 'Windows':
    from pyvirtualdisplay import Display
else:
    Display = None

# create the directory to store the video(s)
os.makedirs("./video", exist_ok=True)

# Start a virtual display for off-screen rendering (skipped on Windows).
# The handle is kept under its own name: binding it to `display` would shadow
# `IPython.display.display`, which is imported at the top of the notebook.
virtual_display = None
if Display is not None:
    virtual_display = Display(visible=False, size=(2000, 1500))
    _ = virtual_display.start()

# Utility functions to record videos of the environment and display them inline.
def render_mp4(videopath: str) -> str:
    """
    Build an HTML snippet that embeds the MP4 at ``videopath`` as base64 data.

    Args:
        videopath: Path to the MP4 file.

    Returns:
        A ``<video>`` element with the file inlined as a base64 data URI, or a
        ``<p>`` element with a "not found" message when the path does not exist.
    """
    if not os.path.exists(videopath):
        return f'<p>Video file not found: {videopath}</p>'
    # Read through a context manager so the file handle is always closed.
    with open(videopath, 'rb') as video_file:
        mp4 = video_file.read()
    base64_encoded_mp4 = b64encode(mp4).decode()
    return f'<video width=400 controls><source src="data:video/mp4;' \
           f'base64,{base64_encoded_mp4}" type="video/mp4"></video>'

def record_and_display_video_manual(env, model, video_name, num_episodes=1):
    """
    Record rollouts frame by frame with ``Visualization.VideoRecorder`` and embed the result.

    Args:
        env: The gymnasium environment. It is reset for each episode and is
            not closed by this function.
        model: The trained model; actions come from ``model.predict`` in deterministic mode.
        video_name (str): Base name of the file written to ``video/<video_name>.mp4``.
        num_episodes (int): The number of episodes to record (default is 1).

    Returns:
        An ``IPython.display.HTML`` object embedding the recorded video.
    """
    os.makedirs("./video", exist_ok=True)

    video_path = f"video/{video_name}.mp4"
    video_recorder = Visualization.VideoRecorder(video_path, fps=10)

    # Reward accumulated over all recorded episodes.
    total_reward = 0.0

    try:
        for _ in range(num_episodes):
            observation, info = env.reset()
            episode_reward = 0.0

            while True:
                action, _ = model.predict(observation, deterministic=True)
                observation, reward, terminated, truncated, info = env.step(action)
                episode_reward += float(reward)
                video_recorder.append(env.render())

                if terminated or truncated:
                    break

            total_reward += episode_reward
    finally:
        # Close the recorder even when a rollout fails, so it is never left open.
        video_recorder.close()

    print(f"\nTotal reward: {total_reward}")
    print(f"Video saved to: {video_path}")

    html = render_mp4(video_path)
    return HTML(html)

def record_and_display_video(env, model, video_name, num_episodes=1):
    """
    Records a video of the agent with ``gym.wrappers.RecordVideo`` and displays it.

    Args:
        env: The gymnasium environment. It is wrapped by the recorder and
            closed when recording ends.
        model: The trained model; actions come from ``model.predict`` in deterministic mode.
        video_name (str): The name to use for the video file.
        num_episodes (int): The number of episodes to record (default is 1).

    Returns:
        An ``IPython.display.HTML`` object embedding the first matching video
        (in sorted file-name order), or a message when no video file is found.
    """
    import glob

    # create the directory to store the video(s)
    os.makedirs("./video", exist_ok=True)

    # Use a virtual display for rendering (skipped on Windows)
    virtual_display = None
    if platform.system() != 'Windows' and Display is not None:
        virtual_display = Display(visible=False, size=(1400, 900))
        _ = virtual_display.start()

    env_name = "ParkingEnv"
    total_reward = 0.0

    try:
        env = gym.wrappers.RecordVideo(
            env,
            video_folder="video",
            name_prefix=f"{env_name}_{video_name}",
            episode_trigger=lambda episode_id: episode_id < num_episodes
        )
        try:
            observation, _ = env.reset()
            done = False
            episode_count = 0

            while not done:
                action, states = model.predict(observation, deterministic=True)
                observation, reward, terminated, truncated, info = env.step(action)
                done = terminated or truncated
                total_reward += float(reward)
                if done:
                    episode_count += 1
                    # Keep going until the requested number of episodes was played.
                    if episode_count < num_episodes:
                        observation, _ = env.reset()
                        done = False
        finally:
            # Closing the wrapper must happen before the files are searched for below,
            # and must also happen when a rollout fails.
            env.close()
    finally:
        # Stop the virtual display if it was started
        if virtual_display is not None:
            virtual_display.stop()

    print(f"\nTotal reward: {total_reward}")

    # Find the video file that was created. The name is escaped so glob
    # metacharacters in video_name are matched literally.
    video_pattern = glob.escape(f"video/{env_name}_{video_name}") + "*.mp4"
    video_files = glob.glob(video_pattern)

    if not video_files:
        # Try alternative pattern
        video_pattern = "video/*" + glob.escape(video_name) + "*.mp4"
        video_files = glob.glob(video_pattern)

    if not video_files:
        # List all video files for debugging
        all_videos = glob.glob("video/*.mp4")
        print(f"Warning: Expected video file not found. Available video files: {all_videos}")
        return HTML("<p>Video file not found. Check the video directory.</p>")

    # glob returns matches in arbitrary order; sort so the choice is deterministic.
    video_path = sorted(video_files)[0]
    print(f"Found video file: {video_path}")

    # show video
    html = render_mp4(video_path)
    return HTML(html)

In [6]:
import shutil
def clean_logs():
    """Leave ``log_dir`` as an existing, empty directory.

    If the directory already exists it is removed with all its contents
    (after printing a notice) and then recreated.
    """
    if not os.path.exists(log_dir):
        os.makedirs(log_dir, exist_ok=True)
        return

    print("Cleaning logs")
    shutil.rmtree(log_dir)
    os.makedirs(log_dir, exist_ok=True)



In [7]:
# Run configuration read by the training and loading cells below.
algorithm = "SAC"                     # trainer to use: "SAC" or "PPO"
model_name = "SAC_21_11_V4"           # checkpoint base name (".zip" is appended when loading)
model_save_dir = "models"             # directory holding the checkpoints
total_training_timesteps = 2_000_000  # timesteps to train for in this run
save_every = 100_000                  # timesteps between checkpoints

# Make sure the checkpoint directory exists before training starts.
os.makedirs(model_save_dir, exist_ok=True)

In [8]:
# Resume from the saved checkpoint when one exists; otherwise train a new model.
if algorithm == "SAC":
    if not os.path.exists(os.path.join(model_save_dir, model_name + ".zip")):
        model = None  # no checkpoint: train_sac creates a new model
    else:
        model_save_path = os.path.join(model_save_dir, model_name + ".zip")
        model = SAC.load(model_save_path)
    model = train_sac(
        model,
        total_timesteps=total_training_timesteps,
        save_every=save_every,
        save_path=model_save_dir,
        save_name=model_name,
    )
elif algorithm == "PPO":
    if not os.path.exists(os.path.join(model_save_dir, model_name + ".zip")):
        model = None  # no checkpoint: train_PPO creates a new model
    else:
        model_save_path = os.path.join(model_save_dir, model_name + ".zip")
        model = PPO.load(model_save_path)
    model = train_PPO(
        model,
        total_timesteps=total_training_timesteps,
        save_every=save_every,
        save_path=model_save_dir,
        save_name=model_name,
    )


Output()

#### Carregar modelo já existente

In [9]:
# Load the checkpoint with the class that matches the configured algorithm;
# loading a PPO checkpoint through SAC.load (or the reverse) would not work.
model_save_path = os.path.join(model_save_dir, model_name + ".zip")
model = (PPO if algorithm == "PPO" else SAC).load(model_save_path)

In [10]:
# Record one deterministic episode of the loaded model and return the embedded MP4.
# NOTE(review): the file is named "ppo_model" although the cell above loads the
# checkpoint with SAC.load - confirm whether the name should follow the algorithm.
env = ParkingEnv()
record_and_display_video_manual(env, model, "ppo_model", num_episodes=1)

Saved 450 frames as MP4 to video/ppo_model.mp4

Total reward: 22.28734077874472
Video saved to: video/ppo_model.mp4


In [11]:
# Evaluate the loaded model over 10 episodes and summarise the episode returns.
total_rewards = np.asarray(evaluate_model(model, 10))
print(f"mean reward {np.mean(total_rewards)}")
print(f"std reward {np.std(total_rewards)}")
print(f"min reward {np.min(total_rewards)}")
print(f"max reward {np.max(total_rewards)}")


mean reward -121.5243447293553
std reward 106.20507175161052
min reward -256.6758143262863
max reward 96.84610974030214
