<a href="https://colab.research.google.com/github/ObaOzai/SharedCode/blob/main/BreakOutV1.0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Breakout ATARI RL Study V 1.0
### Juan David Correa · <http://www.astropema.com> · May 2025

In [1]:
#!pip install ale-py==0.8.1 gymnasium==0.28.1 stable-baselines3==2.0.0
#!pip install autorom[accept-rom-license]
#!AutoROM --accept-license
#%pip install gymnasium[atari]



In [None]:
# breakout_ppo_full.py
# Fully corrected and complete script for PPO training on Breakout with Gymnasium, ALE and Stable-Baselines3

import os
import time
import logging
from pathlib import Path
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback

# Quieten TensorFlow/CUDA log chatter in case TF gets imported indirectly.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

# Root logging configuration and the logger shared by the rest of this script.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
log = logging.getLogger("BreakoutPPO")

# --- Configuration ---
# Environment selection and seeding.
GAME_ID = "Breakout"
ENV_ID = "ALE/" + GAME_ID + "-v5"
SEED = 42           # base seed; main() seeds training env i with SEED + i
EVAL_SEED = 5678    # seed for the single evaluation env built in main()
NUM_ENVS = 2        # number of training envs placed in the DummyVecEnv

# Training budget: passed to model.learn(total_timesteps=...) in main().
TOTAL_STEPS = 5_000_000

# PPO hyperparameters: each is forwarded unchanged to the PPO constructor
# in main() under the matching lower-case keyword.
N_STEPS = 64
BATCH_SIZE = 32
N_EPOCHS = 2
LEARNING_RATE = 2.5e-4
CLIP_RANGE = 0.2
ENT_COEF = 0.01
GAMMA = 0.99
GAE_LAMBDA = 0.95
VF_COEF = 0.5
MAX_GRAD_NORM = 0.5

# Output layout: one run directory under ./logs, named with the integer Unix
# time at import, holding separate checkpoint and evaluation sub-directories.
BASE_DIR = Path("./logs", f"breakout_ppo_{int(time.time())}")
CKPT_DIR = BASE_DIR / "checkpoints"
EVAL_DIR = BASE_DIR / "eval"

# Create the whole tree up front (idempotent) and record each path in the log.
for directory in (BASE_DIR, CKPT_DIR, EVAL_DIR):
    directory.mkdir(parents=True, exist_ok=True)
    log.info(f"Ensured directory: {directory}")

# Utility to create and seed Gym environment
def make_env(seed: int):
    """Return a zero-argument factory that builds one seeded Atari env.

    The factory form is what ``DummyVecEnv`` is given in ``main()``: a list
    of callables, each producing a fresh environment when invoked.

    Args:
        seed: Seed handed to the environment's first ``reset`` call.

    Returns:
        A callable taking no arguments and returning the ``Monitor``-wrapped
        environment for ``ENV_ID``, already reset once with ``seed``.
    """
    def _build():
        # Monitor wraps the raw env so episode reward/length statistics
        # (ep_rew_mean, ep_len_mean) are available to the training logger.
        monitored = Monitor(gym.make(ENV_ID, obs_type='rgb', frameskip=4))
        # The seeded reset happens here, inside the factory, once per env.
        monitored.reset(seed=seed)
        return monitored

    return _build


def main():
    """Train PPO on Breakout with periodic checkpoints and evaluation.

    Builds ``NUM_ENVS`` monitored training environments and one evaluation
    environment (each stacked over 4 frames), constructs the PPO model from
    the module-level hyperparameters, trains for ``TOTAL_STEPS`` timesteps,
    and saves the final model under ``BASE_DIR``. Both vectorised
    environments are closed on exit, including when training raises or is
    interrupted.
    """
    # Create vectorized training environments
    log.info(f"Creating {NUM_ENVS} training environments with seed {SEED}")
    train_env = DummyVecEnv([make_env(SEED + i) for i in range(NUM_ENVS)])
    train_env = VecFrameStack(train_env, n_stack=4)

    eval_env = None
    try:
        # Create evaluation environment
        log.info("Creating evaluation environment")
        eval_env = DummyVecEnv([make_env(EVAL_SEED)])
        eval_env = VecFrameStack(eval_env, n_stack=4)

        # Initialize PPO model. Passing the seed makes weight initialisation
        # and action sampling reproducible, not just the environments.
        model = PPO(
            policy="CnnPolicy",
            env=train_env,
            learning_rate=LEARNING_RATE,
            n_steps=N_STEPS,
            batch_size=BATCH_SIZE,
            n_epochs=N_EPOCHS,
            gamma=GAMMA,
            gae_lambda=GAE_LAMBDA,
            clip_range=CLIP_RANGE,
            ent_coef=ENT_COEF,
            vf_coef=VF_COEF,
            max_grad_norm=MAX_GRAD_NORM,
            verbose=1,
            tensorboard_log=str(BASE_DIR),
            seed=SEED,
            device="auto"
        )

        # Callback frequencies are counted in calls to the vectorised
        # env.step(), and each call advances NUM_ENVS timesteps. Divide so
        # the intervals below are expressed in actual timesteps.
        checkpoint_every = max(100_000 // NUM_ENVS, 1)
        eval_every = max(50_000 // NUM_ENVS, 1)

        # Callbacks: checkpoints and evaluation
        checkpoint_cb = CheckpointCallback(
            save_freq=checkpoint_every,
            save_path=str(CKPT_DIR),
            name_prefix="ppo_breakout"
        )
        eval_cb = EvalCallback(
            eval_env,
            best_model_save_path=str(BASE_DIR / "best"),
            log_path=str(EVAL_DIR),
            eval_freq=eval_every,
            n_eval_episodes=5,
            deterministic=True,
            render=False
        )

        # Train
        log.info(f"Starting training for {TOTAL_STEPS:,} timesteps")
        model.learn(
            total_timesteps=TOTAL_STEPS,
            callback=[checkpoint_cb, eval_cb],
            tb_log_name="breakout_ppo",
            reset_num_timesteps=True
        )

        # Save final model
        final_path = BASE_DIR / "ppo_breakout_final"
        model.save(str(final_path))
        log.info(f"Saved final model to {final_path}")
    finally:
        # Clean up even if setup or training failed, so emulator instances
        # are not left open.
        train_env.close()
        if eval_env is not None:
            eval_env.close()

# Start training only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()


Using cpu device
Wrapping the env in a VecTransposeImage.
Logging to logs/breakout_ppo_1746137934/breakout_ppo_1




[1;30;43mStreaming output truncated to the last 5000 lines.[0m
|    value_loss           | 0.0221       |
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 226          |
|    ep_rew_mean          | 2.52         |
| time/                   |              |
|    fps                  | 14           |
|    iterations           | 481          |
|    time_elapsed         | 4280         |
|    total_timesteps      | 61568        |
| train/                  |              |
|    approx_kl            | 0.0021961173 |
|    clip_fraction        | 0.0156       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.587       |
|    explained_variance   | 0.9674257    |
|    learning_rate        | 0.00025      |
|    loss                 | 0.0136       |
|    n_updates            | 960          |
|    policy_gradient_loss | 0.000367     |
|    value_loss           | 0.01