In [10]:
import os
import time
import numpy as np
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv
from gymnasium.envs.registration import register
from ppo_env.ppo import BattleshipEnv  # Assuming your BattleshipEnv is saved as battleship_env.py

import pandas as pd
import hvplot.pandas
from tqdm import tqdm

# Register the custom Battleship environment
register(
    id='BattleshipEnvSD-v0',
    entry_point='ppo_env.ppo:BattleshipEnv',  # Module path to the BattleshipEnv class
    kwargs={"board_size": 10, "ships": [2, 3, 3, 4, 5]},
    # max_episode_steps=100,  # Adjust based on game rules or preferences
)

# Create a directory for logging
timestr = time.strftime("%Y%m%d-%H%M%S")
log_dir = f"tmp/{timestr}/"
os.makedirs(log_dir, exist_ok=True)

# Initialize the environment and wrap it with Monitor for logging
env = gym.make("BattleshipEnvSD-v0")

  logger.warn(f"Overriding environment {new_spec.id} already in registry.")


In [15]:
env = Monitor(env, log_dir)

# Use DummyVecEnv to enable vectorized environments (needed for stable-baselines3)
vec_env = DummyVecEnv([lambda: env])

# Initialize the PPO model
model = PPO("MlpPolicy", vec_env, verbose=1)

# Train the model
model.learn(total_timesteps=100000)

# Save the trained model
model_save_path = f"models/{timestr}-ppo-battleship"
os.makedirs("models", exist_ok=True)
model.save(model_save_path)


Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 349      |
|    ep_rew_mean     | -242     |
| time/              |          |
|    fps             | 3126     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 388           |
|    ep_rew_mean          | -281          |
| time/                   |               |
|    fps                  | 2318          |
|    iterations           | 2             |
|    time_elapsed         | 1             |
|    total_timesteps      | 4096          |
| train/                  |               |
|    approx_kl            | 0.018551514   |
|    clip_fraction        | 0.159         |
|    clip_range           | 0.2           |
|    entropy_loss         | -4.6          |
|    explained_variance   |

In [16]:
# Read the monitor.csv file
monitor_file = f"{log_dir}/monitor.csv"
monitor_data = pd.read_csv(monitor_file, skiprows=1)  # Skip the first line (metadata)

# Check the structure of monitor_data
print(monitor_data.head())

# Extract episode numbers and rewards
training_reward = monitor_data[["r", "l"]].copy()  # 'r' is reward, 'l' is episode length
training_reward = training_reward.reset_index()
training_reward.rename(columns={"index": "Episode", "r": "Reward"}, inplace=True)

# Plot rewards vs. episodes
training_reward.hvplot(x="Episode", y="Reward", title="Training Reward over Episodes")

       r      l         t
0 -230.1  337.0  0.107146
1 -196.1  303.0  0.269331
2 -372.3  481.0  0.395936
3 -124.3  224.0  0.455451
4 -289.3  398.0  0.571148


In [17]:
# Load and test the model
model = PPO.load(model_save_path)
model

<stable_baselines3.ppo.ppo.PPO at 0x3035bc370>