In [6]:
!pip install "gymnasium[other]"

Collecting moviepy>=1.0.0 (from gymnasium[other])
  Downloading moviepy-2.1.2-py3-none-any.whl.metadata (6.9 kB)
Collecting imageio<3.0,>=2.5 (from moviepy>=1.0.0->gymnasium[other])
  Downloading imageio-2.37.0-py3-none-any.whl.metadata (5.2 kB)
Collecting imageio_ffmpeg>=0.2.0 (from moviepy>=1.0.0->gymnasium[other])
  Downloading imageio_ffmpeg-0.6.0-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting proglog<=1.0.0 (from moviepy>=1.0.0->gymnasium[other])
  Downloading proglog-0.1.10-py3-none-any.whl.metadata (639 bytes)
Collecting python-dotenv>=0.10 (from moviepy>=1.0.0->gymnasium[other])
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting pillow>=8 (from matplotlib>=3.0->gymnasium[other])
  Downloading pillow-10.4.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (9.2 kB)
Downloading moviepy-2.1.2-py3-none-any.whl (126 kB)
Downloading imageio-2.37.0-py3-none-any.whl (315 kB)
Downloading imageio_ffmpeg-0.6.0-py3-none-manylinux2014_x86_64.whl (29.

In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the TensorBoard notebook extension so the %tensorboard magic is available.
%load_ext tensorboard

# Launch TensorBoard on ./logs_new/, the directory this notebook passes to PPO
# as `tensorboard_log`.
# NOTE(review): --host=0.0.0.0 listens on every network interface; confirm that
# exposure is intended (e.g. remote or container use) before running this on a
# shared network.
%tensorboard --logdir ./logs_new/ --host=0.0.0.0

In [14]:
import gymnasium as gym
from gymnasium.envs.registration import register, registry
from gymnasium.wrappers import RecordVideo
import time
import numpy as np
import pygame

import matplotlib
import matplotlib.pyplot as plt

from typing import Any, Dict
import torch
import torch.nn as nn
import tensorboard

from stable_baselines3 import PPO, A2C
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor

In [3]:
# Add the custom marine environment to the Gymnasium registry, skipping the
# call when the id is already present (e.g. when this cell is re-run).
marine_env_registered = 'MarineEnv-v0' in registry
if not marine_env_registered:
    # entry_point is a "module:ClassName" string reference to the env class.
    register(id='MarineEnv-v0', entry_point='environments:MarineEnv')

In [None]:
# is_ipython = 'inline' in matplotlib.get_backend()
# if is_ipython:
#     from IPython import display

# plt.ion()

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
# Shared keyword arguments for every MarineEnv-v0 instance built for training
# and evaluation. The episode step limit is derived from the timescale so that
# it grows as the timescale shrinks (400 / timescale steps).
timescale = 1 / 3
env_kwargs = {
    'render_mode': 'rgb_array',
    'continuous': True,
    'max_episode_steps': int(400 / timescale),
    'training_stage': 2,
    'timescale': timescale,
    'training': True,
    'total_targets': 3,
}

In [7]:
# Vectorised training environment: 8 copies of MarineEnv-v0 built from the
# shared env_kwargs.
train_env = make_vec_env(env_id='MarineEnv-v0', n_envs=8, env_kwargs=env_kwargs)

video_folder = './video/'


def trigger(episode_id):
    """Return True for the episodes RecordVideo should capture (every second one)."""
    return episode_id % 2 == 0


# Single evaluation environment. It is wrapped in Monitor so that SB3's
# evaluation code can read episode returns/lengths from it and does not warn
# about a missing Monitor wrapper; RecordVideo then saves the selected episodes
# to video_folder.
eval_env = Monitor(gym.make('MarineEnv-v0', **env_kwargs))
eval_env = RecordVideo(eval_env, video_folder, episode_trigger=trigger)

In [8]:
# Optional learning-rate schedule: linear decay from `initial_lr` (1e-3) down to
# `final_lr` (1e-4). It is currently NOT used -- the settings below pass a
# constant learning rate -- and is kept so it can be swapped in via
# 'learning_rate': learning_rate_schedule.
initial_lr = 1e-3
final_lr = 1e-4


def learning_rate_schedule(progress_remaining):
    """Linearly interpolate the learning rate between ``final_lr`` and ``initial_lr``.

    Args:
        progress_remaining: Fraction of training still to run; the schedule
            yields ``initial_lr`` at 1.0 and ``final_lr`` at 0.0.

    Returns:
        The learning rate for the given progress value.
    """
    return final_lr + (initial_lr - final_lr) * progress_remaining


# Keyword arguments forwarded to the PPO constructor.
# NOTE(review): the high-precision values look like the result of an automated
# hyperparameter search -- confirm their provenance before changing them.
kwargs = {
    'learning_rate': 0.0006440700528750355,
    'n_steps': 2**10,
    'batch_size': 2**10,
    'gamma': 0.9595334803327593,
    'gae_lambda': 0.9284293803518315,
    'clip_range': 0.15247146447858756,
    'ent_coef': 0.00017106771534852204,
    'vf_coef': 0.8697801969581918,
    'max_grad_norm': 1.1421017563147962,
    'target_kl': 0.19795582328410327,
    'n_epochs': 6,

    # Earlier hand-picked settings, kept for reference (disabled):
    # 'clip_range': 0.2,  # Reduce to prevent large updates
    # 'ent_coef': 5e-2,  # Higher entropy to encourage exploration
    # 'gamma': 0.99,
    # 'learning_rate': learning_rate_schedule,
    # 'n_steps': 2048,  # Increase from default (512) to 2048
    # 'batch_size': 512,  # Adjust batch size for stability
    # 'gae_lambda': 0.95,  # Generalized Advantage Estimation smoothing
    # 'max_grad_norm': 0.9,
    'device': 'cpu',
    'tensorboard_log': './logs_new/',
    # Two hidden layers of 128 units each with tanh activations.
    'policy_kwargs': {'net_arch': [128, 128], 'activation_fn': torch.nn.Tanh},
}

In [9]:
# Create the evaluation callback
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path='./logs_new/best_model/',
    log_path='./logs_new/results/',
    eval_freq=5000,
    deterministic=True,
    render=False
)

In [10]:
# Build the PPO learner with an MLP policy on the vectorised training env;
# all remaining settings come from the `kwargs` dict.
agent = PPO('MlpPolicy', train_env, verbose=0, **kwargs)

In [None]:
# Train for 200k timesteps, logging to TensorBoard under the run name 'ppo_9'
# and evaluating through eval_callback. total_timesteps is passed as an int
# (the original 2e5 literal is a float, while the API documents an int).
# reset_num_timesteps=False keeps the agent's timestep counter, so re-running
# this cell continues the same training run.
agent.learn(
    total_timesteps=200_000,
    reset_num_timesteps=False,
    progress_bar=True,
    tb_log_name='ppo_9',
    callback=eval_callback,
)

In [11]:
# `load` is a classmethod that builds and returns a new model from the saved
# checkpoint; it does not modify an existing instance. Calling it on the class
# makes that explicit and lets this cell run without a trained `agent`.
best_agent = PPO.load('./logs_new/best_model/best_model.zip', device='cpu')

In [None]:
# Score the best checkpoint over 10 deterministic episodes on a fresh env
# without video recording. Monitor supplies the episode statistics SB3's
# evaluate_policy expects (and silences its missing-Monitor warning); the env
# is always closed, even if evaluation fails.
eval_env = Monitor(gym.make('MarineEnv-v0', **env_kwargs))
try:
    mean, std = evaluate_policy(
        model=best_agent,
        env=eval_env,
        n_eval_episodes=10,
        deterministic=True,
    )
finally:
    eval_env.close()
print(f'Mean: {mean:.2f}, Std: {std:.2f}')

In [None]:
# Persist the best checkpoint under the name 'ppo'.
# Disabled alternatives, kept for reference:
#   env.save("ppo_normalized_env.pkl")        # environment normalization stats
#   agent.save("ppo")                          # the last trained agent instead
#   agent = agent.load("ppo", device='cpu')
#   model = model.load('ppo_marine_stage_1.zip')
best_agent.save(path='ppo')

In [None]:
import os

import cv2
from stable_baselines3.common.vec_env import VecNormalize

# Visual sanity check: run the best agent for up to 100 steps on a single
# vectorised env and show each rendered frame in an OpenCV window.
stats_path = "ppo_normalized_env.pkl"
env = make_vec_env(env_id='MarineEnv-v0', n_envs=1, env_kwargs=env_kwargs)

if os.path.exists(stats_path):
    # NOTE(review): only meaningful if the agent was trained on a VecNormalize
    # env whose statistics were saved to this file -- confirm.
    env = VecNormalize.load(stats_path, env)

    # Freeze the running statistics and disable reward normalization for evaluation
    env.training = False
    env.norm_reward = False
else:
    print(f"Warning: {stats_path} not found; running without VecNormalize statistics")

try:
    obs = env.reset()
    for _step in range(100):
        action, _state = best_agent.predict(obs, deterministic=True)
        obs, reward, dones, _infos = env.step(action)

        # Ensure env.get_images() is not empty
        images = env.get_images()
        if images and images[0] is not None:
            frame = images[0]

            # Ensure the frame has valid dimensions before displaying
            # NOTE(review): cv2.imshow expects BGR channel order; if the env
            # renders RGB frames the colours will appear swapped -- confirm.
            if frame.shape[0] > 0 and frame.shape[1] > 0:
                cv2.imshow("PPO MarineEnv Evaluation", frame)
                cv2.waitKey(1)  # Display for 1ms
            else:
                print("Warning: Received an empty frame from env.get_images()")

        # `dones` holds one flag per vectorised env; there is exactly one env here.
        if dones[0]:
            break
finally:
    # Release the env and the display window even if the rollout fails.
    env.close()
    cv2.destroyAllWindows()  # Close display window


In [12]:
# Watch the best agent for 5 episodes in a live ('human') window at a finer
# timescale than the one used for training, printing per-step diagnostics.
timescale = 1/6
env_trn = gym.make('MarineEnv-v0', render_mode='human', continuous=True, training_stage=2, timescale=timescale, training=False, total_targets=3)

try:
    for _episode in range(5):
        state, _ = env_trn.reset()
        print('Detected targets:', list(env_trn.unwrapped.own_ship.detected_targets))
        print(state)
        episode_rewards = 0

        for _step in range(int(400 / timescale)):
            # predict() returns (action, recurrent_state); only the action is needed.
            action, _ = best_agent.predict(state, deterministic=True)
            observation, reward, terminated, truncated, info = env_trn.step(action)
            env_trn.render()
            time.sleep(0.005)
            episode_rewards += reward

            # Track the newest observation before the termination check so the
            # "final state" reported below really is the last observation.
            state = observation

            print('===========================')
            print(observation)
            print(f'Step reward: {reward:.2f}')
            print(f'Current Total reward: {episode_rewards:.2f}')
            print('Dangerous targets: ', list(env_trn.unwrapped.own_ship.dangerous_targets))

            if terminated or truncated:
                print('Episode total reward: ', episode_rewards)
                print(info)
                break

        print('Episode total rewards: ', episode_rewards)
        print('Episode final state: ', state)
        print('============================\n' * 10)
finally:
    # Close the environment once, after ALL episodes (closing it inside the
    # episode loop would reuse a closed env from the second episode onward).
    env_trn.close()

Detected targets: [Target:
Position: (0.4053440247476886, 0.28233145259015063)
Course: 265.39
Speed: 18.95
Relative Bearing: 86.17
Distance: 7.66
Relative Course: 186.83
Relative Speed: 3.45
CPA: 0.64
TCPA: 132.90
BCR: -0.65
TBC: 135.14
IsDangerous: False
Aspect: crossing
, Target:
Position: (0.2853183156181675, 0.15226575483402627)
Course: 19.35
Speed: 2.03
Relative Bearing: -2.44
Distance: 7.55
Relative Course: 89.95
Relative Speed: 19.16
CPA: 0.46
TCPA: 23.60
BCR: -0.49
TBC: 24.11
IsDangerous: False
Aspect: static
, Target:
Position: (0.3953038011236159, 0.3213533365322092)
Course: 231.08
Speed: 39.61
Relative Bearing: 104.47
Distance: 7.53
Relative Course: 204.73
Relative Speed: 29.49
CPA: 0.58
TCPA: 15.27
BCR: -1.30
TBC: 17.63
IsDangerous: False
Aspect: crossing
]
[275.87347   18.580944  13.799662  44.56069   -4.837437  44.56069
   0.         0.         0.         0.         0.         0.
   0.         0.         0.         0.         0.         0.
   0.         0.         0.     

In [None]:
import glob
import os
from base64 import b64encode

from gymnasium.wrappers import RecordVideo
from IPython.display import HTML

# Create a folder to save videos
video_folder = './video'
os.makedirs(video_folder, exist_ok=True)

env_trn = gym.make('MarineEnv-v0', render_mode='rgb_array', continuous=True, training_stage=2, timescale=timescale, training=False, total_targets=3)

# Wrap the environment so played episodes are written to video_folder.
# NOTE(review): no episode_trigger is passed, so RecordVideo's default recording
# schedule applies and presumably not every one of the 5 episodes is saved --
# pass episode_trigger explicitly if all of them are wanted.
env = RecordVideo(env_trn, video_folder)

# Play 5 episodes with the best agent. No sleep is needed here: frames are
# captured off-screen per step, so pacing the loop would only slow recording.
try:
    for _episode in range(5):
        state, _ = env.reset()
        for _step in range(int(400 / timescale)):
            action, _ = best_agent.predict(state, deterministic=True)
            state, reward, terminated, truncated, info = env.step(action)

            if terminated or truncated:
                break
finally:
    # Closing the wrapper finalises any video that is still being written.
    env.close()

# Locate the recorded .mp4 files; sorted so that the listing (and the "first"
# video picked later) is deterministic.
video_files = sorted(glob.glob(video_folder + "/*.mp4"))
print("Recorded video files:", video_files)



In [None]:
from base64 import b64encode

from IPython.display import HTML, display

# Display the first recorded video (if available).
# display() is required: a bare HTML(...) expression nested inside the `if`
# block is not the cell's last top-level expression, so it would not render.
if video_files:
    video_file = video_files[0]
    # Read the file through a context manager so the handle is closed promptly.
    with open(video_file, "rb") as video_stream:
        video = video_stream.read()
    video_b64 = b64encode(video).decode("utf-8")
    display(HTML(f"""
    <video width="640" height="480" controls>
      <source src="data:video/mp4;base64,{video_b64}" type="video/mp4">
    </video>
    """))
else:
    print("No recorded video files found")