In [23]:
import os
import sys
import warnings

import datetime
import math
import numpy as np
import pandas as pd
import platform
import re
import subprocess
import torch as th

import os
import pickle

import gymnasium as gym
import matplotlib.pyplot as plt

from datetime import datetime
from scripts.utils import *
from scripts.visualizations import *
from src.config import *

import os
import numpy as np
import torch as th
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.policies import ActorCriticPolicy
from stable_baselines3.common.distributions import CategoricalDistribution
from stable_baselines3.common.logger import configure
from scripts.utils import *
from src.environment import AircraftDisruptionEnv

In [24]:
# PPO-specific hyperparameters
PPO_LEARNING_RATE = 0.0003
PPO_N_STEPS = 2048
PPO_BATCH_SIZE = 64
PPO_N_EPOCHS = 10
PPO_GAMMA = 0.99
PPO_CLIP_RANGE = 0.2

# Shared constants
TRAINING_FOLDERS_PATH = '../data/Training/1k-3ac-12f-1dis-F/'
TESTING_FOLDERS_PATH = '../data/Testing/1k-3ac-12f-1dis-F/'
N_EPISODES = 500
MODEL_SAVE_PATH = '../trained_models/'

In [25]:
# Initialize device
device = initialize_device()

# Check device capabilities
check_device_capabilities()

# Get device-specific information
device_info = get_device_info(device)
print(f"Device info: {device_info}")

# Verify training folders and gather training data
training_folders = verify_training_folders(TRAINING_FOLDERS_PATH)
# print(f"Training folders: {training_folders}")

# Calculate training days and model naming
num_days_trained_on = calculate_training_days(N_EPISODES, training_folders)
print(f"Training on {num_days_trained_on} days of data "
        f"({N_EPISODES} episodes of {len(training_folders)} scenarios)")

formatted_days = format_days(num_days_trained_on)
last_folder = os.path.basename(os.path.normpath(TRAINING_FOLDERS_PATH))
model_name = last_folder
model_version = get_model_version(model_name)
MODEL_SAVE_PATH = f'../trained_models/'
MODEL_SAVE_NAME = f'{model_name}-{formatted_days}-{model_version}.zip'
print(f"Models will be saved to:")
print(f"   {MODEL_SAVE_PATH}myopic_{MODEL_SAVE_NAME}")
print(f"   {MODEL_SAVE_PATH}proactive_{MODEL_SAVE_NAME}")

# Create results directory
results_dir = create_results_directory()
print(f"Results directory created at: {results_dir}")

Using device: mps
CUDA available: False
Number of GPUs available: 0
cuDNN enabled: True
Device: mps
Using MacBook M1
Device info: {'device_type': 'MacBook M1'}
Training on 500000 days of data (500 episodes of 1000 scenarios)
Models will be saved to:
   ../trained_models/myopic_1k-3ac-12f-1dis-F-500k-1.zip
   ../trained_models/proactive_1k-3ac-12f-1dis-F-500k-1.zip
Results directory created at: ../results/20241120-06-49


In [26]:
import os
import numpy as np
import torch as th
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.policies import ActorCriticPolicy
from stable_baselines3.common.distributions import CategoricalDistribution
from stable_baselines3.common.logger import configure
from stable_baselines3.common.env_util import make_vec_env
from scripts.utils import *
from src.environment import AircraftDisruptionEnv
from stable_baselines3.common.env_checker import check_env

# Constants
PPO_LEARNING_RATE = 0.0003
PPO_N_STEPS = 2048
PPO_BATCH_SIZE = 64
PPO_N_EPOCHS = 10
PPO_GAMMA = 0.99
PPO_CLIP_RANGE = 0.2
N_EPISODES = 500
MODEL_SAVE_PATH = '../trained_models/'
TRAINING_FOLDERS_PATH = '../data/Training/1k-3ac-12f-1dis-F/'
TESTING_FOLDERS_PATH = '../data/Testing/1k-3ac-12f-1dis-F/'

# Custom environment wrapper to enforce float32 observations
class Float32AircraftDisruptionEnv(AircraftDisruptionEnv):
    def reset(self, **kwargs):
        obs, info = super().reset(**kwargs)
        obs = self.cast_to_float32(obs)
        return obs, info

    def step(self, action):
        obs, reward, done, truncated, info = super().step(action)
        obs = self.cast_to_float32(obs)
        return obs, reward, done, truncated, info

    @staticmethod
    def cast_to_float32(obs):
        """
        Recursively cast all observation components to float32.
        """
        if isinstance(obs, dict):
            return {key: Float32AircraftDisruptionEnv.cast_to_float32(value) for key, value in obs.items()}
        elif isinstance(obs, np.ndarray):
            return obs.astype(np.float32)
        else:
            return np.array(obs, dtype=np.float32)

# Training function
def train_ppo_agent(env_type):
    # Prepare logging
    logger = configure()

    # List all the scenario folders
    scenario_folders = [
        os.path.join(TRAINING_FOLDERS_PATH, folder)
        for folder in os.listdir(TRAINING_FOLDERS_PATH)
        if os.path.isdir(os.path.join(TRAINING_FOLDERS_PATH, folder))
    ]

    # Initialize the environment using the first scenario folder
    dummy_scenario_folder = scenario_folders[0]
    data_dict = load_scenario_data(dummy_scenario_folder)
    env = Float32AircraftDisruptionEnv(
        data_dict['aircraft'],
        data_dict['flights'],
        data_dict['rotations'],
        data_dict['alt_aircraft'],
        data_dict['config'],
        env_type=env_type
    )

    check_env(env)  # Ensure the environment is compatible with SB3

    vec_env = DummyVecEnv([lambda: env])

    # Initialize the PPO model
    model = PPO(
        policy="MultiInputPolicy",  # Supports dict-based observations
        env=vec_env,
        learning_rate=PPO_LEARNING_RATE,
        n_steps=PPO_N_STEPS,
        batch_size=PPO_BATCH_SIZE,
        n_epochs=PPO_N_EPOCHS,
        gamma=PPO_GAMMA,
        clip_range=PPO_CLIP_RANGE,
        verbose=1,
        device="auto"
    )

    model._logger = logger

    # Training loop
    for episode in range(N_EPISODES):
        for scenario_folder in scenario_folders:
            # Reload the environment with a new scenario
            data_dict = load_scenario_data(scenario_folder)
            env = Float32AircraftDisruptionEnv(
                data_dict['aircraft'],
                data_dict['flights'],
                data_dict['rotations'],
                data_dict['alt_aircraft'],
                data_dict['config'],
                env_type=env_type
            )
            model.set_env(DummyVecEnv([lambda: env]))

            # Train the model
            model.learn(total_timesteps=PPO_N_STEPS)

        print(f"{env_type}: Episode {episode + 1}/{N_EPISODES} completed.")

    # Save the trained model
    model_save_name = f'{env_type}_ppo_with_casting.zip'
    model.save(os.path.join(MODEL_SAVE_PATH, model_save_name))
    print(f"Model saved to {os.path.join(MODEL_SAVE_PATH, model_save_name)}")

    return model

# Train the myopic and proactive PPO agents
if __name__ == "__main__":
    print("Starting PPO training for Myopic agent...")
    myopic_model = train_ppo_agent('myopic')

    print("Starting PPO training for Proactive agent...")
    proactive_model = train_ppo_agent('proactive')

    print("Training completed for both agents.")


Starting PPO training for Myopic agent...
Logging to /var/folders/m6/gwyqzldd12bg_s3mrl40tp6r0000gn/T/SB3-2024-11-20-06-49-00-087313


AssertionError: Error while checking key=action_mask: The observation returned by the `reset()` method does not match the data type (cannot cast) of the given observation space Box(0, 1, (52,), uint8). Expected: uint8, actual dtype: float32