In [1]:
# train_dqn.py

# %% [markdown]
# ### Import packages

import os
import sys
import warnings
import datetime
import math
import numpy as np
import pandas as pd
import platform
import re
import subprocess
import torch as th

import pickle
import gymnasium as gym
import matplotlib.pyplot as plt

from datetime import datetime
from scripts.utils import *
from scripts.visualizations import *
from src.config import *
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.logger import configure  # Import the configure function
from stable_baselines3.common.utils import polyak_update, set_random_seed
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv

from scripts.utils import NumpyEncoder
from scripts.logger import *
import json
import seaborn as sns
sns.set(style="darkgrid")

# %% [markdown]
# ### Hyperparameters

if 'MAX_TOTAL_TIMESTEPS' not in globals():
    MAX_TOTAL_TIMESTEPS = 100
if 'TRAINING_FOLDERS_PATH' not in globals():
    TRAINING_FOLDERS_PATH = "../data/Training/6ac-100-mixed-low/"
if 'SEEDS' not in globals():
    SEEDS = [0]
if 'brute_force_flag' not in globals():
    brute_force_flag = False

LEARNING_RATE = 0.0001
GAMMA = 0.99
BUFFER_SIZE = 100000
BATCH_SIZE = 128
TARGET_UPDATE_INTERVAL = 1000
NEURAL_NET_STRUCTURE = dict(net_arch=[256, 256])
LEARNING_STARTS = 10000
TRAIN_FREQ = 4

EPSILON_START = 1.0
EPSILON_MIN = 0.025
PERCENTAGE_MIN = 95
EPSILON_TYPE = "exponential"
if EPSILON_TYPE == "linear":
    EPSILON_MIN = 0

N_EPISODES = 50
cross_val_flag = False
CROSS_VAL_INTERVAL = N_EPISODES/100

TESTING_FOLDERS_PATH = "../data/Training/3ac-10-deterministic/"
stripped_scenario_folder = TRAINING_FOLDERS_PATH.split("/")[-2]
print(f"Training on {stripped_scenario_folder}")
num_scenarios_training = len(os.listdir(TRAINING_FOLDERS_PATH))

# based on MAX_TOTAL_TIMESTEPS etc. calculate EPSILON_DECAY_RATE
EPSILON_DECAY_RATE = calculate_epsilon_decay_rate(
    MAX_TOTAL_TIMESTEPS, EPSILON_START, EPSILON_MIN, PERCENTAGE_MIN, EPSILON_TYPE
)
print("EPSILON DECAY RATE: ", EPSILON_DECAY_RATE)

device = initialize_device()
check_device_capabilities()
device_info = get_device_info(device)
print(f"Device info: {device_info}")

training_folders = verify_training_folders(TRAINING_FOLDERS_PATH)
num_days_trained_on = calculate_training_days(N_EPISODES, training_folders)
print(f"Training on {num_days_trained_on} days of data "
      f"({N_EPISODES} episodes of {len(training_folders)} scenarios)")

formatted_days = format_days(num_days_trained_on)
MODEL_SAVE_PATH = f'../trained_models/dqn/'
results_dir = create_results_directory(append_to_name='dqn')
print(f"Results directory created at: {results_dir}")

# %% [markdown]
# # Reinforcement Learning

all_logs = {}

# THIS IS THE ORIGINAL TRAINING FUNCTION, JUST RENAMED AND MADE MODULAR
def train_dqn(env_type, training_parameters, TRAINING_FOLDERS_PATH):

    # Use the parameters passed in
    global MAX_TOTAL_TIMESTEPS, TRAINING_FOLDERS_PATH, SEEDS, brute_force_flag
    global LEARNING_RATE, GAMMA, BUFFER_SIZE, BATCH_SIZE, TARGET_UPDATE_INTERVAL
    global NEURAL_NET_STRUCTURE, LEARNING_STARTS, TRAIN_FREQ
    global EPSILON_START, EPSILON_MIN, EPSILON_DECAY_RATE, N_EPISODES, cross_val_flag, CROSS_VAL_INTERVAL
    global TESTING_FOLDERS_PATH, device, results_dir

    MAX_TOTAL_TIMESTEPS = training_parameters.get('MAX_TOTAL_TIMESTEPS', MAX_TOTAL_TIMESTEPS)
    SEEDS = training_parameters.get('SEEDS', SEEDS)
    LEARNING_RATE = training_parameters.get('LEARNING_RATE', LEARNING_RATE)
    GAMMA = training_parameters.get('GAMMA', GAMMA)
    BUFFER_SIZE = training_parameters.get('BUFFER_SIZE', BUFFER_SIZE)
    BATCH_SIZE = training_parameters.get('BATCH_SIZE', BATCH_SIZE)
    TARGET_UPDATE_INTERVAL = training_parameters.get('TARGET_UPDATE_INTERVAL', TARGET_UPDATE_INTERVAL)
    NEURAL_NET_STRUCTURE = training_parameters.get('NEURAL_NET_STRUCTURE', NEURAL_NET_STRUCTURE)
    LEARNING_STARTS = training_parameters.get('LEARNING_STARTS', LEARNING_STARTS)
    TRAIN_FREQ = training_parameters.get('TRAIN_FREQ', TRAIN_FREQ)
    EPSILON_START = training_parameters.get('EPSILON_START', EPSILON_START)
    EPSILON_MIN = training_parameters.get('EPSILON_MIN', EPSILON_MIN)
    PERCENTAGE_MIN = training_parameters.get('PERCENTAGE_MIN', PERCENTAGE_MIN)
    EPSILON_TYPE = training_parameters.get('EPSILON_TYPE', EPSILON_TYPE)
    N_EPISODES = training_parameters.get('N_EPISODES', N_EPISODES)
    brute_force_flag = training_parameters.get('brute_force_flag', brute_force_flag)

    # Recalculate EPSILON_DECAY_RATE in case parameters changed
    EPSILON_DECAY_RATE = calculate_epsilon_decay_rate(
        MAX_TOTAL_TIMESTEPS, EPSILON_START, EPSILON_MIN, PERCENTAGE_MIN, EPSILON_TYPE
    )

    training_folders = verify_training_folders(TRAINING_FOLDERS_PATH)
    num_scenarios_training = len(os.listdir(TRAINING_FOLDERS_PATH))

    from scripts.logger import create_new_id, get_config_variables
    import src.config as config

    config_variables = get_config_variables(config)
    training_id = create_new_id("training")
    runtime_start_in_seconds = time.time()

    if env_type == "myopic":
        model_path = f"../trained_models/dqn/myopic-{training_id}.zip"
        print(f"Models will be saved to: {model_path}")
        model_path_and_name = model_path
    elif env_type == "proactive":
        model_path = f"../trained_models/dqn/proactive-{training_id}.zip"
        print(f"Models will be saved to: {model_path}")
        model_path_and_name = model_path

    log_data = {}
    training_metadata = {
        "myopic_or_proactive": env_type,
        "model_type": "dqn",
        "training_id": training_id,
        "MODEL_SAVE_PATH": model_path,
        "N_EPISODES": N_EPISODES,
        "num_scenarios_training": num_scenarios_training,
        "results_dir": results_dir,
        "CROSS_VAL_FLAG": cross_val_flag,
        "CROSS_VAL_INTERVAL": CROSS_VAL_INTERVAL,
        **config_variables,
        "LEARNING_RATE": LEARNING_RATE,
        "GAMMA": GAMMA,
        "BUFFER_SIZE": BUFFER_SIZE,
        "BATCH_SIZE": BATCH_SIZE,
        "TARGET_UPDATE_INTERVAL": TARGET_UPDATE_INTERVAL,
        "EPSILON_START": EPSILON_START,
        "EPSILON_MIN": EPSILON_MIN,
        "EPSILON_DECAY_RATE": EPSILON_DECAY_RATE,
        "LEARNING_STARTS": LEARNING_STARTS,
        "TRAIN_FREQ": TRAIN_FREQ,
        "NEURAL_NET_STRUCTURE": NEURAL_NET_STRUCTURE,
        "device_info": str(get_device_info(device)),
        "TRAINING_FOLDERS_PATH": TRAINING_FOLDERS_PATH,
        "TESTING_FOLDERS_PATH": TESTING_FOLDERS_PATH,
        "runtime_start": datetime.utcnow().isoformat() + "Z",
        "runtime_start_in_seconds": runtime_start_in_seconds,
    }

    log_data['metadata'] = training_metadata
    log_data['episodes'] = {}
    log_data['cross_validation'] = {}

    best_reward_avg = float('-inf')
    rewards = {}
    good_rewards = {}
    test_rewards = []
    epsilon_values = []
    total_timesteps = 0
    consecutive_drops = 0
    best_test_reward = float('-inf')

    action_sequences = {}
    for folder in training_folders:
        action_sequences[os.path.join(TRAINING_FOLDERS_PATH, folder)] = {
            "best_actions": [],
            "best_reward": float('-inf'),
            "worst_actions": [],
            "worst_reward": float('inf')
        }

    # Load dummy scenario
    scenario_folders = [
        os.path.join(TRAINING_FOLDERS_PATH, folder)
        for folder in os.listdir(TRAINING_FOLDERS_PATH)
        if os.path.isdir(os.path.join(TRAINING_FOLDERS_PATH, folder))
    ]

    dummy_scenario_folder = scenario_folders[0]
    data_dict = load_scenario_data(dummy_scenario_folder)
    aircraft_dict = data_dict['aircraft']
    flights_dict = data_dict['flights']
    rotations_dict = data_dict['rotations']
    alt_aircraft_dict = data_dict['alt_aircraft']
    config_dict = data_dict['config']

    from src.environment import AircraftDisruptionEnv
    env = AircraftDisruptionEnv(
        aircraft_dict,
        flights_dict,
        rotations_dict,
        alt_aircraft_dict,
        config_dict,
        env_type=env_type
    )

    model = DQN(
        policy='MultiInputPolicy',
        env=env,
        learning_rate=LEARNING_RATE,
        gamma=GAMMA,
        buffer_size=BUFFER_SIZE,
        learning_starts=LEARNING_STARTS,
        batch_size=BATCH_SIZE,
        target_update_interval=TARGET_UPDATE_INTERVAL,
        verbose=0,
        policy_kwargs=NEURAL_NET_STRUCTURE,
        device=device
    )

    # ORIGINAL CODE: model._logger = logger
    # FIX: use set_logger to avoid error
    logger = configure()
    model.set_logger(logger)

    epsilon = EPSILON_START
    total_timesteps = 0
    episode = 0

    def cross_validate_on_test_data(model, current_episode, log_data):
        # ORIGINAL CROSS-VALIDATION FUNCTION (unchanged)
        cross_val_data = {
            "episode": current_episode,
            "scenarios": [],
            "avg_test_reward": 0,
        }
        test_scenario_folders = [
            os.path.join(TESTING_FOLDERS_PATH, folder)
            for folder in os.listdir(TESTING_FOLDERS_PATH)
            if os.path.isdir(os.path.join(TESTING_FOLDERS_PATH, folder))
        ]
        total_test_reward = 0
        for test_scenario_folder in test_scenario_folders:
            scenario_data = {
                "scenario_folder": test_scenario_folder,
                "total_reward": 0,
                "steps": []
            }
            data_dict = load_scenario_data(test_scenario_folder)
            aircraft_dict = data_dict['aircraft']
            flights_dict = data_dict['flights']
            rotations_dict = data_dict['rotations']
            alt_aircraft_dict = data_dict['alt_aircraft']
            config_dict = data_dict['config']

            from src.environment import AircraftDisruptionEnv
            env = AircraftDisruptionEnv(
                aircraft_dict,
                flights_dict,
                rotations_dict,
                alt_aircraft_dict,
                config_dict,
                env_type=env_type
            )
            model.set_env(env)
            obs, _ = env.reset()
            done_flag = False
            timesteps = 0
            total_reward = 0

            while not done_flag:
                action_mask = obs['action_mask']
                obs = {key: np.array(value, dtype=np.float32) for key, value in obs.items()}
                obs_tensor = model.policy.obs_to_tensor(obs)[0]
                q_values = model.policy.q_net(obs_tensor).detach().cpu().numpy().squeeze()
                masked_q_values = q_values.copy()
                masked_q_values[action_mask == 0] = -np.inf
                action = np.argmax(masked_q_values)
                result = env.step(action)
                obs_next, reward, terminated, truncated, info = result
                done_flag = terminated or truncated
                total_reward += reward
                obs = obs_next
                scenario_data["steps"].append({
                    "step_number": timesteps + 1,
                    "action": action,
                    "flight_action": env.map_index_to_action(action)[0],
                    "aircraft_action": env.map_index_to_action(action)[1],
                    "reward": reward,
                    "total_timestep": total_timesteps,
                    "time_in_scenario": timesteps,
                    "epsilon": "1.0 at cross-validation",
                    "action_reason": "exploitation at cross-validation",
                    "action_mask": action_mask,
                    "action_mask_sum": np.sum(action_mask),
                    "len_action_mask": len(action_mask),
                    "masked_q_values": masked_q_values,
                    "q_values": q_values,
                    "info_after_step": env.info_after_step,
                })
                timesteps += 1
                if done_flag:
                    break

            total_test_reward += total_reward
            scenario_data["total_reward"] = total_reward
            cross_val_data["scenarios"].append(scenario_data)
        avg_test_reward = total_test_reward / len(test_scenario_folders)
        cross_val_data["avg_test_reward"] = avg_test_reward
        test_rewards.append((current_episode, avg_test_reward))
        print(f"cross-val done at episode {current_episode}")
        log_data['cross_validation'][current_episode] = cross_val_data
        return avg_test_reward

    while total_timesteps < MAX_TOTAL_TIMESTEPS:
        rewards[episode] = {}
        action_sequences[episode] = {}
        episode_data = {
            "episode_number": episode + 1,
            "epsilon_start": epsilon,
            "scenarios": {},
        }

        for scenario_folder in scenario_folders:
            scenario_data = {
                "scenario_folder": scenario_folder,
                "steps": [],
                "total_reward": 0,
            }
            rewards[episode][scenario_folder] = {}
            action_sequences[episode][scenario_folder] = []
            data_dict = load_scenario_data(scenario_folder)
            aircraft_dict = data_dict['aircraft']
            flights_dict = data_dict['flights']
            rotations_dict = data_dict['rotations']
            alt_aircraft_dict = data_dict['alt_aircraft']
            config_dict = data_dict['config']
            env = AircraftDisruptionEnv(
                aircraft_dict,
                flights_dict,
                rotations_dict,
                alt_aircraft_dict,
                config_dict,
                env_type=env_type
            )
            model.set_env(env)
            obs, _ = env.reset()
            done_flag = False
            timesteps = 0
            action_sequence = []

            while not done_flag:
                num_cancelled_flights_before_step = len(env.cancelled_flights)
                num_delayed_flights_before_step = len(env.environment_delayed_flights)
                num_penalized_delays_before_step = len(env.penalized_delays)
                num_penalized_cancelled_before_step = len(env.penalized_cancelled_flights)

                model.exploration_rate = epsilon
                action_mask = obs['action_mask']
                obs = {key: np.array(value, dtype=np.float32) for key, value in obs.items()}
                obs_tensor = model.policy.obs_to_tensor(obs)[0]
                q_values = model.policy.q_net(obs_tensor).detach().cpu().numpy().squeeze()
                masked_q_values = q_values.copy()
                masked_q_values[action_mask == 0] = -np.inf
                current_seed = int(time.time() * 1e9) % (2**32 - 1)
                np.random.seed(current_seed)

                if np.random.rand() < epsilon or brute_force_flag:
                    valid_actions = np.where(action_mask == 1)[0]
                    action = np.random.choice(valid_actions)
                    action_reason = "exploration"
                else:
                    action = np.argmax(masked_q_values)
                    action_reason = "exploitation"

                result = env.step(action)
                obs_next, reward, terminated, truncated, info = result
                rewards[episode][scenario_folder][timesteps] = reward
                action_sequences[episode][scenario_folder].append(action)
                done_flag = terminated or truncated
                model.replay_buffer.add(
                    obs=obs,
                    next_obs=obs_next,
                    action=action,
                    reward=reward,
                    done=done_flag,
                    infos=[info]
                )
                obs = obs_next
                epsilon = max(EPSILON_MIN, epsilon * (1 - EPSILON_DECAY_RATE))
                epsilon_values.append((episode + 1, epsilon))
                timesteps += 1
                total_timesteps += 1

                if total_timesteps > model.learning_starts and total_timesteps % TRAIN_FREQ == 0:
                    model.train(gradient_steps=1, batch_size=BATCH_SIZE)

                if total_timesteps % model.target_update_interval == 0:
                    polyak_update(model.q_net.parameters(), model.q_net_target.parameters(), model.tau)
                    polyak_update(model.batch_norm_stats, model.batch_norm_stats_target, 1.0)

                num_cancelled_flights_after_step = len(env.cancelled_flights)
                num_delayed_flights_after_step = len(env.environment_delayed_flights)
                num_penalized_delays_after_step = len(env.penalized_delays)
                num_penalized_cancelled_after_step = len(env.penalized_cancelled_flights)
                impact_of_action = {
                    "num_cancelled_flights": num_cancelled_flights_after_step - num_cancelled_flights_before_step,
                    "num_delayed_flights": num_delayed_flights_after_step - num_delayed_flights_before_step,
                    "num_penalized_delays": num_penalized_delays_after_step - num_penalized_delays_before_step,
                    "num_penalized_cancelled": num_penalized_cancelled_after_step - num_penalized_cancelled_before_step,
                }

                scenario_data["steps"].append({
                    "step_number": timesteps,
                    "action": action,
                    "flight_action": env.map_index_to_action(action)[0],
                    "aircraft_action": env.map_index_to_action(action)[1],
                    "reward": reward,
                    "total_timestep": total_timesteps,
                    "time_in_scenario": timesteps,
                    "epsilon": epsilon,
                    "action_reason": action_reason,
                    "impact_of_action": impact_of_action,
                    "done_flag": done_flag,
                    "action_mask_sum": np.sum(action_mask),
                    "len_action_mask": len(action_mask),
                    "info_after_step": env.info_after_step,
                    "masked_q_values": masked_q_values,
                    "q_values": q_values,
                    "action_mask": action_mask,
                })
                
                if done_flag:
                    break

            total_reward = sum(rewards[episode][scenario_folder].values())
            rewards[episode][scenario_folder]["total"] = total_reward
            action_sequences[episode][scenario_folder] = action_sequence
            scenario_data["total_reward"] = total_reward
            episode_data["scenarios"][scenario_folder] = scenario_data

            if total_timesteps >= MAX_TOTAL_TIMESTEPS:
                break

        if cross_val_flag and (episode + 1) % CROSS_VAL_INTERVAL == 0:
            current_test_reward = cross_validate_on_test_data(model, episode + 1, log_data)
            if not hasattr(train_dqn, 'best_test_reward'):
                train_dqn.best_test_reward = current_test_reward
            best_test_reward = train_dqn.best_test_reward

            if current_test_reward < best_test_reward:
                consecutive_drops += 1
                print(f"Performance drop {consecutive_drops}/5 (current: {current_test_reward:.2f}, best: {best_test_reward:.2f})")
                if consecutive_drops >= 500:
                    print(f"Early stopping triggered at episode {episode + 1} due to 5 consecutive drops in test performance")
                    break
            else:
                consecutive_drops = 0
                train_dqn.best_test_reward = current_test_reward
                best_test_reward = current_test_reward

        avg_reward_for_this_batch = 0
        for i in range(len(scenario_folders)):
            avg_reward_for_this_batch += rewards[episode][scenario_folders[i]]["total"]
        avg_reward_for_this_batch /= len(scenario_folders)

        rewards[episode]["avg_reward"] = avg_reward_for_this_batch
        rewards[episode]["total_timesteps"] = total_timesteps

        print(f"({total_timesteps}/{MAX_TOTAL_TIMESTEPS}) {env_type} - episode {episode + 1} - epsilon {epsilon:.2f} - reward this episode: {avg_reward_for_this_batch:.2f}")

        episode_data["avg_reward"] = avg_reward_for_this_batch
        log_data['episodes'][episode + 1] = episode_data
        episode += 1

        if total_timesteps >= MAX_TOTAL_TIMESTEPS:
            break

    if env_type == "myopic":
        model.save(f"../trained_models/dqn/myopic-{training_id}.zip")
    else:
        model.save(f"../trained_models/dqn/proactive-{training_id}.zip")

    runtime_end_in_seconds = time.time()
    runtime_in_seconds = runtime_end_in_seconds - runtime_start_in_seconds
    actual_total_timesteps = total_timesteps

    training_summary = {
        "runtime_seconds": runtime_in_seconds,
        "total_timesteps": total_timesteps,
        "final_rewards": good_rewards,
        "episodes_trained": episode + 1,
        "actual_total_timesteps": actual_total_timesteps,
    }
    log_data['training_summary'] = training_summary
    log_data['average_batch_episode_rewards'] = good_rewards
    log_data['test_rewards'] = test_rewards
    log_data['epsilon_values'] = epsilon_values
    log_data['action_sequences'] = action_sequences
    log_data['rewards'] = rewards

    log_file_path = os.path.join("../logs", "training", f"training_{training_id}.json")
    log_data = convert_to_serializable(log_data)
    with open(log_file_path, 'w') as log_file:
        json.dump(log_data, log_file, indent=4, cls=NumpyEncoder)

    finalize_training_log(training_id, training_summary, model_path)
    all_logs[training_id] = log_data

    # RETURN what was asked (rewards etc.)
    return rewards, test_rewards, total_timesteps, epsilon_values, good_rewards, action_sequences, model_path_and_name


# The code above is identical to the previously provided code except:
# 1) Renamed the training function to `train_dqn` for modular use.
# 2) Used `model.set_logger(logger)` instead of `model._logger = logger`.
# 3) At the end, `train_dqn` returns the rewards and other relevant data so it can be called from another file.


Training on 6ac-100-mixed-low
Calculated EPSILON_DECAY_RATE: 0.03883031004330459
EPSILON DECAY RATE:  0.03883031004330459
Using device: mps
CUDA available: False
Number of GPUs available: 0
cuDNN enabled: True
Device: mps
Using MacBook M1
Device info: {'device_type': 'MacBook M1'}
Training on 5000 days of data (50 episodes of 100 scenarios)
Results directory created at: ../results/dqn/20241208-16-13


SyntaxError: name 'TRAINING_FOLDERS_PATH' is parameter and global (2908802743.py, line 105)