In [41]:
import gymnasium as gym
from gymnasium import spaces
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from stable_baselines3 import PPO, A2C, DDPG, SAC, DQN
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize, VecMonitor
from stable_baselines3.common.vec_env.subproc_vec_env import SubprocVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import BaseCallback
import torch
import optuna
from IPython.display import display
import ipywidgets as widgets
from ipywidgets import IntProgress, HTML, VBox
import math
import os

In [None]:
# Select the compute device once: prefer a CUDA GPU, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
    print(f"CUDA GPU is available. Using the GPU for training and evaluation: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA GPU not found. Using the CPU for training and evaluation.")

# --- Environment for Continuous Action Spaces (PPO, A2C, DDPG, SAC) ---
# NOTE: This is the user-provided TEGEnvironment class which fixed the NaN problem,
# with updates to the `reset` and `step` methods to be compatible with Gym v0.26+.
class TEGEnvironment(gym.Env):
    """
    A custom Gym environment for a Thermoelectric Generator (TEG) system with a continuous action space.

    Action: ``Box(3,)`` with each entry in [0, 1] -- the shares of the available
    energy sent to battery charging, buffer storage and the load ("idle"), in
    that order. Shares that sum to more than 1 are rescaled to sum to 1.

    Observation: ``Box(7,)`` -- voltage, current, battery level, buffer level,
    temperature gradient, energy demand and battery health.
    """

    def __init__(self):
        super().__init__()

        # Action space: Charge rate, Store rate, Idle rate (continuous, proportions)
        self.action_space = spaces.Box(low=0, high=1, shape=(3,), dtype=np.float32)

        # State space: Voltage, Current, Battery Level, Buffer Level, Temp Gradient,
        # Energy Demand, Battery Health.
        # The bounds are created as float32 up front so that Box does not have to
        # down-cast integer arrays to the declared dtype.
        self.observation_space = spaces.Box(
            low=np.array([0, 0, 0, 0, 0, 0, 0], dtype=np.float32),
            high=np.array([10, 10, 100, 100, 100, 50, 100], dtype=np.float32),
            dtype=np.float32,
        )
        self.logs = []
        self.np_random = np.random.RandomState()
        self.max_steps = 1000  # Episodes are truncated after this many steps

    def reset(self, seed=None, options=None):
        """
        Start a new episode and return ``(observation, info)``.

        Args:
            seed: Optional seed. When given it reseeds both ``self.np_random`` and
                the process-wide ``random`` module, which drives all sampling here.
            options: Accepted for API compatibility; not used.

        Returns:
            The initial observation and an empty info dictionary.
        """
        if seed is not None:
            self.np_random.seed(seed)
            random.seed(seed)

        self.voltage = random.uniform(5, 7)
        self.current = self.voltage / 10
        self.battery_level = 50
        self.buffer_level = 10
        self.temperature_gradient = random.uniform(40, 60)
        self.energy_demand = 10
        self.battery_health = 100
        self.current_step = 0
        self.logs = []  # Logs only ever cover the current episode

        return self._get_observation(), {}

    def step(self, action):
        """
        Advance the simulation by one step.

        Args:
            action: Three shares (charge, store, idle). Any array-like is accepted;
                values are clipped to [0, 1] and rescaled if they sum to more than 1.

        Returns:
            ``(observation, reward, terminated, truncated, info)`` where ``reward``
            is a float, the two flags are plain bools and ``info`` is empty.
        """
        self.current_step += 1

        # Coerce to an array (so lists/tuples work) and enforce the declared
        # action bounds; in-range actions pass through unchanged.
        action = np.clip(np.asarray(action), 0.0, 1.0)

        # Normalize actions so that no more than 100% of the energy is allocated
        total_action = np.sum(action)
        if total_action > 1:
            action = action / total_action

        charge_action, store_action, idle_action = action
        available_energy = self.voltage * self.current

        # Loss factors and efficiencies
        charge_efficiency = 0.85
        store_efficiency = 0.80
        load_efficiency = 0.75
        system_losses = 0.1

        # Calculate energy allocations
        energy_to_charge = available_energy * charge_action * charge_efficiency
        energy_to_store = available_energy * store_action * store_efficiency
        delivered_energy = available_energy * idle_action * load_efficiency

        # Update battery and buffer levels
        self.battery_level += energy_to_charge
        self.battery_level = np.clip(self.battery_level, 0, 100)

        self.buffer_level += energy_to_store
        self.buffer_level = np.clip(self.buffer_level, 0, 100)

        # Charging wears the battery
        self.battery_health -= charge_action * 0.2
        self.battery_health = np.clip(self.battery_health, 0, 100)

        # Calculate efficiency (Carnot limit) with a fixed 5-unit hot/cold difference.
        # The gradient is clamped at 0 when it is resampled, so guard the division,
        # and cap the ratio at 1 for gradients smaller than that difference.
        thot = self.temperature_gradient
        tcold = thot - 5
        if thot > 0:
            carnot_efficiency = min((thot - tcold) / thot, 1.0)
        else:
            carnot_efficiency = 0.0
        net_efficiency = carnot_efficiency * load_efficiency * (1 - system_losses)

        # Reward: penalize unmet demand and charging above 95, reward efficiency
        unmet_demand = max(self.energy_demand - delivered_energy, 0)
        reward = -unmet_demand
        reward += net_efficiency * 10
        reward -= max(0, self.battery_level - 95)

        # Best achievable reward for this step: no unmet demand and no overcharge penalty
        max_possible_reward = 10 * net_efficiency

        # Regret is the gap between that best case and the actual reward
        regret = max_possible_reward - reward

        # Resample the exogenous variables for the next step.
        # NOTE(review): self.current is only set in reset(); it is not refreshed
        # when the voltage changes here -- confirm whether that is intended.
        self.voltage = max(5 + random.gauss(0, 0.5), 0)
        self.temperature_gradient = max(50 + random.gauss(0, 5), 0)
        self.energy_demand = max(10 + random.gauss(0, 2), 0)

        self._log_data(net_efficiency, reward, regret, delivered_energy)

        # Termination conditions. Nothing in this method discharges the battery,
        # so in practice only the battery-health condition can trigger.
        terminated = self.battery_health <= 0 or self.battery_level <= 0
        truncated = self.current_step >= self.max_steps

        # Convert NumPy scalars to builtin types at the API boundary
        return self._get_observation(), float(reward), bool(terminated), bool(truncated), {}

    def _get_observation(self):
        """Return the current state as a float32 vector in observation-space order."""
        return np.array([
            self.voltage, self.current, self.battery_level,
            self.buffer_level, self.temperature_gradient,
            self.energy_demand, self.battery_health
        ], dtype=np.float32)

    def _log_data(self, efficiency, reward, regret, delivered_energy):
        """Append one row describing the step that just finished to ``self.logs``.

        The state values recorded are read after the exogenous variables have
        been resampled for the next step.
        """
        log_entry = {
            "Thot": self.temperature_gradient,
            "Power": self.voltage * self.current,
            "Qhot": self.battery_level,
            "Qcold": self.buffer_level,
            "Efficiency": efficiency * 100,
            "Reward": reward,
            "Regret": regret,
            "Battery Health": self.battery_health,
            "Energy Demand": self.energy_demand,  # Logged so a fulfillment rate can be derived
            "Energy Demand Fulfilled": delivered_energy
        }
        self.logs.append(log_entry)

    def get_logs(self):
        """Return the logged steps of the current episode as a DataFrame."""
        return pd.DataFrame(self.logs)

# --- Environment for Discrete Action Space (DQN) ---
# Discrete-action counterpart of TEGEnvironment: same dynamics and reward, with three preset energy allocations.
class TEGDiscreteEnvironment(gym.Env):
    """
    A custom Gym environment for the TEG system with a discrete action space,
    designed to be compatible with DQN.

    Each of the three discrete actions selects a preset (charge, store, idle)
    allocation from ``self.action_map``.

    Observation: ``Box(7,)`` -- voltage, current, battery level, buffer level,
    temperature gradient, energy demand and battery health.
    """

    def __init__(self):
        super().__init__()
        # The bounds are created as float32 up front so that Box does not have to
        # down-cast integer arrays to the declared dtype.
        self.observation_space = spaces.Box(
            low=np.array([0, 0, 0, 0, 0, 0, 0], dtype=np.float32),
            high=np.array([10, 10, 100, 100, 100, 50, 100], dtype=np.float32),
            dtype=np.float32,
        )
        # Discrete action space: 0 = high charge, 1 = low charge, 2 = idle
        self.action_space = spaces.Discrete(3)
        self.logs = []
        self.np_random = np.random.RandomState()
        self.max_steps = 1000  # Episodes are truncated after this many steps

        # Map discrete actions to (charge, store, idle) shares.
        self.action_map = {
            0: [0.8, 0.1, 0.1],  # High charge rate
            1: [0.3, 0.4, 0.3],  # Low charge rate
            2: [0.1, 0.1, 0.8]   # Idle (send most of the energy to the load)
        }
        self.reset()

    def reset(self, seed=None, options=None):
        """
        Start a new episode and return ``(observation, info)``.

        Args:
            seed: Optional seed. When given it reseeds both ``self.np_random`` and
                the process-wide ``random`` module, which drives all sampling here.
            options: Accepted for API compatibility; not used.

        Returns:
            The initial observation and an empty info dictionary.
        """
        if seed is not None:
            self.np_random.seed(seed)
            random.seed(seed)

        self.voltage = random.uniform(5, 7)
        self.current = self.voltage / 10
        self.battery_level = 50
        self.buffer_level = 10
        self.temperature_gradient = random.uniform(40, 60)
        self.energy_demand = 10
        self.battery_health = 100
        self.current_step = 0
        self.logs = []  # Logs only ever cover the current episode

        return self._get_observation(), {}

    def step(self, action):
        """
        Advance the simulation by one step.

        Args:
            action: Index into ``self.action_map`` (0, 1 or 2). Python ints, NumPy
                integer scalars and single-element arrays are accepted.

        Returns:
            ``(observation, reward, terminated, truncated, info)`` where ``reward``
            is a float, the two flags are plain bools and ``info`` is empty.

        Raises:
            KeyError: If ``action`` is not a key of ``self.action_map``.
        """
        self.current_step += 1

        # Normalize the action to a plain int first: array-valued actions are not
        # hashable and could not be used as a dictionary key directly.
        action_index = int(np.asarray(action).item())
        charge_action, store_action, idle_action = self.action_map[action_index]
        available_energy = self.voltage * self.current

        # Loss factors and efficiencies
        charge_efficiency = 0.85
        store_efficiency = 0.80
        load_efficiency = 0.75
        system_losses = 0.1

        # Calculate energy allocations
        energy_to_charge = available_energy * charge_action * charge_efficiency
        energy_to_store = available_energy * store_action * store_efficiency
        delivered_energy = available_energy * idle_action * load_efficiency

        # Update battery and buffer levels
        self.battery_level += energy_to_charge
        self.battery_level = np.clip(self.battery_level, 0, 100)

        self.buffer_level += energy_to_store
        self.buffer_level = np.clip(self.buffer_level, 0, 100)

        # Charging wears the battery
        self.battery_health -= charge_action * 0.2
        self.battery_health = np.clip(self.battery_health, 0, 100)

        # Calculate efficiency (Carnot limit) with a fixed 5-unit hot/cold difference.
        # The gradient is clamped at 0 when it is resampled, so guard the division,
        # and cap the ratio at 1 for gradients smaller than that difference.
        thot = self.temperature_gradient
        tcold = thot - 5
        if thot > 0:
            carnot_efficiency = min((thot - tcold) / thot, 1.0)
        else:
            carnot_efficiency = 0.0
        net_efficiency = carnot_efficiency * load_efficiency * (1 - system_losses)

        # Reward: penalize unmet demand and charging above 95, reward efficiency
        unmet_demand = max(self.energy_demand - delivered_energy, 0)
        reward = -unmet_demand
        reward += net_efficiency * 10
        reward -= max(0, self.battery_level - 95)

        # Best achievable reward for this step: no unmet demand and no overcharge penalty
        max_possible_reward = 10 * net_efficiency

        # Regret is the gap between that best case and the actual reward
        regret = max_possible_reward - reward

        # Resample the exogenous variables for the next step.
        # NOTE(review): self.current is only set in reset(); it is not refreshed
        # when the voltage changes here -- confirm whether that is intended.
        self.voltage = max(5 + random.gauss(0, 0.5), 0)
        self.temperature_gradient = max(50 + random.gauss(0, 5), 0)
        self.energy_demand = max(10 + random.gauss(0, 2), 0)

        self._log_data(net_efficiency, reward, regret, delivered_energy)

        # Termination conditions. Nothing in this method discharges the battery,
        # so in practice only the battery-health condition can trigger.
        terminated = self.battery_health <= 0 or self.battery_level <= 0
        truncated = self.current_step >= self.max_steps

        # Convert NumPy scalars to builtin types at the API boundary
        return self._get_observation(), float(reward), bool(terminated), bool(truncated), {}

    def _get_observation(self):
        """Return the current state as a float32 vector in observation-space order."""
        return np.array([
            self.voltage, self.current, self.battery_level,
            self.buffer_level, self.temperature_gradient,
            self.energy_demand, self.battery_health
        ], dtype=np.float32)

    def _log_data(self, efficiency, reward, regret, delivered_energy):
        """Append one row describing the step that just finished to ``self.logs``.

        The state values recorded are read after the exogenous variables have
        been resampled for the next step.
        """
        log_entry = {
            "Thot": self.temperature_gradient,
            "Power": self.voltage * self.current,
            "Qhot": self.battery_level,
            "Qcold": self.buffer_level,
            "Efficiency": efficiency * 100,
            "Reward": reward,
            "Regret": regret,
            "Battery Health": self.battery_health,
            "Energy Demand": self.energy_demand,  # Logged so a fulfillment rate can be derived
            "Energy Demand Fulfilled": delivered_energy
        }
        self.logs.append(log_entry)

    def get_logs(self):
        """Return the logged steps of the current episode as a DataFrame."""
        return pd.DataFrame(self.logs)

CUDA GPU is available. Using the GPU for training and evaluation: NVIDIA GeForce RTX 5070


In [43]:
def calculate_metrics(df):
    """Summarize an evaluation log.

    Returns a dict with the mean of the "Reward", "Battery Health", "Efficiency"
    and "Regret" columns plus the overall energy fulfillment rate in percent.
    An entry is None when its column is missing; the fulfillment rate is also
    None when the total demand is not positive.
    """
    present = df.columns

    def mean_or_none(column):
        # Mean of a column, or None if the log does not contain it.
        if column in present:
            return df[column].mean()
        return None

    fulfillment_rate = None
    if "Energy Demand" in present and "Energy Demand Fulfilled" in present:
        if df["Energy Demand"].sum() > 0:
            delivered_total = df["Energy Demand Fulfilled"].sum()
            demanded_total = df["Energy Demand"].sum()
            fulfillment_rate = (delivered_total / demanded_total) * 100

    return {
        "Average Reward": mean_or_none("Reward"),
        "Average Battery Health": mean_or_none("Battery Health"),
        "Average Efficiency": mean_or_none("Efficiency"),
        "Average Regret": mean_or_none("Regret"),
        "Energy Fulfillment Rate": fulfillment_rate,
    }

def _finish_and_save_figure(title, xlabel, ylabel, out_file, grid=False, ylim_bottom=None):
    """Apply the shared title/axis/legend formatting to the current figure, save it and close it."""
    plt.title(title, fontsize=16)
    plt.xlabel(xlabel, fontsize=14)
    plt.ylabel(ylabel, fontsize=14)
    if ylim_bottom is not None:
        plt.ylim(ylim_bottom)
    plt.legend(fontsize=12)
    if grid:
        plt.grid(True, linestyle='--', alpha=0.6)
    plt.tight_layout()
    plt.savefig(out_file)
    plt.close()


def plot_results(df: pd.DataFrame, title_prefix: str, save_path: str, max_steps: int):
    """Generates a series of plots from evaluation logs and saves them.

    Args:
        df: Evaluation log with the columns "Efficiency", "Thot", "Power", "Qhot",
            "Qcold", "Reward", "Battery Health", "Regret", "Energy Demand" and
            "Energy Demand Fulfilled".
        title_prefix: Text appended in parentheses to every plot title.
        save_path: Directory that receives the PNG files; created if missing.
        max_steps: Number of leading rows used for the "single episode" plot.

    Raises:
        KeyError: If ``df`` lacks one of the required columns.
    """
    plt.style.use('seaborn-v0_8-whitegrid')

    # Make sure the target directory exists so savefig cannot fail on a missing path.
    # An empty save_path means "current directory" and needs no creation.
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    def out(filename):
        return os.path.join(save_path, filename)

    # Positional step axis shared by all per-step plots
    x = list(range(len(df)))

    # Plot 1: Efficiency Over Evaluation Steps
    plt.figure(figsize=(12, 8))
    plt.plot(x, df["Efficiency"], label="Efficiency (%)", color='b', linewidth=0.5)
    _finish_and_save_figure(
        f"Efficiency Over Evaluation Steps ({title_prefix})",
        "Steps", "Efficiency (%)", out("efficiency_plot.png"),
    )

    # Plot 2: Power vs. Temperature Gradient (scatter plot)
    plt.figure(figsize=(12, 8))
    plt.scatter(df["Thot"], df["Power"], label="Power (W)", c="r", alpha=0.7, s=50)
    _finish_and_save_figure(
        f"Power vs. Temperature Gradient ({title_prefix})",
        "Temperature Gradient (Thot)", "Power (W)", out("power_vs_temp_plot.png"),
    )

    # Plot 3: Battery and Buffer Levels Over Evaluation Steps
    plt.figure(figsize=(12, 8))
    plt.plot(x, df["Qhot"], label="Battery Level", color='g', linewidth=2)
    plt.plot(x, df["Qcold"], label="Buffer Level", color='orange', linewidth=2)
    _finish_and_save_figure(
        f"Battery and Buffer Levels Over Evaluation Steps ({title_prefix})",
        "Steps", "Energy Levels", out("battery_buffer_levels_plot.png"),
    )

    # Plot 4: Cumulative Reward Over Evaluation Steps
    cumulative_reward = df["Reward"].cumsum()  # computed once; same length as df
    plt.figure(figsize=(12, 8))
    plt.plot(x, cumulative_reward, label="Cumulative Reward", color='b', alpha=0.8, linewidth=2)
    _finish_and_save_figure(
        f"Cumulative Reward Over Evaluation Steps ({title_prefix})",
        "Steps", "Cumulative Reward", out("cumulative_reward_plot.png"),
    )

    # Plot 5: Battery Health Over Evaluation Steps
    plt.figure(figsize=(12, 8))
    plt.plot(x, df["Battery Health"], label="Battery Health", color='purple', linewidth=2)
    _finish_and_save_figure(
        f"Battery Health Over Evaluation Steps ({title_prefix})",
        "Steps", "Battery Health (%)", out("battery_health_plot.png"),
    )

    # Plot 6: Regret Over Evaluation Steps
    plt.figure(figsize=(12, 8))
    plt.plot(x, df["Regret"], label="Regret", color='r', linewidth=0.5)
    plt.fill_between(x, df["Regret"], color='red', alpha=0.4)
    _finish_and_save_figure(
        f"Regret Over Time ({title_prefix})",
        "Steps", "Regret", out("regret_over_time_plot.png"), grid=True,
    )

    # Plot 7: Energy Demand Fulfillment Rate Over Evaluation Steps.
    # Steps with zero demand produce inf/NaN; those count as 100% fulfilled.
    fulfillment_rate_per_step = (
        ((df["Energy Demand Fulfilled"] / df["Energy Demand"]) * 100)
        .replace([np.inf, -np.inf], np.nan)
        .fillna(100)
    )

    plt.figure(figsize=(12, 8))
    plt.plot(x, fulfillment_rate_per_step, color='cyan', label='Fulfillment Rate', linewidth=0.5)
    plt.fill_between(x, fulfillment_rate_per_step, color='cyan', alpha=0.4)
    _finish_and_save_figure(
        f"Energy Demand Fulfillment Rate Over Evaluation Steps ({title_prefix})",
        "Steps", "Fulfillment Rate (%)", out("demand_fulfillment_rate_plot.png"),
        grid=True, ylim_bottom=0,
    )

    # Plot 8: Battery Level vs. Time (Single Episode).
    # NOTE(review): this takes the first max_steps rows; it presumably assumes the
    # first episode ran for the full max_steps -- confirm against the caller.
    single_episode_df = df.iloc[:max_steps]
    episode_steps = list(range(len(single_episode_df)))  # positional, like the other plots
    plt.figure(figsize=(12, 8))
    plt.plot(episode_steps, single_episode_df["Qhot"], label="Battery Level", color='green', linewidth=2)
    _finish_and_save_figure(
        f"Battery Level Over a Single Episode ({title_prefix})",
        "Steps", "Battery Level", out("single_episode_battery_level_plot.png"),
    )

In [44]:
def _save_comparison_figure(title, xlabel, ylabel, out_file, ylim_bottom=None):
    """Apply title/axis/legend formatting to the current comparison figure, save it and close it."""
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    if ylim_bottom is not None:
        plt.ylim(ylim_bottom)
    plt.legend()
    plt.tight_layout()
    plt.savefig(out_file)
    plt.close()


def plot_comparison(df_a: pd.DataFrame, df_b: pd.DataFrame, labels: tuple, save_path: str, max_steps: int):
    """
    Create comparison plots between two runs (e.g., Optimized vs Benchmark).

    Both logs are truncated to the length of the shorter one. Each plot is
    produced only when both logs contain the columns it needs, so a missing
    column skips that plot instead of raising.

    Args:
        df_a: Evaluation log of the first run.
        df_b: Evaluation log of the second run.
        labels: (label_a, label_b) used in legends and titles.
        save_path: Directory that receives the PNGs (prefix 'comparison_');
            created if missing.
        max_steps: Not used by this function; kept so existing callers keep working.
    """
    plt.style.use('seaborn-v0_8-whitegrid')

    # Align lengths for fair comparison
    n = min(len(df_a), len(df_b))
    if n == 0:
        return
    A = df_a.iloc[:n].reset_index(drop=True)
    B = df_b.iloc[:n].reset_index(drop=True)
    x = list(range(n))

    la, lb = labels

    # Make sure the target directory exists so savefig cannot fail on a missing path.
    # An empty save_path means "current directory" and needs no creation.
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    def out(filename):
        return os.path.join(save_path, filename)

    def both_have(*columns):
        return all(col in frame.columns for frame in (A, B) for col in columns)

    def fulfillment_rate(frame):
        # Steps with zero demand produce inf/NaN; those count as 100% fulfilled.
        rate = (frame["Energy Demand Fulfilled"] / frame["Energy Demand"]) * 100
        return rate.replace([np.inf, -np.inf], np.nan).fillna(100)

    # 1) Efficiency
    if both_have("Efficiency"):
        plt.figure(figsize=(12, 8))
        plt.plot(x, A["Efficiency"], label=f"{la} Efficiency (%)", color='tab:blue', linewidth=0.8)
        plt.plot(x, B["Efficiency"], label=f"{lb} Efficiency (%)", color='tab:orange', linewidth=0.8)
        _save_comparison_figure(
            f"Efficiency Comparison: {la} vs {lb}",
            "Steps", "Efficiency (%)", out("comparison_efficiency.png"),
        )

    # 2) Cumulative Reward
    if both_have("Reward"):
        plt.figure(figsize=(12, 8))
        plt.plot(x, A["Reward"].cumsum(), label=f"{la} Cumulative Reward", color='tab:blue', linewidth=1.5)
        plt.plot(x, B["Reward"].cumsum(), label=f"{lb} Cumulative Reward", color='tab:orange', linewidth=1.5)
        _save_comparison_figure(
            f"Cumulative Reward: {la} vs {lb}",
            "Steps", "Cumulative Reward", out("comparison_cumulative_reward.png"),
        )

    # 3) Battery Health
    if both_have("Battery Health"):
        plt.figure(figsize=(12, 8))
        plt.plot(x, A["Battery Health"], label=f"{la} Battery Health", color='tab:green', linewidth=1.5)
        plt.plot(x, B["Battery Health"], label=f"{lb} Battery Health", color='tab:red', linewidth=1.5)
        _save_comparison_figure(
            f"Battery Health: {la} vs {lb}",
            "Steps", "Battery Health (%)", out("comparison_battery_health.png"),
        )

    # 4) Demand Fulfillment Rate per step
    if both_have("Energy Demand", "Energy Demand Fulfilled"):
        fr_a = fulfillment_rate(A)
        fr_b = fulfillment_rate(B)

        plt.figure(figsize=(12, 8))
        plt.plot(x, fr_a, label=f"{la} Fulfillment Rate", color='tab:cyan', linewidth=0.8)
        plt.plot(x, fr_b, label=f"{lb} Fulfillment Rate", color='tab:pink', linewidth=0.8)
        _save_comparison_figure(
            f"Energy Demand Fulfillment Rate: {la} vs {lb}",
            "Steps", "Fulfillment Rate (%)", out("comparison_fulfillment_rate.png"),
            ylim_bottom=0,
        )

    # 5) Regret
    if both_have("Regret"):
        plt.figure(figsize=(12, 8))
        plt.plot(x, A["Regret"], label=f"{la} Regret", color='tab:purple', linewidth=0.6)
        plt.plot(x, B["Regret"], label=f"{lb} Regret", color='tab:brown', linewidth=0.6)
        _save_comparison_figure(
            f"Regret: {la} vs {lb}",
            "Steps", "Regret", out("comparison_regret.png"),
        )


In [45]:
def _ppo_search_space(trial):
    """Suggest PPO hyperparameters from the given trial."""
    # Fixed-value reference left in the original (not used):
    # 'learning_rate': 0.0003, 'n_steps': 2048, 'batch_size': 64, 'ent_coef': 0.01
    return {
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True),
        'n_steps': trial.suggest_categorical('n_steps', [512, 1024, 2048, 4096]),
        'batch_size': trial.suggest_categorical('batch_size', [64, 128, 256, 512]),
        'gamma': trial.suggest_float('gamma', 0.9, 0.999),
        'gae_lambda': trial.suggest_float('gae_lambda', 0.9, 0.99),
        'clip_range': trial.suggest_float('clip_range', 0.1, 0.4),
        'n_epochs': trial.suggest_int('n_epochs', 5, 20),
    }


def _a2c_search_space(trial):
    """Suggest A2C hyperparameters from the given trial."""
    return {
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True),
        'n_steps': trial.suggest_int('n_steps', 5, 50),
        'gamma': trial.suggest_float('gamma', 0.9, 0.999),
        'gae_lambda': trial.suggest_float('gae_lambda', 0.9, 1.0),
        'ent_coef': trial.suggest_float('ent_coef', 1e-8, 1e-1, log=True),
        'vf_coef': trial.suggest_float('vf_coef', 0.1, 1.0),
    }


def _ddpg_search_space(trial):
    """Suggest DDPG hyperparameters from the given trial."""
    return {
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True),
        'buffer_size': trial.suggest_int('buffer_size', 10000, 100000),
        'learning_starts': trial.suggest_int('learning_starts', 100, 1000),
        'tau': trial.suggest_float('tau', 0.001, 0.01),
        'gamma': trial.suggest_float('gamma', 0.9, 0.999),
    }


def _sac_search_space(trial):
    """Suggest SAC hyperparameters from the given trial."""
    return {
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True),
        'buffer_size': trial.suggest_int('buffer_size', 10000, 100000),
        'learning_starts': trial.suggest_int('learning_starts', 100, 1000),
        'gamma': trial.suggest_float('gamma', 0.9, 0.999),
        'tau': trial.suggest_float('tau', 0.001, 0.01),
        'ent_coef': trial.suggest_float('ent_coef', 1e-8, 1e-1, log=True),
    }


def _dqn_search_space(trial):
    """Suggest DQN hyperparameters from the given trial."""
    return {
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True),
        'buffer_size': trial.suggest_int('buffer_size', 10000, 100000),
        'learning_starts': trial.suggest_int('learning_starts', 100, 1000),
        'gamma': trial.suggest_float('gamma', 0.9, 0.999),
        'exploration_fraction': trial.suggest_float('exploration_fraction', 0.1, 0.5),
        'exploration_final_eps': trial.suggest_float('exploration_final_eps', 0.01, 0.1),
        'train_freq': trial.suggest_int('train_freq', 1, 10),
        'target_update_interval': trial.suggest_int('target_update_interval', 100, 1000),
    }


# Maps an algorithm name to a callable that takes an Optuna trial and returns
# the keyword arguments suggested for that algorithm's constructor.
HYPERPARAMETER_SPACES = {
    "PPO": _ppo_search_space,
    "A2C": _a2c_search_space,
    "DDPG": _ddpg_search_space,
    "SAC": _sac_search_space,
    "DQN": _dqn_search_space,
}

def objective(trial: optuna.Trial, model_name: str, timesteps: int) -> float:
    """
    Defines the objective function for Optuna to optimize a given RL model.
    It suggests hyperparameters, trains a model, and returns its average reward.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        model_name: Key of ``HYPERPARAMETER_SPACES`` and name of the algorithm
            class in this module's globals (e.g. "PPO").
        timesteps: Number of training timesteps for this trial.

    Returns:
        Mean reward over 10 evaluation episodes, or ``-inf`` when the sampled
        PPO batch size does not divide ``n_steps`` or when training/evaluation
        raises.

    Raises:
        KeyError: If ``model_name`` has no search space or no matching class.
    """
    # DQN needs the discrete-action environment; every other algorithm uses the continuous one.
    env_class = TEGDiscreteEnvironment if model_name == "DQN" else TEGEnvironment
    policy_name = "MlpPolicy"

    hyperparams = HYPERPARAMETER_SPACES[model_name](trial)

    # PPO-specific check to ensure batch_size is a factor of n_steps
    if model_name == "PPO" and hyperparams['n_steps'] % hyperparams['batch_size'] != 0:
        return -np.inf  # Score the invalid combination as the worst possible value

    # The algorithm class is looked up by name among the module-level imports.
    model_class = globals()[model_name]

    env = None
    eval_env = None
    try:
        env = make_vec_env(env_class, n_envs=1)

        model = model_class(
            policy_name,
            env,
            **hyperparams,
            verbose=0,
            device=device
        )

        model.learn(total_timesteps=timesteps)

        eval_env = make_vec_env(env_class, n_envs=1)
        mean_reward, _ = evaluate_policy(model, eval_env, n_eval_episodes=10)

    except Exception as e:
        # Deliberate best-effort: a failing trial is reported and scored as the
        # worst value so the study can continue with the next trial.
        print(f"Trial for {model_name} failed with error: {e}")
        return -np.inf
    finally:
        # Release both vectorized environments on success and on failure so
        # repeated trials do not accumulate open environments.
        for vec_env in (env, eval_env):
            if vec_env is not None:
                vec_env.close()

    return float(mean_reward)

In [46]:
class OptunaProgressCallback:
    """Optuna study callback that mirrors trial progress onto a progress bar and a text widget."""

    def __init__(self, progress_bar, desc_widget, total_trials):
        self.progress_bar = progress_bar
        self.desc_widget = desc_widget
        self.total_trials = total_trials

    def __call__(self, study, trial):
        # Trial numbers are zero-based, so the displayed count is number + 1.
        finished = trial.number + 1
        # The label shown is the part of the study name before the first underscore.
        model_label = study.study_name.partition("_")[0]

        self.progress_bar.value = finished
        self.desc_widget.value = f'Tuning model: <b>{model_label}</b>, Trial {finished}/{self.total_trials}'

class TrainingProgressCallback(BaseCallback):
    """
    Stable-Baselines3 callback that keeps a progress bar in step with training.

    The bar's maximum is assigned in ``_on_training_start`` because that is
    where ``total_timesteps`` can be read from ``self.locals``.
    """

    def __init__(self, progress_bar, verbose=0):
        super().__init__(verbose)
        self.progress_bar = progress_bar

    def _sync_bar(self) -> None:
        """Copy the current timestep counter onto the progress bar."""
        self.progress_bar.value = self.num_timesteps

    def _on_training_start(self) -> None:
        """Size the bar to the training budget, then show the starting position."""
        self.progress_bar.max = self.locals['total_timesteps']
        self._sync_bar()

    def _on_step(self) -> bool:
        """Refresh the bar on every step; always returns True."""
        self._sync_bar()
        return True

In [47]:
def _env_and_policy_for(model_name):
    """Return the ``(environment class, policy name)`` pair used for ``model_name``."""
    # Only DQN uses the discrete-action environment; the policy is the same for all.
    env_class = TEGDiscreteEnvironment if model_name == "DQN" else TEGEnvironment
    return env_class, "MlpPolicy"


def _show_progress(maximum, bar_label, html_text):
    """Build, display and return an ``(HTML description, IntProgress bar)`` pair."""
    bar = IntProgress(min=0, max=maximum, description=bar_label)
    description = HTML(html_text)
    display(VBox([description, bar]))
    return description, bar


def _first_obs(reset_output):
    """Return the observation from a ``reset()`` result, with or without an info dict."""
    if isinstance(reset_output, tuple):
        obs, _info = reset_output
        return obs
    return reset_output


def _collect_rollout_logs(model, vec_env, n_steps):
    """
    Run ``model`` deterministically on ``vec_env`` for ``n_steps`` steps and
    return one log row per step.

    After every step the most recent row of ``vec_env.envs[0].env.get_logs()``
    is collected (skipped when that frame is empty). Both 4-tuple and 5-tuple
    ``step`` results are accepted, and the environment is reset whenever an
    episode ends.
    """
    obs = _first_obs(vec_env.reset())
    rows = []
    for _ in range(n_steps):
        action, _states = model.predict(obs, deterministic=True)
        step_output = vec_env.step(action)

        if len(step_output) == 5:
            obs, _reward, terminated, truncated, _info = step_output
        else:
            obs, _reward, done, _info = step_output
            terminated, truncated = done, False

        step_logs = vec_env.envs[0].env.get_logs()
        if not step_logs.empty:
            rows.append(step_logs.iloc[-1])

        if terminated or truncated:
            obs = _first_obs(vec_env.reset())
    return rows


def _print_metrics(metrics):
    """Print every metric that has a value, formatted to four decimals."""
    for key, value in metrics.items():
        if value is not None:
            print(f"{key}: {value:.4f}")


if __name__ == "__main__":
    # Local import: only this script section needs it.
    from contextlib import closing

    models_to_tune = ['PPO', 'A2C', 'DDPG', 'SAC', 'DQN']
    n_trials = 50
    timesteps_per_trial = 10000
    # Used both as the training budget of the final/benchmark models and as
    # the number of evaluation steps.
    evaluation_timesteps = 10000

    # Benchmark runs pin only the learning rate; every other hyperparameter is
    # left at the library default.
    benchmark_learning_rates = {
        "PPO": 3e-4,
        "A2C": 7e-4,
        "DDPG": 1e-3,
        "SAC": 3e-4,
        "DQN": 1e-3,
    }

    # Create the main output directory
    output_dir = "Output"
    os.makedirs(output_dir, exist_ok=True)

    # --- Setup Overall Progress Bar ---
    # The denominator follows the model list instead of a hard-coded 5.
    overall_description, overall_progress = _show_progress(
        len(models_to_tune),
        'Overall Progress:',
        f'Overall Progress: <b>0</b>/{len(models_to_tune)} models completed.',
    )

    for i, model_name in enumerate(models_to_tune):
        # Per-model output directory, plus a sub-directory for benchmark outputs.
        model_output_dir = os.path.join(output_dir, model_name)
        os.makedirs(model_output_dir, exist_ok=True)
        benchmark_output_dir = os.path.join(model_output_dir, "Benchmark")
        os.makedirs(benchmark_output_dir, exist_ok=True)

        overall_description.value = f'Overall Progress: Currently processing <b>{model_name}</b> ({i+1}/{len(models_to_tune)})'

        # -------------------- HYPERPARAMETER SEARCH --------------------
        print(f"--- Starting Optuna optimization for {model_name} with {n_trials} trials ---")

        optuna_description, optuna_progress = _show_progress(
            n_trials,
            f'Tuning {model_name}:',
            f'Tuning model: <b>{model_name}</b>, Trial 0/{n_trials}',
        )

        study = optuna.create_study(
            direction='maximize',
            study_name=f'{model_name}_optimization'
        )
        # Bind the current model name explicitly so the closure does not depend
        # on the loop variable.
        study.optimize(
            lambda trial, name=model_name: objective(trial, name, timesteps_per_trial),
            n_trials=n_trials,
            callbacks=[OptunaProgressCallback(optuna_progress, optuna_description, n_trials)],
        )

        optuna_progress.value = n_trials
        optuna_description.value = f'Tuning model: <b>{model_name}</b>, Trial {n_trials}/{n_trials} - Complete!'

        print(f"\n--- {model_name} Optimization Finished ---")
        print("Number of finished trials: ", len(study.trials))
        best_trial = study.best_trial
        best_trial_params = best_trial.params

        print("Best trial:")
        print("  Value (Average Reward): ", best_trial.value)
        print("  Params: ")
        for key, value in best_trial_params.items():
            print(f"    {key}: {value}")

        print("\n" + "="*50 + "\n")

        # -------------------- FINAL TRAINING (best hyperparameters) --------------------
        print(f"--- Training the final {model_name} model with best hyperparameters ---")

        training_description, training_progress = _show_progress(
            evaluation_timesteps,
            f'Training {model_name}:',
            f'Training final model: <b>{model_name}</b>',
        )

        env_class, policy_name = _env_and_policy_for(model_name)
        model_class = globals()[model_name]

        # closing() releases the environment even if training raises.
        with closing(make_vec_env(lambda: env_class(), n_envs=1)) as final_env:
            final_model = model_class(
                policy_name,
                final_env,
                **best_trial_params,
                verbose=0,
                device=device
            )
            final_model.learn(
                total_timesteps=evaluation_timesteps,
                callback=TrainingProgressCallback(training_progress),
            )

        training_progress.value = evaluation_timesteps
        training_description.value = f'Training final model: <b>{model_name}</b> - Complete!'

        # Save the best model
        model_save_path = os.path.join(model_output_dir, "best_model.pt")
        final_model.save(model_save_path)
        print(f"Best model saved to: {model_save_path}")

        print(f"\n--- Evaluating the final {model_name} model ---")
        df_final_opt = None
        opt_max_steps = None
        with closing(make_vec_env(lambda: env_class(), n_envs=1)) as eval_env_final:
            all_logs_opt = _collect_rollout_logs(final_model, eval_env_final, evaluation_timesteps)

            if all_logs_opt:
                df_final_opt = pd.DataFrame(all_logs_opt)

                # Save the final logs to a CSV file
                logs_save_path = os.path.join(model_output_dir, "evaluation_logs.csv")
                df_final_opt.to_csv(logs_save_path, index=False)
                print(f"Evaluation logs saved to: {logs_save_path}")

                metrics_final = calculate_metrics(df_final_opt)

                print(f"\nFinal {model_name} Metrics (Optimized):")
                _print_metrics(metrics_final)

                # Save the plots; max_steps is kept for the comparison plot below.
                opt_max_steps = eval_env_final.envs[0].env.max_steps
                plot_results(df_final_opt, f"Final Optimized {model_name}", model_output_dir, opt_max_steps)
                print(f"Evaluation plots saved to: {model_output_dir}")
            else:
                print(f"No logs were collected for {model_name} optimized model due to early termination.")

        # -------------------- BENCHMARK TRAINING (Minimal/No tuning) --------------------
        print(f"\n--- Training benchmark {model_name} model (default/minimal hyperparameters) ---")
        bench_training_description, bench_training_progress = _show_progress(
            evaluation_timesteps,
            f'Benchmark {model_name}:',
            f'Training benchmark model: <b>{model_name}</b>',
        )

        # Models without an entry in the table get no explicit hyperparameters.
        benchmark_lr = benchmark_learning_rates.get(model_name)
        benchmark_params = {} if benchmark_lr is None else {"learning_rate": benchmark_lr}

        with closing(make_vec_env(lambda: env_class(), n_envs=1)) as bench_env:
            bench_model = model_class(
                policy_name,
                bench_env,
                **benchmark_params,
                verbose=0,
                device=device
            )
            bench_model.learn(
                total_timesteps=evaluation_timesteps,
                callback=TrainingProgressCallback(bench_training_progress),
            )

        bench_training_progress.value = evaluation_timesteps
        bench_training_description.value = f'Training benchmark model: <b>{model_name}</b> - Complete!'

        # Save the benchmark model
        bench_model_path = os.path.join(benchmark_output_dir, "benchmark_model.pt")
        bench_model.save(bench_model_path)
        print(f"Benchmark model saved to: {bench_model_path}")

        # Evaluate benchmark
        print(f"\n--- Evaluating benchmark {model_name} model ---")
        df_final_bench = None
        with closing(make_vec_env(lambda: env_class(), n_envs=1)) as bench_eval_env:
            all_logs_bench = _collect_rollout_logs(bench_model, bench_eval_env, evaluation_timesteps)

            if all_logs_bench:
                df_final_bench = pd.DataFrame(all_logs_bench)
                bench_logs_path = os.path.join(benchmark_output_dir, "evaluation_logs.csv")
                df_final_bench.to_csv(bench_logs_path, index=False)
                print(f"Benchmark evaluation logs saved to: {bench_logs_path}")

                metrics_bench = calculate_metrics(df_final_bench)
                print(f"\nBenchmark {model_name} Metrics:")
                _print_metrics(metrics_bench)

                # Individual plots for benchmark
                plot_results(df_final_bench, f"Benchmark {model_name}", benchmark_output_dir, bench_eval_env.envs[0].env.max_steps)
                print(f"Benchmark plots saved to: {benchmark_output_dir}")
            else:
                print(f"No logs were collected for {model_name} benchmark due to early termination.")

        # -------------------- COMPARISON PLOTS --------------------
        # Only possible when both evaluations produced logs.
        if df_final_opt is not None and df_final_bench is not None:
            plot_comparison(
                df_final_opt,
                df_final_bench,
                labels=("Optimized", "Benchmark"),
                save_path=model_output_dir,
                max_steps=opt_max_steps
            )
            print(f"Comparison plots saved to: {model_output_dir}")

        print("\n" + "="*50 + "\n")

        overall_progress.value += 1
        overall_description.value = f'Overall Progress: Finished <b>{model_name}</b> ({i+1}/{len(models_to_tune)})'

    overall_description.value = f'Overall Progress: All {len(models_to_tune)} models completed!'
    print("\n--- All models have been processed. The script has completed. ---")


VBox(children=(HTML(value='Overall Progress: <b>0</b>/5 models completed.'), IntProgress(value=0, description=…

--- Starting Optuna optimization for PPO with 50 trials ---


VBox(children=(HTML(value='Tuning model: <b>PPO</b>, Trial 0/50'), IntProgress(value=0, description='Tuning PP…

[I 2025-08-24 16:32:30,867] A new study created in memory with name: PPO_optimization
[I 2025-08-24 16:32:34,340] Trial 0 finished with value: -8775.2744544 and parameters: {'learning_rate': 0.00018780640151821616, 'n_steps': 512, 'batch_size': 512, 'gamma': 0.961548578153606, 'gae_lambda': 0.9229898051917795, 'clip_range': 0.3129870206300064, 'n_epochs': 7}. Best is trial 0 with value: -8775.2744544.
[I 2025-08-24 16:32:34,340] Trial 0 finished with value: -8775.2744544 and parameters: {'learning_rate': 0.00018780640151821616, 'n_steps': 512, 'batch_size': 512, 'gamma': 0.961548578153606, 'gae_lambda': 0.9229898051917795, 'clip_range': 0.3129870206300064, 'n_epochs': 7}. Best is trial 0 with value: -8775.2744544.
[I 2025-08-24 16:32:39,842] Trial 1 finished with value: -7702.655635100001 and parameters: {'learning_rate': 3.3161307798138624e-05, 'n_steps': 512, 'batch_size': 128, 'gamma': 0.9062958286814602, 'gae_lambda': 0.9047211082790305, 'clip_range': 0.1365543407930414, 'n_epochs'


--- PPO Optimization Finished ---
Number of finished trials:  50
Best trial:
  Value (Average Reward):  -7144.3330147
  Params: 
    learning_rate: 0.00018116731484174651
    n_steps: 512
    batch_size: 256
    gamma: 0.9520924671533344
    gae_lambda: 0.9117408365224235
    clip_range: 0.2297248521447981
    n_epochs: 16


--- Training the final PPO model with best hyperparameters ---


VBox(children=(HTML(value='Training final model: <b>PPO</b>'), IntProgress(value=0, description='Training PPO:…

Best model saved to: Output\PPO\best_model.pt

--- Evaluating the final PPO model ---
Evaluation logs saved to: Output\PPO\evaluation_logs.csv

Final PPO Metrics (Optimized):
Average Reward: -6.9684
Average Battery Health: 100.0000
Average Efficiency: 6.8184
Average Regret: 7.6503
Energy Fulfillment Rate: 23.3629
Evaluation logs saved to: Output\PPO\evaluation_logs.csv

Final PPO Metrics (Optimized):
Average Reward: -6.9684
Average Battery Health: 100.0000
Average Efficiency: 6.8184
Average Regret: 7.6503
Energy Fulfillment Rate: 23.3629
Evaluation plots saved to: Output\PPO

--- Training benchmark PPO model (default/minimal hyperparameters) ---
Evaluation plots saved to: Output\PPO

--- Training benchmark PPO model (default/minimal hyperparameters) ---


VBox(children=(HTML(value='Training benchmark model: <b>PPO</b>'), IntProgress(value=0, description='Benchmark…

Benchmark model saved to: Output\PPO\Benchmark\benchmark_model.pt

--- Evaluating benchmark PPO model ---
Benchmark evaluation logs saved to: Output\PPO\Benchmark\evaluation_logs.csv

Benchmark PPO Metrics:
Average Reward: -9.1960
Average Battery Health: 100.0000
Average Efficiency: 6.8129
Average Regret: 9.8772
Energy Fulfillment Rate: 1.2607
Benchmark evaluation logs saved to: Output\PPO\Benchmark\evaluation_logs.csv

Benchmark PPO Metrics:
Average Reward: -9.1960
Average Battery Health: 100.0000
Average Efficiency: 6.8129
Average Regret: 9.8772
Energy Fulfillment Rate: 1.2607
Benchmark plots saved to: Output\PPO\Benchmark
Benchmark plots saved to: Output\PPO\Benchmark
Comparison plots saved to: Output\PPO


--- Starting Optuna optimization for A2C with 50 trials ---
Comparison plots saved to: Output\PPO


--- Starting Optuna optimization for A2C with 50 trials ---


VBox(children=(HTML(value='Tuning model: <b>A2C</b>, Trial 0/50'), IntProgress(value=0, description='Tuning A2…

[I 2025-08-24 16:38:24,676] A new study created in memory with name: A2C_optimization
[I 2025-08-24 16:38:28,593] Trial 0 finished with value: -8000.9811248 and parameters: {'learning_rate': 0.0005502796309555912, 'n_steps': 12, 'gamma': 0.9680965167075559, 'gae_lambda': 0.9796744456781813, 'ent_coef': 0.0030742775604013315, 'vf_coef': 0.8494598761652397}. Best is trial 0 with value: -8000.9811248.
[I 2025-08-24 16:38:28,593] Trial 0 finished with value: -8000.9811248 and parameters: {'learning_rate': 0.0005502796309555912, 'n_steps': 12, 'gamma': 0.9680965167075559, 'gae_lambda': 0.9796744456781813, 'ent_coef': 0.0030742775604013315, 'vf_coef': 0.8494598761652397}. Best is trial 0 with value: -8000.9811248.
[I 2025-08-24 16:38:32,174] Trial 1 finished with value: -9299.6461042 and parameters: {'learning_rate': 3.0235189977341324e-05, 'n_steps': 36, 'gamma': 0.9526679748116379, 'gae_lambda': 0.9397953897667154, 'ent_coef': 4.753471090731842e-08, 'vf_coef': 0.7435220119585029}. Best is 


--- A2C Optimization Finished ---
Number of finished trials:  50
Best trial:
  Value (Average Reward):  -7046.229037200001
  Params: 
    learning_rate: 0.0005065908851927953
    n_steps: 5
    gamma: 0.9443363738868064
    gae_lambda: 0.9519178437190264
    ent_coef: 0.0011360833442974156
    vf_coef: 0.991431110170794


--- Training the final A2C model with best hyperparameters ---


VBox(children=(HTML(value='Training final model: <b>A2C</b>'), IntProgress(value=0, description='Training A2C:…

Best model saved to: Output\A2C\best_model.pt

--- Evaluating the final A2C model ---
Evaluation logs saved to: Output\A2C\evaluation_logs.csv

Final A2C Metrics (Optimized):
Average Reward: -13.9072
Average Battery Health: 52.9862
Average Efficiency: 6.8221
Average Regret: 14.5895
Energy Fulfillment Rate: 0.8311
Evaluation logs saved to: Output\A2C\evaluation_logs.csv

Final A2C Metrics (Optimized):
Average Reward: -13.9072
Average Battery Health: 52.9862
Average Efficiency: 6.8221
Average Regret: 14.5895
Energy Fulfillment Rate: 0.8311
Evaluation plots saved to: Output\A2C

--- Training benchmark A2C model (default/minimal hyperparameters) ---
Evaluation plots saved to: Output\A2C

--- Training benchmark A2C model (default/minimal hyperparameters) ---


VBox(children=(HTML(value='Training benchmark model: <b>A2C</b>'), IntProgress(value=0, description='Benchmark…

Benchmark model saved to: Output\A2C\Benchmark\benchmark_model.pt

--- Evaluating benchmark A2C model ---
Benchmark evaluation logs saved to: Output\A2C\Benchmark\evaluation_logs.csv

Benchmark A2C Metrics:
Average Reward: -12.8895
Average Battery Health: 53.3324
Average Efficiency: 6.8192
Average Regret: 13.5714
Energy Fulfillment Rate: 12.6342
Benchmark evaluation logs saved to: Output\A2C\Benchmark\evaluation_logs.csv

Benchmark A2C Metrics:
Average Reward: -12.8895
Average Battery Health: 53.3324
Average Efficiency: 6.8192
Average Regret: 13.5714
Energy Fulfillment Rate: 12.6342
Benchmark plots saved to: Output\A2C\Benchmark
Benchmark plots saved to: Output\A2C\Benchmark
Comparison plots saved to: Output\A2C


--- Starting Optuna optimization for DDPG with 50 trials ---
Comparison plots saved to: Output\A2C


--- Starting Optuna optimization for DDPG with 50 trials ---


VBox(children=(HTML(value='Tuning model: <b>DDPG</b>, Trial 0/50'), IntProgress(value=0, description='Tuning D…

[I 2025-08-24 16:42:49,132] A new study created in memory with name: DDPG_optimization
[I 2025-08-24 16:43:28,883] Trial 0 finished with value: -9302.888165400002 and parameters: {'learning_rate': 0.0003103129516559887, 'buffer_size': 62157, 'learning_starts': 907, 'tau': 0.0095556458995609, 'gamma': 0.9863940826773119}. Best is trial 0 with value: -9302.888165400002.
[I 2025-08-24 16:43:28,883] Trial 0 finished with value: -9302.888165400002 and parameters: {'learning_rate': 0.0003103129516559887, 'buffer_size': 62157, 'learning_starts': 907, 'tau': 0.0095556458995609, 'gamma': 0.9863940826773119}. Best is trial 0 with value: -9302.888165400002.
[I 2025-08-24 16:44:10,536] Trial 1 finished with value: -7088.7151501 and parameters: {'learning_rate': 1.3166186289350476e-05, 'buffer_size': 73162, 'learning_starts': 398, 'tau': 0.008352904354376574, 'gamma': 0.9776869575032219}. Best is trial 1 with value: -7088.7151501.
[I 2025-08-24 16:44:10,536] Trial 1 finished with value: -7088.71515


--- DDPG Optimization Finished ---
Number of finished trials:  50
Best trial:
  Value (Average Reward):  -6926.8205674
  Params: 
    learning_rate: 2.2533924431059282e-05
    buffer_size: 83777
    learning_starts: 785
    tau: 0.002470288179732512
    gamma: 0.9554997183218932


--- Training the final DDPG model with best hyperparameters ---


VBox(children=(HTML(value='Training final model: <b>DDPG</b>'), IntProgress(value=0, description='Training DDP…

Best model saved to: Output\DDPG\best_model.pt

--- Evaluating the final DDPG model ---
Evaluation logs saved to: Output\DDPG\evaluation_logs.csv

Final DDPG Metrics (Optimized):
Average Reward: -14.1062
Average Battery Health: 48.2795
Average Efficiency: 6.8025
Average Regret: 14.7864
Energy Fulfillment Rate: 0.0000
Evaluation logs saved to: Output\DDPG\evaluation_logs.csv

Final DDPG Metrics (Optimized):
Average Reward: -14.1062
Average Battery Health: 48.2795
Average Efficiency: 6.8025
Average Regret: 14.7864
Energy Fulfillment Rate: 0.0000
Evaluation plots saved to: Output\DDPG

--- Training benchmark DDPG model (default/minimal hyperparameters) ---
Evaluation plots saved to: Output\DDPG

--- Training benchmark DDPG model (default/minimal hyperparameters) ---


VBox(children=(HTML(value='Training benchmark model: <b>DDPG</b>'), IntProgress(value=0, description='Benchmar…

Benchmark model saved to: Output\DDPG\Benchmark\benchmark_model.pt

--- Evaluating benchmark DDPG model ---
Benchmark evaluation logs saved to: Output\DDPG\Benchmark\evaluation_logs.csv

Benchmark DDPG Metrics:
Average Reward: -13.0130
Average Battery Health: 50.0007
Average Efficiency: 6.8241
Average Regret: 13.6954
Energy Fulfillment Rate: 11.2519
Benchmark evaluation logs saved to: Output\DDPG\Benchmark\evaluation_logs.csv

Benchmark DDPG Metrics:
Average Reward: -13.0130
Average Battery Health: 50.0007
Average Efficiency: 6.8241
Average Regret: 13.6954
Energy Fulfillment Rate: 11.2519
Benchmark plots saved to: Output\DDPG\Benchmark
Benchmark plots saved to: Output\DDPG\Benchmark
Comparison plots saved to: Output\DDPG


--- Starting Optuna optimization for SAC with 50 trials ---
Comparison plots saved to: Output\DDPG


--- Starting Optuna optimization for SAC with 50 trials ---


VBox(children=(HTML(value='Tuning model: <b>SAC</b>, Trial 0/50'), IntProgress(value=0, description='Tuning SA…

[I 2025-08-24 17:20:55,508] A new study created in memory with name: SAC_optimization
[I 2025-08-24 17:21:50,812] Trial 0 finished with value: -7211.1954902 and parameters: {'learning_rate': 6.23925466520254e-05, 'buffer_size': 66947, 'learning_starts': 913, 'gamma': 0.9127230300744852, 'tau': 0.007595442442876287, 'ent_coef': 0.003579038156193848}. Best is trial 0 with value: -7211.1954902.
[I 2025-08-24 17:21:50,812] Trial 0 finished with value: -7211.1954902 and parameters: {'learning_rate': 6.23925466520254e-05, 'buffer_size': 66947, 'learning_starts': 913, 'gamma': 0.9127230300744852, 'tau': 0.007595442442876287, 'ent_coef': 0.003579038156193848}. Best is trial 0 with value: -7211.1954902.
[I 2025-08-24 17:22:46,656] Trial 1 finished with value: -7049.112363 and parameters: {'learning_rate': 0.0001025794008049962, 'buffer_size': 27192, 'learning_starts': 454, 'gamma': 0.9000829750191179, 'tau': 0.006404996508905822, 'ent_coef': 6.483573521283427e-07}. Best is trial 1 with value: -


--- SAC Optimization Finished ---
Number of finished trials:  50
Best trial:
  Value (Average Reward):  -6939.6620612
  Params: 
    learning_rate: 1.6180100640726737e-05
    buffer_size: 29313
    learning_starts: 199
    gamma: 0.9466410604462988
    tau: 0.0020586569840677586
    ent_coef: 8.471032679538403e-05


--- Training the final SAC model with best hyperparameters ---


VBox(children=(HTML(value='Training final model: <b>SAC</b>'), IntProgress(value=0, description='Training SAC:…

Best model saved to: Output\SAC\best_model.pt

--- Evaluating the final SAC model ---
Evaluation logs saved to: Output\SAC\evaluation_logs.csv

Final SAC Metrics (Optimized):
Average Reward: -7.0520
Average Battery Health: 100.0000
Average Efficiency: 6.8204
Average Regret: 7.7340
Energy Fulfillment Rate: 22.5099
Evaluation logs saved to: Output\SAC\evaluation_logs.csv

Final SAC Metrics (Optimized):
Average Reward: -7.0520
Average Battery Health: 100.0000
Average Efficiency: 6.8204
Average Regret: 7.7340
Energy Fulfillment Rate: 22.5099
Evaluation plots saved to: Output\SAC

--- Training benchmark SAC model (default/minimal hyperparameters) ---
Evaluation plots saved to: Output\SAC

--- Training benchmark SAC model (default/minimal hyperparameters) ---


VBox(children=(HTML(value='Training benchmark model: <b>SAC</b>'), IntProgress(value=0, description='Benchmark…

Benchmark model saved to: Output\SAC\Benchmark\benchmark_model.pt

--- Evaluating benchmark SAC model ---
Benchmark evaluation logs saved to: Output\SAC\Benchmark\evaluation_logs.csv

Benchmark SAC Metrics:
Average Reward: -8.7100
Average Battery Health: 97.3816
Average Efficiency: 6.8193
Average Regret: 9.3919
Energy Fulfillment Rate: 20.4502
Benchmark evaluation logs saved to: Output\SAC\Benchmark\evaluation_logs.csv

Benchmark SAC Metrics:
Average Reward: -8.7100
Average Battery Health: 97.3816
Average Efficiency: 6.8193
Average Regret: 9.3919
Energy Fulfillment Rate: 20.4502
Benchmark plots saved to: Output\SAC\Benchmark
Benchmark plots saved to: Output\SAC\Benchmark
Comparison plots saved to: Output\SAC


--- Starting Optuna optimization for DQN with 50 trials ---
Comparison plots saved to: Output\SAC


--- Starting Optuna optimization for DQN with 50 trials ---


VBox(children=(HTML(value='Tuning model: <b>DQN</b>, Trial 0/50'), IntProgress(value=0, description='Tuning DQ…

[I 2025-08-24 18:10:57,532] A new study created in memory with name: DQN_optimization
[I 2025-08-24 18:11:00,669] Trial 0 finished with value: -12616.1194479 and parameters: {'learning_rate': 0.0007027324174063731, 'buffer_size': 60824, 'learning_starts': 225, 'gamma': 0.9139382303177153, 'exploration_fraction': 0.31943304277984674, 'exploration_final_eps': 0.04224606695735119, 'train_freq': 9, 'target_update_interval': 109}. Best is trial 0 with value: -12616.1194479.
[I 2025-08-24 18:11:00,669] Trial 0 finished with value: -12616.1194479 and parameters: {'learning_rate': 0.0007027324174063731, 'buffer_size': 60824, 'learning_starts': 225, 'gamma': 0.9139382303177153, 'exploration_fraction': 0.31943304277984674, 'exploration_final_eps': 0.04224606695735119, 'train_freq': 9, 'target_update_interval': 109}. Best is trial 0 with value: -12616.1194479.
[I 2025-08-24 18:11:04,034] Trial 1 finished with value: -12961.5757816 and parameters: {'learning_rate': 0.00012171027263031501, 'buffer_


--- DQN Optimization Finished ---
Number of finished trials:  50
Best trial:
  Value (Average Reward):  -9620.851725199998
  Params: 
    learning_rate: 0.0005657183734754175
    buffer_size: 55215
    learning_starts: 369
    gamma: 0.9847743450837877
    exploration_fraction: 0.23562128474825983
    exploration_final_eps: 0.044867269922760605
    train_freq: 3
    target_update_interval: 166


--- Training the final DQN model with best hyperparameters ---


VBox(children=(HTML(value='Training final model: <b>DQN</b>'), IntProgress(value=0, description='Training DQN:…

Best model saved to: Output\DQN\best_model.pt

--- Evaluating the final DQN model ---
Evaluation logs saved to: Output\DQN\evaluation_logs.csv

Final DQN Metrics (Optimized):
Average Reward: -11.6054
Average Battery Health: 90.0000
Average Efficiency: 6.8138
Average Regret: 12.2868
Energy Fulfillment Rate: 17.8026
Evaluation logs saved to: Output\DQN\evaluation_logs.csv

Final DQN Metrics (Optimized):
Average Reward: -11.6054
Average Battery Health: 90.0000
Average Efficiency: 6.8138
Average Regret: 12.2868
Energy Fulfillment Rate: 17.8026
Evaluation plots saved to: Output\DQN

--- Training benchmark DQN model (default/minimal hyperparameters) ---
Evaluation plots saved to: Output\DQN

--- Training benchmark DQN model (default/minimal hyperparameters) ---


VBox(children=(HTML(value='Training benchmark model: <b>DQN</b>'), IntProgress(value=0, description='Benchmark…

Benchmark model saved to: Output\DQN\Benchmark\benchmark_model.pt

--- Evaluating benchmark DQN model ---
Benchmark evaluation logs saved to: Output\DQN\Benchmark\evaluation_logs.csv

Benchmark DQN Metrics:
Average Reward: -11.5629
Average Battery Health: 90.0000
Average Efficiency: 6.8244
Average Regret: 12.2454
Energy Fulfillment Rate: 17.6391
Benchmark evaluation logs saved to: Output\DQN\Benchmark\evaluation_logs.csv

Benchmark DQN Metrics:
Average Reward: -11.5629
Average Battery Health: 90.0000
Average Efficiency: 6.8244
Average Regret: 12.2454
Energy Fulfillment Rate: 17.6391
Benchmark plots saved to: Output\DQN\Benchmark
Benchmark plots saved to: Output\DQN\Benchmark
Comparison plots saved to: Output\DQN



--- All models have been processed. The script has completed. ---
Comparison plots saved to: Output\DQN



--- All models have been processed. The script has completed. ---
