In [1]:
!pip install pypsa
!pip install neptune-client
!pip install gymnasium



In [2]:
import pypsa
import pandas as pd
import numpy as np
import gymnasium as gym
from gymnasium import spaces

import gc
import psutil
import matplotlib.pyplot as plt

import neptune

from torch.utils.data import TensorDataset, DataLoader

import torch
import torch.nn as nn
import torch.nn.functional as F



In [3]:
def calculate_offset_k_initialization(network_file, k_method='mean', k_samples=1000, **env_kwargs):
  """
  Calculate the offset k for replacement reward method.
  Creates a temporary environment to perform the calculation.

  Parameters:
  -----------
  network_file : str
      Path to the PyPSA network file
  input_dir : str
      Directory containing constraint mappings
  k_method : str
      Method to calculate k: 'mean', 'worst_case', or 'percentile'
  k_samples : int
      Number of random samples to use for estimation
  **env_kwargs : dict
      Additional keyword arguments for environment creation

  Returns:
  --------
  float: Offset value k
  """
  print(f"Sampling {k_samples} random states to calculate offset k...")

  # Create temporary environment without offset_k (uses default)
  temp_env = EnvDispatchReplacement(network_file=network_file, **env_kwargs)
  #this initializes episode_length to number of snapshots and constraint_penalty_factor to None.
  #I'm just making this env to access certain attributes/ methods; which should be fine since none of these attributes/methods reference these two parameters.

  action_dim = temp_env.action_space.shape[0]

  objective_values = []
  successful_samples = 0
  try:
        for i in range(k_samples):
            try:
                # Reset environment to start fresh
                temp_env.reset(seed=42 + i)  # Use different seeds for variety
                # Sample random action
                random_action = temp_env.action_space.sample()  # This ensures [0,1] range
                # Take step - this handles all action scaling and application
                # make sure to get snapshot before the step to use when call evaluate_objective_direc() again
                #i increase the current snapshot_idx by executing step() if do this line after step, when evaluate_objective_direct() is run it evaluates the objective for the next step!
                current_snapshot = temp_env.network.snapshots[temp_env.snapshot_idx]
                obs, reward, terminated, truncated, info = temp_env.step(random_action)
                # Get the base objective value (the -J(s) part, before any penalties or offsets)
                obj_value = temp_env.evaluate_objective_direct(current_snapshot)
                objective_values.append(obj_value)
                successful_samples += 1
                #print(info["constraint_violations"])

                # Progress indicator every 200 samples
                if (i + 1) % 200 == 0:
                    print(f"  Completed {i + 1}/{k_samples} samples...")

            except Exception as e:
                # Skip failed samples but continue
                if i < 5:  # Only print first few errors to avoid spam
                    print(f"  Sample {i} failed: {e}")
                continue

        # Calculate offset based on method
        if objective_values:
            if k_method == 'worst_case':
                k = abs(max(objective_values))
                print(f"  Using worst-case method: k = |{max(objective_values):.2f}| = {k:.2f}")
            else:  # method == 'mean'
                mean_val = np.mean(objective_values)
                k = abs(mean_val)
                print(f"  Using mean method: k = |{mean_val:.2f}| = {k:.2f}")

            print(f"  Successfully sampled {successful_samples}/{k_samples} states")
            print(f"  Objective value range: [{min(objective_values):.2f}, {max(objective_values):.2f}]")
        else:
            print("  Warning: No successful samples, using default k value")
            k = 2500  # Default fallback value

  except Exception as e:
          print(f"Error in offset calculation: {e}")
          import traceback
          traceback.print_exc()
          k = 2500  # Default fallback value
  return k

In [4]:
def fix_artificial_lines_reasonable(network):
    """
    Fix artificial lines with reasonable capacity values:
    - s_nom = based on connected bus demand (with safety factor)
    - s_nom_extendable = False (non-extendable)
    - Keep capacity high enough to meet demand
    """
    print("=== FIXING ARTIFICIAL LINES WITH REASONABLE CAPACITY ===")

    # Find artificial lines
    artificial_lines = [line for line in network.lines.index
                       if any(keyword in str(line).lower() for keyword in ['new', '<->', 'artificial'])]

    if not artificial_lines:
        # If no artificial lines found by name, look for lines with s_nom=0
        # which is often a sign of artificial lines
        zero_capacity_lines = network.lines[network.lines.s_nom == 0].index.tolist()
        if zero_capacity_lines:
            artificial_lines = zero_capacity_lines

    print(f"Found {len(artificial_lines)} artificial lines to fix:")

    # Get maximum demand per bus across all snapshots
    bus_max_demand = {}
    for bus in network.buses.index:
        bus_demand = 0
        for load_name, load in network.loads.iterrows():
            if load.bus == bus and load_name in network.loads_t.p_set.columns:
                bus_demand = max(bus_demand, network.loads_t.p_set[load_name].max())
        bus_max_demand[bus] = bus_demand

    # Fix each artificial line with reasonable capacity
    for line_name in artificial_lines:
        # Get connected buses
        bus0 = network.lines.loc[line_name, 'bus0']
        bus1 = network.lines.loc[line_name, 'bus1']

        # Get maximum demand at these buses
        bus0_demand = bus_max_demand.get(bus0, 0)
        bus1_demand = bus_max_demand.get(bus1, 0)

        # Calculate required capacity with safety factor
        # Use 3x the higher demand to ensure adequate capacity
        safety_factor = 3.0
        required_capacity = max(bus0_demand, bus1_demand) * safety_factor

        # Ensure minimum reasonable capacity (1000 MW)
        required_capacity = max(required_capacity, 1000)

        print(f"\n🔧 Fixing: {line_name}")
        print(f"    Connected buses: {bus0} ↔ {bus1}")
        print(f"    Bus demands: {bus0}: {bus0_demand:.1f} MW, {bus1}: {bus1_demand:.1f} MW")

        # Set s_nom to required capacity
        old_s_nom = network.lines.loc[line_name, 's_nom']
        network.lines.loc[line_name, 's_nom'] = required_capacity
        print(f"    s_nom: {old_s_nom} → {required_capacity:.1f} MW")

        # Make sure line is not extendable
        if 's_nom_extendable' not in network.lines.columns:
            network.lines['s_nom_extendable'] = False
        network.lines.loc[line_name, 's_nom_extendable'] = False
        print(f"    s_nom_extendable: → False")

    return network

def create_pypsa_network(network_file):
    """Create a PyPSA network from the .nc file."""
    # Initialize network
    network = pypsa.Network(network_file)
    for storage_name in network.storage_units.index:
        # Use .loc for direct assignment to avoid SettingWithCopyWarning
        network.storage_units.loc[storage_name, 'cyclic_state_of_charge'] = False

        # Set marginal_cost to 0.01
        network.storage_units.loc[storage_name, 'marginal_cost'] = 0.01

        # Set marginal_cost_storage to 0.01
        network.storage_units.loc[storage_name, 'marginal_cost_storage'] = 0.01

        # Set spill_cost to 0.1
        network.storage_units.loc[storage_name, 'spill_cost'] = 0.1

        # Fix unrealistic max_hours values
        current_max_hours = network.storage_units.loc[storage_name, 'max_hours']

        if 'PHS' in storage_name:
            # PHS with missing data - set to typical range
            network.storage_units.loc[storage_name, 'max_hours'] = 8.0
            print(f"Fixed {storage_name}: set max_hours to 8.0")

        elif 'hydro' in storage_name:
            # Hydro with unrealistic data - set to validated range
            network.storage_units.loc[storage_name, 'max_hours'] = 6.0
            print(f"Fixed {storage_name}: corrected max_hours from {current_max_hours} to 6.0")


    fix_artificial_lines_reasonable(network)

    return network

In [5]:
#Install debugger
!pip install ipdb
import ipdb



In [6]:
import ipdb
class EnvDispatchConstr(gym.Env):
    """
    OpenAI Gym environment for Optimal Power Flow using PyPSA.
    Enhanced to handle dispatchable generators, renewable generators, and storage units.

    Action Space: Continuous setpoints for all controllable components within their capacity limits
    - Dispatchable generators: scaled between p_min_pu*p_nom and p_max_pu*p_nom
    - Renewable generators: scaled between 0 and current p_max_pu*p_nom (time-varying)
    - Storage units: scaled between -p_nom (charging) and +p_nom (discharging)
    (This follows http://arxiv.org/abs/2403.17831.)

    Has train/test split functionality
    """

    def __init__(self,network_file, constraint_penalty_factor=100, test_start_date='2013-12-01 00:00:00',
                 fixed_episode_length=None):
        super().__init__()

        self.network_file = network_file # Store network file path

        # Use provided network or create new one
        self.network =create_pypsa_network(network_file)

        self.test_start_date = pd.Timestamp(test_start_date)
        self._train_test_snapshots()

        #self._initialize_optimization_components()
        self.penalty_factor=constraint_penalty_factor
        self.reward_method = "summation" # Default reward method for the base class

        # Handle episode length configuration
        self.fixed_episode_length = fixed_episode_length
        if self.fixed_episode_length is not None:
            # Use fixed episode length, ensure it doesn't exceed training data
            self.episode_length = min(self.fixed_episode_length, self.train_snapshots)
            self.variable_episodes = False
            print(f"Using fixed episode length: {self.episode_length}")
        else:
            # Use legacy episode_length_factor or default to full training set
            self.episode_length = None
            self.variable_episodes = True
            print(f"Using variable episode length, max: {self.episode_length}")

        # Episode management
        self.current_step = 0  # Steps within current episode
        self.snapshot_idx = 0  # Current snapshot index (cycles through all snapshots)

        # Initialize component categorization
        self._categorize_components()

        # Create action space
        self._create_action_space()

        # Initialize the network state
        self.reset()

        # Create observation space
        low_bounds, high_bounds = self.create_observation_bounds()
        self.observation_space = spaces.Box(
            low=low_bounds,
            high=high_bounds,
            dtype=np.float32
        )
        #TO DO: If use .pf instead of .lpf, add another of each term for active power AND reactive power

    # def _initialize_optimization_components(self):
    #     """
    #     Initialize all optimization components in one pass to avoid creating multiple models.
    #     This method:
    #     1. Creates the optimization model once
    #     2. Extracts objective components (vars, coeffs, const)
    #     3. Creates the variable ID to name mapping
    #     4. Extracts constraints
    #     5. Cleans up the model
    #     """
    #     # Create model once - this is an expensive operation
    #     temp_model = self.network.optimize.create_model()

    #     # Extract objective components
    #     obj_expr = temp_model.objective
    #     objective_coeffs = obj_expr.coeffs.copy()
    #     self.coeffs_flat = objective_coeffs.values.flatten()
    #     self.objective_const = obj_expr.const.copy() if hasattr(obj_expr, 'const') else 0

    #     self.variable_names_dict= load_dictionary(network_file=self.network_file, dict_name="variable_names_dict", input_dir=self.input_dir)
    #     self.var_indices_dict= load_dictionary(network_file=self.network_file, dict_name="var_indices_dict", input_dir=self.input_dir)

    #     # Clean up to free memory
    #     del temp_model, obj_expr
    #     gc.collect()

    def _train_test_snapshots(self):
      # Calculate train/test split based on specific start date
        self.total_snapshots = len(self.network.snapshots)

        self.train_snapshots = self.network.snapshots.get_loc(self.test_start_date)
        nearest_idx = self.network.snapshots.get_indexer([self.test_start_date], method='nearest')[0]
        self.train_snapshots = nearest_idx
        actual_test_start = self.network.snapshots[nearest_idx]
        print(f"Exact test start date not found. Using nearest: {actual_test_start}")

        self.test_snapshots = self.total_snapshots - self.train_snapshots

        # Ensure we have enough data
        if self.train_snapshots <= 0:
            raise ValueError(f"Test start date {self.test_start_date} is too early. "
                           f"No training data available.")
        if self.test_snapshots <= 0:
            raise ValueError(f"Test start date {self.test_start_date} is too late. "
                           f"No test data available.")

    def _categorize_components(self):
        """
        Categorize generators and identify storage units for action space.
        """
        # Get generators with time-varying p_max_pu (renewable generators)
        renewable_gens = self.network.generators_t.p_max_pu.columns

        slack_generators = self.network.generators[self.network.generators.control == "Slack"].index
        # in the 10-node SA network there are 4 slack gens so this should return a list of indexes

        # Dispatchable generators: not slack, not renewable
        self.dispatchable_gens = self.network.generators[
            (~self.network.generators.index.isin(slack_generators)) &
            (~self.network.generators.index.isin(renewable_gens))
        ].index

        # Renewable generators: have time-varying p_max_pu, not slack
        self.renewable_gens = self.network.generators[
            (self.network.generators.index.isin(renewable_gens)) &
            (~self.network.generators.index.isin(slack_generators))
        ].index

        # Storage units (if any exist in the network)
        self.storage_units = self.network.storage_units.index

        # Store names as lists for easier indexing
        self.dispatchable_names = list(self.dispatchable_gens)
        self.renewable_names = list(self.renewable_gens)
        self.storage_names = list(self.storage_units)

        # Store counts
        self.n_dispatchable = len(self.dispatchable_names)
        self.n_renewable = len(self.renewable_names)
        self.n_storage = len(self.storage_names)

        # Get static limits for dispatchable generators
        if self.n_dispatchable > 0:
            dispatchable_df = self.network.generators.loc[self.dispatchable_gens]
            self.disp_p_min = (dispatchable_df.p_min_pu * dispatchable_df.p_nom).values#returns numpy arrays
            self.disp_p_max = (dispatchable_df.p_max_pu * dispatchable_df.p_nom).values
        else:
            self.disp_p_min = np.array([])
            self.disp_p_max = np.array([])

        # Get nominal capacities and minimum limits for renewable generators
        if self.n_renewable > 0:
            renewable_df = self.network.generators.loc[self.renewable_gens]
            self.renewable_p_nom = renewable_df.p_nom.values
            self.renewable_p_min_pu = renewable_df.p_min_pu.values
        else:
            self.renewable_p_nom = np.array([])
            self.renewable_p_min_pu = np.array([])

        # Get storage unit capacities
        if self.n_storage > 0:
            storage_df = self.network.storage_units.loc[self.storage_units]
            #this is a bit redundant since self.storage_units is the array of all indices of self.network.storage_units but leave it in so could replace which indices you want
            self.storage_p_nom = storage_df.p_nom.values
        else:
            self.storage_p_nom = np.array([])


    def _create_action_space(self):
        """
        Create action space with four distinct parts:
        1. Dispatchable generators: [0,1] scaled to [p_min, p_max]
        2. Renewable generators: [0,1] scaled to [0, current_p_max_pu * p_nom]
        3. Storage p_set: [0,1] scaled to [-p_nom, +p_nom] (negative=charging, positive=discharging)
        4. Storage p_dispatch: [0,1] scaled to [0, p_nom] (discharging magnitude)
        """
        total_actions = self.n_dispatchable + self.n_renewable + (2 * self.n_storage)  # 2 actions per storage unit
        self.action_space = gym.spaces.Box(0, 1, shape=(total_actions,))

        # Store action space structure for easy reference
        self.action_structure = {
            'dispatchable': {
                'start': 0,
                'end': self.n_dispatchable,
                'count': self.n_dispatchable
            },
            'renewable': {
                'start': self.n_dispatchable,
                'end': self.n_dispatchable + self.n_renewable,
                'count': self.n_renewable
            },
            'storage_p_set': {
                'start': self.n_dispatchable + self.n_renewable,
                'end': self.n_dispatchable + self.n_renewable + self.n_storage,
                'count': self.n_storage
            },
            'storage_p_dispatch': {
                'start': self.n_dispatchable + self.n_renewable + self.n_storage,
                'end': self.n_dispatchable + self.n_renewable + (2 * self.n_storage),
                'count': self.n_storage
            }
        }

    def _get_storage_observation(self):
        """
        Get current storage unit states for observation.
        Returns previous SOC (normalized) and current inflow (normalized) for each storage unit.
        """
        if self.n_storage == 0:
            return np.array([])

        current_snapshot = self.network.snapshots[self.snapshot_idx]
        storage_obs = []

        for storage_name in self.storage_names:
            # 1. Previous State of Charge (using your exact logic)
            if self.snapshot_idx == 0:
                # For first snapshot, previous SOC is the initial value
                soc_prev = self.network.storage_units.state_of_charge_initial.loc[storage_name]
            else:
                previous_snapshot = self.network.snapshots[self.snapshot_idx - 1]
                soc_prev = self.network.storage_units_t.state_of_charge.loc[previous_snapshot, storage_name]

            # Get SOC limit from PyPSA parameters
            p_nom = self.network.storage_units.loc[storage_name, 'p_nom']
            max_hours = self.network.storage_units.loc[storage_name, 'max_hours']
            max_soc = p_nom * max_hours  # SOC limit (energy capacity)

            # Normalize SOC by its maximum possible value
            normalized_soc_prev = soc_prev / max_soc if max_soc > 0 else 0
            storage_obs.append(normalized_soc_prev)

            # 2. Current Inflow (normalized by p_nom)
            if hasattr(self.network.storage_units_t, 'inflow') and storage_name in self.network.storage_units_t.inflow.columns:
                current_inflow = self.network.storage_units_t.inflow.loc[current_snapshot, storage_name]
                # Normalize by p_nom for consistent scaling
                normalized_inflow = current_inflow / p_nom if p_nom > 0 else 0
            else:
                # If no inflow data exists, use zero
                normalized_inflow = 0.0
            storage_obs.append(normalized_inflow)

        return np.array(storage_obs, dtype=np.float32)

    def create_storage_observation_bounds(self):
        """
        Create bounds for storage unit observations.
        Since no spill: SOC bounds are always [0, 1] when normalized.
        """
        if self.n_storage == 0:
            return np.array([]), np.array([])

        values_per_storage = 2  # SOC + inflow
        total_storage_obs = self.n_storage * values_per_storage

        low_bounds = np.zeros(total_storage_obs)
        high_bounds = np.zeros(total_storage_obs)

        for i, storage_name in enumerate(self.storage_names):
            base_idx = i * values_per_storage

            # SOC bounds: Always [0, 1] when normalized by (p_nom * max_hours)
            low_bounds[base_idx] = 0.0      # Normalized SOC min
            high_bounds[base_idx] = 1.0     # Normalized SOC max

            # Inflow bounds: Get from historical data
            if hasattr(self.network.storage_units_t, 'inflow'):
                p_nom = self.network.storage_units.loc[storage_name, 'p_nom']
                if storage_name in self.network.storage_units_t.inflow.columns:
                    inflow_data = self.network.storage_units_t.inflow[storage_name]

                    min_inflow_norm = inflow_data.min() / p_nom if p_nom > 0 else 0
                    max_inflow_norm = inflow_data.max() / p_nom if p_nom > 0 else 0

                    low_bounds[base_idx + 1] = min_inflow_norm
                    high_bounds[base_idx + 1] = max_inflow_norm
                else:
                     # If inflow data exists but not for this specific storage unit
                    low_bounds[base_idx + 1] = 0.0
                    high_bounds[base_idx + 1] = 0.0
            else:
                # No inflow data
                low_bounds[base_idx + 1] = 0.0
                high_bounds[base_idx + 1] = 0.0

        return low_bounds.astype(np.float32), high_bounds.astype(np.float32)

    def create_observation_bounds(self):
        """
        Create bounds for the observation space based on:
        - Load p_set values
        - Renewable generator p_max_pu values
        - Storage unit previous SOC (normalized) and current inflow (normalized)
        """
        # 1. Load bounds
        load_p_set_all = self.network.loads_t.p_set  # DataFrame with all snapshots and loads
        load_low_bounds = load_p_set_all.min(axis=0).values  # Min across all snapshots for each load
        load_high_bounds = load_p_set_all.max(axis=0).values  # Max across all snapshots for each load

        # 2. Renewable generator bounds
        if self.n_renewable > 0:
            renewable_p_max_pu_all = self.network.generators_t.p_max_pu[self.renewable_names]
            renewable_low_bounds = renewable_p_max_pu_all.min(axis=0).values
            renewable_high_bounds = renewable_p_max_pu_all.max(axis=0).values
        else:
            renewable_low_bounds = np.array([])
            renewable_high_bounds = np.array([])

        # 3. Storage bounds (previous SOC + current inflow)
        storage_low_bounds, storage_high_bounds = self.create_storage_observation_bounds()

        # 4. Combine all bounds
        low_bounds = np.concatenate([load_low_bounds, renewable_low_bounds, storage_low_bounds])
        high_bounds = np.concatenate([load_high_bounds, renewable_high_bounds, storage_high_bounds])

        return low_bounds.astype(np.float32), high_bounds.astype(np.float32)

    def _get_observation(self):
        """
        Get current network state as observation.

        Returns observation vector with structure:
        [load_1_demand, load_2_demand, ..., load_n_demand,
        renewable_1_p_max_pu, renewable_2_p_max_pu, ..., renewable_m_p_max_pu,
        storage_1_prev_soc_norm, storage_1_current_inflow_norm,
        storage_2_prev_soc_norm, storage_2_current_inflow_norm,
        ...,
        storage_k_prev_soc_norm, storage_k_current_inflow_norm]
        """
        # 1. Load demands (dynamic values at current snapshot)
        load_demands = self.network.loads_t.p_set.iloc[self.snapshot_idx].values

        # 2. Renewable generator p_max_pu values (time-varying availability at current snapshot)
        if self.n_renewable > 0:
            renewable_p_max_pu = self.network.generators_t.p_max_pu.iloc[self.snapshot_idx][self.renewable_names].values
        else:
            renewable_p_max_pu = np.array([])

        # 3. Storage states (previous SOC normalized + current inflow normalized)
        storage_states = self._get_storage_observation()

        # 4. Combine all observations
        observation = np.concatenate([load_demands, renewable_p_max_pu, storage_states])

        return observation.astype(np.float32)

    def reset_network(self):
        """Reset and ensure essential DataFrames exist."""
        #Note that we do not just create a new network here, as this consumes more memory and previously led to a segmentation fault
        # we reset these ttributes for all snapshots, but they all start empty when the network is created so i think that's fine
        # Initialize/reset generators_t.p_set
        if not hasattr(self.network.generators_t, 'p_set') or self.network.generators_t.p_set.empty:
            self.network.generators_t.p_set = pd.DataFrame(
                0.0,
                index=self.network.snapshots,
                columns=self.network.generators.index
            )
        else:
            self.network.generators_t.p_set.iloc[:, :] = 0.0

        # Initialize/reset storage_units_t attributes
        if not hasattr(self.network.storage_units_t, 'p_set') or self.network.storage_units_t.p_set.empty:
            self.network.storage_units_t.p_set = pd.DataFrame(
                0.0,
                index=self.network.snapshots,
                columns=self.network.storage_units.index
            )
        else:
            self.network.storage_units_t.p_set.iloc[:, :] = 0.0


        if not hasattr(self.network.storage_units_t, 'p_dispatch') or self.network.storage_units_t.p_dispatch.empty:
            self.network.storage_units_t.p_dispatch = pd.DataFrame(
                0.0,
                index=self.network.snapshots,
                columns=self.network.storage_units.index
            )
        else:
            self.network.storage_units_t.p_dispatch.iloc[:, :] = 0.0

        if not hasattr(self.network.storage_units_t, 'p_store') or self.network.storage_units_t.p_store.empty:
            self.network.storage_units_t.p_store = pd.DataFrame(
                0.0,
                index=self.network.snapshots,
                columns=self.network.storage_units.index
            )
        else:
            self.network.storage_units_t.p_store.iloc[:, :] = 0.0

        if not hasattr(self.network.storage_units_t, 'state_of_charge') or self.network.storage_units_t.state_of_charge.empty:
            self.network.storage_units_t.state_of_charge = pd.DataFrame(
                0.0,
                index=self.network.snapshots,
                columns=self.network.storage_units.index
            )
        else:
            self.network.storage_units_t.state_of_charge.iloc[:, :] = 0.0

        if not hasattr(self.network.storage_units_t, 'spill') or self.network.storage_units_t.spill.empty:
            self.network.storage_units_t.spill = pd.DataFrame(
                0.0,
                index=self.network.snapshots,
                columns=self.network.storage_units.index
            )
        else:
            self.network.storage_units_t.spill.iloc[:, :] = 0.0

    def reset(self, seed=None, options=None):
        """
        Reset to training data with proper episode length handling.
        """
        # Set the seed if provided
        if seed is not None:
            np.random.seed(seed)

        # Reset counters
        self.current_step = 0

        if self.fixed_episode_length is not None:
            # Fixed episode length: ensure episode can complete within training data
            max_start_idx = max(0, self.train_snapshots - self.episode_length)
            self.snapshot_idx = np.random.randint(0, max_start_idx + 1)
        else:
            # Variable episode length: can start anywhere in training data
            self.snapshot_idx = np.random.randint(0, self.train_snapshots)

        self.reset_network()

        obs = self._get_observation()
        info = {
            'current_step': self.current_step,
            'snapshot_idx': self.snapshot_idx,
            'is_training': True,
            'fixed_episodes': self.fixed_episode_length is not None,
            'variable_episodes': self.variable_episodes
        }

        return obs, info

    def reset_for_testing(self):
        """
        Reset environment to start of test data with proper SOC initialization.
        """
        # Reset to start of test period
        self.snapshot_idx = self.train_snapshots
        self.current_step = 0

        # Reset network state
        self.reset_network()

        # Set initial state of charge for storage units at test start
        if self.n_storage > 0:
            test_start_snapshot = self.network.snapshots[self.train_snapshots]
            for storage_name in self.storage_names:
                initial_soc = self.network.storage_units.loc[storage_name, 'state_of_charge_initial']
                self.network.storage_units_t.state_of_charge.loc[test_start_snapshot, storage_name] = initial_soc

        obs = self._get_observation()
        info = {
            'current_step': self.current_step,
            'snapshot_idx': self.snapshot_idx,
            'is_training': False
        }

        return obs, info

    def get_test_snapshots(self):
        """
        Return the test snapshots for external optimization.
        """
        return self.network.snapshots[self.train_snapshots:]

    def compute_storage_power_bounds(self):
        """
        Compute the feasible bounds for p_set and p_dispatch to respect SOC limits.

        Returns:
        --------
        dict: Contains bounds for each storage unit
        """
        bounds = {}

        for i, storage_name in enumerate(self.storage_names):
            # Get current SOC
            if self.snapshot_idx == 0:
                soc_prev = self.network.storage_units.state_of_charge_initial.loc[storage_name]
            else:
                previous_snapshot = self.network.snapshots[self.snapshot_idx - 1]
                soc_prev = self.network.storage_units_t.state_of_charge.loc[previous_snapshot, storage_name]

            # Get storage parameters
            storage_unit = self.network.storage_units.loc[storage_name]
            soc_max = storage_unit.p_nom * storage_unit.max_hours
            eff_store = storage_unit.efficiency_store
            eff_dispatch = storage_unit.efficiency_dispatch
            standing_loss = storage_unit.standing_loss

            # Get time step
            if hasattr(self.network.snapshot_weightings, 'stores'):
                delta_t = self.network.snapshot_weightings.stores.iloc[self.snapshot_idx]
            else:
                delta_t = self.network.snapshot_weightings.iloc[self.snapshot_idx]

            eff_standing = (1 - standing_loss) ** delta_t

            # Get inflow
            if storage_name in self.network.storage_units_t.inflow.columns:
                inflow = self.network.storage_units_t.inflow.loc[self.network.snapshots[self.snapshot_idx], storage_name]
            else:
                inflow = 0

            # Base SOC change from standing losses and inflow
            base_soc_change = soc_prev * eff_standing + inflow * delta_t

            # Energy bounds for net storage change
            min_net_energy = 0 - base_soc_change  # Don't go below 0 SOC
            max_net_energy = soc_max - base_soc_change  # Don't exceed max SOC

            # Convert to power bounds considering efficiencies
            # Net energy change = (p_store * eff_store - p_dispatch/eff_dispatch) * delta_t
            # So: min_net_energy ≤ (p_store * eff_store - p_dispatch/eff_dispatch) * delta_t ≤ max_net_energy

            # Physical power limits
            p_nom = storage_unit.p_nom

            bounds[storage_name] = {
                'soc_prev': soc_prev,
                'soc_max': soc_max,
                'min_net_energy_per_dt': min_net_energy / delta_t,
                'max_net_energy_per_dt': max_net_energy / delta_t,
                'eff_store': eff_store,
                'eff_dispatch': eff_dispatch,
                'p_nom': p_nom,
                'delta_t': delta_t
            }

        return bounds

    def scale_action(self, action):
        """
        Scale action from [0,1] range to appropriate ranges for each component type,
        with SOC bounds enforcement for storage units.
        """
        scaled_actions = {}

        # Scale dispatchable generator actions (unchanged)
        if self.n_dispatchable > 0:
            disp_actions = action[self.action_structure['dispatchable']['start']:
                                self.action_structure['dispatchable']['end']]
            scaled_actions['dispatchable'] = self.disp_p_min + disp_actions * (self.disp_p_max - self.disp_p_min)
        else:
            scaled_actions['dispatchable'] = np.array([])

        # Scale renewable generator actions (unchanged)
        if self.n_renewable > 0:
            renewable_actions = action[self.action_structure['renewable']['start']:
                                    self.action_structure['renewable']['end']]
            current_p_max_pu = self.network.generators_t.p_max_pu.iloc[self.snapshot_idx][self.renewable_names].values
            current_p_max = current_p_max_pu * self.renewable_p_nom
            current_p_min = self.renewable_p_min_pu * self.renewable_p_nom
            scaled_actions['renewable'] = current_p_min + renewable_actions * (current_p_max - current_p_min)
        else:
            scaled_actions['renewable'] = np.array([])

        # Scale storage actions with SOC bounds enforcement
        if self.n_storage > 0:
            # Get SOC-based bounds
            storage_bounds = self.compute_storage_power_bounds()
            storage_p_set_actions = action[self.action_structure['storage_p_set']['start']:
                                        self.action_structure['storage_p_set']['end']]
            storage_p_dispatch_actions = action[self.action_structure['storage_p_dispatch']['start']:
                                              self.action_structure['storage_p_dispatch']['end']]

            scaled_p_set = np.zeros(self.n_storage)
            scaled_p_dispatch = np.zeros(self.n_storage)

            for i, storage_name in enumerate(self.storage_names):
                bounds = storage_bounds[storage_name]
                p_nom = bounds['p_nom']

                # First, scale p_dispatch from [0,1] to [0, p_nom]
                p_dispatch_raw = storage_p_dispatch_actions[i] * p_nom

                # For given p_dispatch, find valid range for p_set
                # Constraint: min_net_energy_per_dt ≤ (p_store * eff_store - p_dispatch/eff_dispatch) ≤ max_net_energy_per_dt
                # Where p_store = p_dispatch - p_set
                # So: min_net_energy_per_dt ≤ ((p_dispatch - p_set) * eff_store - p_dispatch/eff_dispatch) ≤ max_net_energy_per_dt

                dispatch_energy_term = p_dispatch_raw / bounds['eff_dispatch']
                store_coeff = bounds['eff_store']

                # Rearranging: min_net_energy_per_dt ≤ p_dispatch_raw * eff_store - p_set * eff_store - dispatch_energy_term ≤ max_net_energy_per_dt
                # So: p_dispatch_raw * eff_store - dispatch_energy_term - max_net_energy_per_dt ≤ p_set * eff_store ≤ p_dispatch_raw * eff_store - dispatch_energy_term - min_net_energy_per_dt

                base_term = p_dispatch_raw * store_coeff - dispatch_energy_term
                p_set_min_from_soc = (base_term - bounds['max_net_energy_per_dt']) / store_coeff
                p_set_max_from_soc = (base_term - bounds['min_net_energy_per_dt']) / store_coeff

                # Also enforce physical limits: -p_nom ≤ p_set ≤ p_nom
                p_set_min = max(-p_nom, p_set_min_from_soc)
                p_set_max = min(p_nom, p_set_max_from_soc)

                ipdb.set_trace()

                # Ensure feasible range exists
                if p_set_min > p_set_max:
                    # If no feasible range, clip p_dispatch and recalculate
                    # This is a fallback - ideally shouldn't happen with proper bounds
                    p_dispatch_raw = min(p_dispatch_raw, p_nom * 0.5)  # Conservative fallback
                    ipdb.set_trace()
                    base_term = p_dispatch_raw * store_coeff - p_dispatch_raw / bounds['eff_dispatch']
                    ipdb.set_trace()
                    p_set_min_from_soc = (base_term - bounds['max_net_energy_per_dt']) / store_coeff
                    ipdb.set_trace()
                    p_set_max_from_soc = (base_term - bounds['min_net_energy_per_dt']) / store_coeff
                    ipdb.set_trace()
                    p_set_min = max(-p_nom, p_set_min_from_soc)
                    p_set_max = min(p_nom, p_set_max_from_soc)
                ipdb.set_trace()
                # Scale p_set action from [0,1] to [p_set_min, p_set_max]
                if p_set_max > p_set_min:
                    scaled_p_set[i] = p_set_min + storage_p_set_actions[i] * (p_set_max - p_set_min)
                else:
                    scaled_p_set[i] = p_set_min  # Fallback to minimum if range is degenerate

                scaled_p_dispatch[i] = p_dispatch_raw

            scaled_actions['storage_p_set'] = scaled_p_set
            scaled_actions['storage_p_dispatch'] = scaled_p_dispatch
        else:
            scaled_actions['storage_p_set'] = np.array([])
            scaled_actions['storage_p_dispatch'] = np.array([])
        ipdb.set_trace()
        return scaled_actions

    def _update_storage_soc_single_snapshot(self, storage_name):
        if self.snapshot_idx == 0:
            # For first snapshot, previous SOC is the initial value
            soc_prev = self.network.storage_units.state_of_charge_initial.loc[storage_name]
        else:
            previous_snapshot = self.network.snapshots[self.snapshot_idx - 1]
            soc_prev = self.network.storage_units_t.state_of_charge.loc[previous_snapshot, storage_name]

        current_snapshot = self.network.snapshots[self.snapshot_idx]

        #Get storage parameters
        storage_unit = self.network.storage_units.loc[storage_name]
        soc_max = storage_unit.p_nom * storage_unit.max_hours
        eff_store = storage_unit.efficiency_store
        eff_dispatch = storage_unit.efficiency_dispatch
        standing_loss = storage_unit.standing_loss

        # Get time step
        if hasattr(self.network.snapshot_weightings, 'stores'):
            delta_t = self.network.snapshot_weightings.stores.iloc[self.snapshot_idx]
        else:
            delta_t = self.network.snapshot_weightings.iloc[self.snapshot_idx]

        eff_standing = (1 - standing_loss) ** delta_t

        # Get current operations (these determine the SOC change)
        p_store = self.network.storage_units_t.p_store.loc[current_snapshot, storage_name]
        p_dispatch = self.network.storage_units_t.p_dispatch.loc[current_snapshot, storage_name]
        if storage_name in self.network.storage_units_t.inflow.columns:
          inflow = self.network.storage_units_t.inflow.loc[current_snapshot, storage_name]
        else:
          inflow=0

        # Calculate SOC without spill (could be non-zero even if soc_prev=0)
        soc_without_spill = (soc_prev * eff_standing +
                            (p_store * eff_store - p_dispatch/eff_dispatch + inflow) * delta_t)

        # Calculate required spill
        required_spill = max(0, (soc_without_spill - soc_max) / delta_t)

        # Final SOC after spill
        soc_actual = min(soc_without_spill, soc_max)

        # Update the network
        self.network.storage_units_t.state_of_charge.loc[current_snapshot, storage_name] = soc_actual
        if hasattr(self.network, 'storage_units_t') and 'spill' in self.network.storage_units_t:
            self.network.storage_units_t.spill.loc[current_snapshot, storage_name] = required_spill

    def evaluate_objective_direct(self, current_snapshot):
        """
        Direct evaluation of PyPSA operational objective function terms.

        This function evaluates only the operational terms (marginal costs) that PyPSA
        optimizes for generators and storage units, excluding capital costs and other
        investment-related terms.

        Returns
        -------
        float
            Total operational cost for the current snapshot including snapshot weighting
        """
        total_cost = 0.0

        # Get snapshot weighting for proper cost calculation
        snapshot_weighting = self.network.snapshot_weightings.objective.loc[current_snapshot]

        # Generator operational costs
        if len(self.network.generators) > 0:
            # Get marginal costs and power output
            gen_marginal_costs = self.network.generators['marginal_cost']

            gen_power = self.network.generators_t.p.loc[current_snapshot]

            # Calculate generator operational cost
            gen_cost = (gen_marginal_costs * gen_power).sum()
            total_cost += gen_cost

        # Storage unit operational costs
        if len(self.network.storage_units) > 0:
            # Marginal cost for storage dispatch (discharge)
            storage_marginal_costs = self.network.storage_units['marginal_cost']
            storage_p_dispatch = self.network.storage_units_t.p_dispatch.loc[current_snapshot]
            storage_cost = (storage_marginal_costs * storage_p_dispatch).sum()
            #multiply correpsonding entries of the pandas columns and then sum them
            total_cost += storage_cost

            # Marginal cost for storage charging
            storage_marginal_costs_storage = self.network.storage_units['marginal_cost_storage']
            storage_store_power = self.network.storage_units_t.p_store.loc[current_snapshot]
            storage_store_cost = (storage_marginal_costs_storage * storage_store_power).sum()
            total_cost += storage_store_cost

            spill_costs = self.network.storage_units['spill_cost']
            spill_amounts = self.network.storage_units_t.spill.loc[current_snapshot]
            spill_cost = (spill_costs * spill_amounts).sum()
            total_cost += spill_cost

        # Apply snapshot weighting (this is crucial for proper cost calculation)
        total_cost *= snapshot_weighting

        return total_cost

    def _calculate_reward(self):
        """Calculate reward using stored objective components."""
        # Get the current snapshot name

        current_snapshot = self.network.snapshots[self.snapshot_idx]
        return -1 * self.evaluate_objective_direct(current_snapshot)

    def calculate_constrained_reward(self):
        """
        Calculate reward using summation method with dynamic constraint checking.

        Summation method:
        - Reward = -J(s) - P(s)
        """
        try:
            # Get base reward from objective function (negative for minimization)
            base_reward = self._calculate_reward()

            current_snapshot = self.network.snapshots[self.snapshot_idx]

            # Initialize constraint tracking
            constraint_results = {
                'all_satisfied': True,
                'violations': {},
                'total_violation': 0.0,
                'violations_by_group': {}
            }

            # 1. Check slack generator constraints
            slack_generators = self.network.generators[self.network.generators.control == "Slack"].index
            if not slack_generators.empty:
                for gen_name in slack_generators:
                    # Get actual power output after power flow
                    p_actual = self.network.generators_t.p.loc[current_snapshot, gen_name]

                    # Get limits
                    p_min = self.network.generators.loc[gen_name, 'p_min_pu'] * self.network.generators.loc[gen_name, 'p_nom']
                    p_max = self.network.generators.loc[gen_name, 'p_max_pu'] * self.network.generators.loc[gen_name, 'p_nom']

                    # Check lower bound
                    if p_actual < p_min:
                        violation = float(p_min - p_actual)
                        constraint_name = f"Generator-slack-p-lower[snapshot={current_snapshot},Generator={gen_name}]"
                        constraint_results['violations'][constraint_name] = violation
                        constraint_results['total_violation'] += violation
                        constraint_results['all_satisfied'] = False

                    # Check upper bound
                    if p_actual > p_max:
                        violation = float(p_actual - p_max)
                        constraint_name = f"Generator-slack-p-upper[snapshot={current_snapshot},Generator={gen_name}]"
                        constraint_results['violations'][constraint_name] = violation
                        constraint_results['total_violation'] += violation
                        constraint_results['all_satisfied'] = False

            # 2. Check line flow constraints (CORRECTED)
            for line_name in self.network.lines.index:
                # Get line parameters
                s_nom = self.network.lines.loc[line_name, 's_nom']
                s_max_pu = 1.0  # Default, or get from lines_t.s_max_pu if it exists

                # Calculate active power limit (this is what PyPSA's linear constraints check)
                s_max = s_max_pu * s_nom

                # Get active power flow from the linear power flow
                # In PyPSA's linear formulation, this is the 's' variable value
                p0 = abs(self.network.lines_t.p0.loc[current_snapshot, line_name])

                # Check if active power flow exceeds limit
                if p0 > s_max:
                    violation = float(p0 - s_max)
                    constraint_name = f"Line-fix-s-upper[snapshot={current_snapshot},Line={line_name}]"
                    constraint_results['violations'][constraint_name] = violation
                    constraint_results['total_violation'] += violation
                    constraint_results['all_satisfied'] = False


            # Calculate penalty
            total_violation = float(constraint_results['total_violation'])
            penalty = self.penalty_factor * total_violation

            # Calculate final reward using summation method
            constrained_reward = base_reward - penalty

            # Ensure reward is a scalar
            if hasattr(constrained_reward, '__len__'):
                constrained_reward = float(constrained_reward)

            return constrained_reward, constraint_results

        except Exception as e:
            print(f"Error calculating summation reward: {e}")
            # Fall back to base reward on error
            return self._calculate_reward(), {
                'all_satisfied': True,
                'violations': {},
                'total_violation': 0.0
            }


    def step(self, action):
        """
        Execute one time step within the environment.

        Args:
            action: Array of setpoints for all controllable components [disp_gen1, disp_gen2, ...,
                   renewable_gen1, renewable_gen2, ..., storage1, storage2, ...]

        Returns:
            observation: Network state after action
            reward: Reward for this action
            terminated: Whether episode is finished due to task completion
            truncated: Whether episode is finished due to time limit
            info: Additional information
        """
        scaled_actions = self.scale_action(action)
        # Apply dispatchable generator setpoints
        if self.n_dispatchable > 0:
            for i, gen_name in enumerate(self.dispatchable_names):
                self.network.generators_t.p_set.iloc[self.snapshot_idx, self.network.generators_t.p_set.columns.get_loc(gen_name)] = scaled_actions['dispatchable'][i]

        # Apply renewable generator setpoints
        if self.n_renewable > 0:
            for i, gen_name in enumerate(self.renewable_names):
                self.network.generators_t.p_set.iloc[self.snapshot_idx,
                    self.network.generators_t.p_set.columns.get_loc(gen_name)] = scaled_actions['renewable'][i]

        # Apply storage unit setpoints
        if self.n_storage > 0:
            for i, storage_name in enumerate(self.storage_names):
                self.network.storage_units_t.p_set.iloc[self.snapshot_idx,
                    self.network.storage_units_t.p_set.columns.get_loc(storage_name)] = scaled_actions['storage_p_set'][i]
                self.network.storage_units_t.p_dispatch.iloc[self.snapshot_idx,
                    self.network.storage_units_t.p_dispatch.columns.get_loc(storage_name)] = scaled_actions['storage_p_dispatch'][i]
                self.network.storage_units_t.p_store.iloc[self.snapshot_idx,
                    self.network.storage_units_t.p_store.columns.get_loc(storage_name)] = scaled_actions['storage_p_dispatch'][i] - scaled_actions['storage_p_set'][i]

            # Update state of charge using PyPSA's energy balance equation
                if self.snapshot_idx > 0:
                    self._update_storage_soc_single_snapshot(storage_name)
        # Run power flow to get new network state
        try:
            self.network.lpf(self.network.snapshots[self.snapshot_idx])
            power_flow_converged = True
        except Exception as e:
            print(f"Power flow failed: {e}")
            power_flow_converged = False

        # Calculate reward using constrained reward function
        reward, constraint_results = self.calculate_constrained_reward()

        # Increment step counters
        self.current_step += 1
        self.snapshot_idx += 1

        # Handle cycling through snapshots
        if self.snapshot_idx >= self.total_snapshots:
            self.snapshot_idx = 0
            self.reset_network()

        # Get new observation
        observation = self._get_observation()

        # Check if episode is done
        episode_done = self._check_done()
        terminated = False
        truncated = episode_done

        # Additional info
        info = {
            'dispatchable_setpoints': scaled_actions['dispatchable'],
            'renewable_setpoints': scaled_actions['renewable'],
            'storage_p_set': scaled_actions['storage_p_set'],
            'storage_p_dispatch': scaled_actions['storage_p_dispatch'],
            'power_flow_converged': power_flow_converged,
            'dispatchable_names': self.dispatchable_names,
            'renewable_names': self.renewable_names,
            'storage_names': self.storage_names,
            'current_step': self.current_step,
            'snapshot_idx': self.snapshot_idx,
            'constraints_satisfied': constraint_results['all_satisfied'],
            'constraint_violations': constraint_results['violations'],
            'total_violation': constraint_results['total_violation']
        }

        return observation, reward, terminated, truncated, info

    def _check_done(self):
        """
        Modified to handle both fixed and variable episode lengths.
        """
        # For fixed episodes, check step count
        if self.episode_length is not None and self.current_step >= self.episode_length:
            return True

        # For all episodes, stop if we've reached the test data boundary
        if self.snapshot_idx >= self.train_snapshots:
            return True

        return False

    def seed(self, seed=None):
        """
        Set the random seed for reproducible experiments.
        """
        np.random.seed(seed)
        return [seed]

    def render(self, mode='human', info=None):
        """
        Render the environment state.

        Parameters:
        -----------
        mode : str
            Rendering mode (only 'human' supported)
        info : dict, optional
            Information dictionary from step() method containing constraint data
        """
        print("=== Current Network State ===")
        print(f"Episode step: {self.current_step}/{self.episode_length}")
        print(f"Snapshot index: {self.snapshot_idx}/{self.total_snapshots}")
        print(f"Current snapshot: {self.network.snapshots[self.snapshot_idx]}")
        print(f"Generator setpoints: {self.network.generators_t.p_set.iloc[self.snapshot_idx].values}")
        print(f"Load values: {self.network.loads_t.p_set.iloc[self.snapshot_idx].values}")

        all_satisfied = info['constraints_satisfied']
        total_violation = info['total_violation']
        violations = info['constraint_violations']


        print(f"All constraints satisfied: {all_satisfied}")
        print(f"Total constraint violation: {total_violation:.4f}")

        # Show violated constraints if any
        if not all_satisfied and violations:
            print("\n=== Constraint Violations ===")
            for constraint_name, violation in violations.items():
                print(f"  {constraint_name}: {violation:.4f}")

# network_file_path= "/Users/antoniagrindrod/Documents/pypsa-earth_project/pypsa-earth-RL/networks/elec_s_10_ec_lc1.0_1h.nc"
# input_dir="/Users/antoniagrindrod/Documents/pypsa-earth_project/pypsa-earth-RL/RL/var_constraint_map"
# replacement_reward_offset=calculate_offset_k_initialization(network_file=network_file_path, input_dir=input_dir)

In [7]:
def evaluate_agent_on_test_data(env, agent, optimal_objective_value):
    """
    Evaluate trained agent on test data and compute MAPE and constraint violations.

    Parameters:
    -----------
    env : EnvDispatchConstr
        The environment with train/test split
    agent : trained RL agent
        Agent with a method to get actions from observations
    optimal_objective_value : float
        The optimal objective value from pypsa.optimize() on test snapshots

    Returns:
    --------
    dict : Evaluation metrics including MAPE and constraint violation percentage
    """

    # Reset environment to test data start
    obs, info = env.reset_for_testing()

    # Track metrics
    rl_total_reward = 0.0
    total_violations = 0
    total_constraint_checks = 0
    violation_snapshots = 0
    total_test_snapshots = env.test_snapshots

    # Run agent on all test snapshots
    for step in range(total_test_snapshots):
        # Get action from agent
        if hasattr(agent, 'get_action'):
            action = agent.get_action(obs)
        elif hasattr(agent, 'predict'):
            action = agent.predict(obs)
        else:
            # Assume agent is callable
            action = agent(obs)

        # Take step in environment
        obs, reward, terminated, truncated, info = env.step(action)

        # Calculate base reward (without offset k for replacement method)
        base_reward = env._calculate_reward()
        rl_total_reward += base_reward

        # Track constraint violations for this snapshot
        snapshot_violations = info['constraint_violations']
        snapshot_violation_count = len(snapshot_violations)

        if snapshot_violation_count > 0:
            violation_snapshots += 1

        # The constraint violations in info already correspond to the processed snapshot
        # Count total constraint checks and violations based on network structure
        slack_generators = env.network.generators[env.network.generators.control == "Slack"].index
        slack_constraint_count = len(slack_generators) * 2  # upper and lower bounds
        line_constraint_count = len(env.network.lines)

        snapshot_total_constraints = slack_constraint_count + line_constraint_count
        total_constraint_checks += snapshot_total_constraints
        total_violations += len(snapshot_violations)

        if terminated or truncated:
            break

    # Calculate MAPE
    # MAPE = |RL_objective - Optimal_objective| / |Optimal_objective| * 100%
    mape = abs(rl_total_reward - optimal_objective_value) / abs(optimal_objective_value) * 100.0

    # Calculate constraint violation percentage
    # This is the percentage of constraint checks that resulted in violations
    constraint_violation_percentage = (total_violations / total_constraint_checks) * 100.0 if total_constraint_checks > 0 else 0.0

    # Also calculate percentage of snapshots with violations
    snapshot_violation_percentage = (violation_snapshots / total_test_snapshots) * 100.0

    results = {
        'mape': mape,
        'rl_total_objective': rl_total_reward,
        'optimal_total_objective': optimal_objective_value,
        'constraint_violation_percentage': constraint_violation_percentage,
        'snapshot_violation_percentage': snapshot_violation_percentage,
        'total_violations': total_violations,
        'total_constraint_checks': total_constraint_checks,
        'violation_snapshots': violation_snapshots,
        'total_test_snapshots': total_test_snapshots
    }

    return results


In [8]:
class BackboneNetwork(nn.Module):
    def __init__(self, input_features, hidden_dimensions, out_features, dropout):
        super(BackboneNetwork, self).__init__()

        # SIMPLIFIED: Single hidden layer network for debugging
        self.neuralnet = nn.Sequential(
            nn.Linear(input_features, hidden_dimensions),
            nn.ReLU(),
            nn.Linear(hidden_dimensions, hidden_dimensions),
            nn.ReLU(),
            nn.Linear(hidden_dimensions, out_features)
        )

    def forward(self, x):
        output = self.neuralnet(x)
        return output

#Define the actor-critic network
class actorCritic(nn.Module):
    def __init__(self, actor, critic):
        super().__init__()
        self.actor = actor
        self.critic = critic
    def forward(self, state):
        action_pred = self.actor(state)
        value_pred = self.critic(state)
        return action_pred, value_pred
        #Returns both the action predictions and the value predictions.

#We'll use the networks defined above to create an actor and a critic. Then, we will create an agent, including the actor and the critic.
#finish this step later
# def create_agent(hidden_dimensions, dropout):
#     INPUT_FEATURES =env_train.
class PPO_agent:
    def __init__(self,
                 env,
                 device,
                 run,
                 hidden_dimensions,
                 dropout, discount_factor,
                 max_episodes,
                 print_interval,
                 PPO_steps,
                 n_trials,
                 epsilon,
                 entropy_coefficient,
                 learning_rate,
                 batch_size,
                 optimizer_name,
                 seed):

        self.seed = seed
        if seed is not None:
            # Set PyTorch seed for this class
            torch.manual_seed(seed)
            if torch.cuda.is_available():
                torch.cuda.manual_seed(seed)

        self.env = env  # Store the environment as an attribute

        self.device = device
        self.run = run

        # Get observation and action space dimensions for gymnasium environment
        obs, _ = self.env.reset()

        self.INPUT_FEATURES = obs.shape[0]  # Flattened observation size
        self.ACTOR_OUTPUT_FEATURES = self.env.action_space.shape[0]* 2  # 2 parameters (alpha, beta) per action dimension

        self.HIDDEN_DIMENSIONS = hidden_dimensions

        self.CRITIC_OUTPUT_FEATURES = 1
        self.DROPOUT = dropout

        self.discount_factor = discount_factor
        self.max_episodes = max_episodes
        self.print_interval = print_interval
        self.PPO_steps=PPO_steps
        self.n_trials=n_trials
        self.epsilon=epsilon
        self.entropy_coefficient=entropy_coefficient
        self.learning_rate=learning_rate

        self.batch_size=batch_size

        # Initialize actor network
        self.actor = BackboneNetwork(
            self.INPUT_FEATURES, self.HIDDEN_DIMENSIONS, self.ACTOR_OUTPUT_FEATURES, self.DROPOUT
        ).to(self.device)

        # Initialize the final layer bias for Beta distribution
        for name, param in self.actor.named_parameters():
            if 'neuralnet.4.bias' in name:  # Adjust index based on your network structure
                # Initialize to produce alpha=beta=2 (uniform-like distribution centered at 0.5)
                param.data.fill_(0.0)  # softplus(0) + 1 = 2
                print(f"Initialized Beta parameters to produce uniform-like distribution")

        # Initialize critic network
        self.critic = BackboneNetwork(
            self.INPUT_FEATURES, self.HIDDEN_DIMENSIONS, self.CRITIC_OUTPUT_FEATURES, self.DROPOUT
        ).to(self.device)

        #Better move the .to(self.device) call separately for both self.actor and self.critic. This ensures the individual parts of the model are moved to the correct device before combined into the actorCritic class
        # Combine into a single actor-critic model
        self.model = actorCritic(self.actor, self.critic)

        try:
            # Try to get the optimizer from torch.optim based on the provided name
            self.optimizer = getattr(torch.optim, optimizer_name)(self.model.parameters(), lr=self.learning_rate)
        except AttributeError:
            # Raise an error if the optimizer_name is not valid
            raise ValueError(f"Optimizer '{optimizer_name}' is not available in torch.optim.")

    def calculate_returns(self, rewards):
        returns = []
        cumulative_reward = 0
        for r in reversed(rewards):
            cumulative_reward = r +cumulative_reward*self.discount_factor
            returns.insert(0, cumulative_reward)
        returns = torch.tensor(returns).to(self.device)

        # Only normalize if we have more than one element to avoid std() warning
        if returns.numel() > 1:
            epsilon = 1e-8  # Small constant to avoid division by zero
            returns_std = returns.std()
            if not torch.isnan(returns_std) and returns_std >= epsilon:
                returns = (returns - returns.mean()) / (returns_std + epsilon)

        #I had conceptual trouble with normalizing the reward by an average, because it seemed to me since we're adding more rewards for earlier timesteps, the cumulative reward for earlier times would be a lot larger. But need to consider dicount facotr.
        # Future rewards contribute significantly to the cumulative return, so earlier timesteps will likely have larger returns.
        #if gamma is close to 0, future rewards have little influence, and the return at each timestep will closely resemble the immediate reward, meaning the pattern might not be as clear.
        return returns

    #The advantage is calculated as the difference between the value predicted by the critic and the expected return from the actions chosen by the actor according to the policy.
    def calculate_advantages(self, returns, values):
        advantages = returns - values

        # Only normalize if we have more than one element to avoid std() warning
        if advantages.numel() > 1:
            epsilon = 1e-8
            advantages_std = advantages.std()
            if not torch.isnan(advantages_std) and advantages_std >= epsilon:
                advantages = (advantages - advantages.mean()) / (advantages_std + epsilon)

        return advantages

    #The standard policy gradient loss is calculated as the product of the policy action probabilities and the advantage function
    #The standard policy gradietn loss cannot make corrections for abrupt policy changes. The surrogate loss modifies the standard loss to restrict the amount the policy can change in each iteration.
    #The surrogate loss is the minimum of (policy ratio X advantage function) and (clipped value of policy ratio X advantage function) where the policy ratio is between the action probabilities according to the old versus new policies and clipping restricts the value to a region near 1.

    def calculate_surrogate_loss(self, actions_log_probability_old, actions_log_probability_new, advantages):
        advantages = advantages.detach()
        # creates a new tensor that shares the same underlying data as the original tensor but breaks the computation graph. This means:
        # The new tensor is treated as a constant with no gradients.
        # Any operations involving this tensor do not affect the gradients of earlier computations in the graph.

        #If the advantages are not detached, the backpropagation of the loss computed using the surrogate_loss would affect both the actor and the critic networks
        # The surrogate loss is meant to update only the policy (actor).
        # Allowing gradients to flow back through the advantages would inadvertently update the critic, potentially disrupting its learning process.

        policy_ratio  = (actions_log_probability_new - actions_log_probability_old).exp()
        surrogate_loss_1 = policy_ratio*advantages
        surrogate_loss_2 = torch.clamp(policy_ratio, min =1.0-self.epsilon, max = 1.0+self.epsilon)*advantages
        surrogate_loss=torch.min(surrogate_loss_1, surrogate_loss_2)
        return surrogate_loss

    #TRAINING THE AGENT
    #Policy loss is the sum of the surrogate loss and the entropy bonus. It is used to update the actor (policy network)
    #Value loss is based on the difference between the value predicted by the critic and the returns (cumulative reward) generated by the policy. This loss is used to update the critic (value network) to make predictions more accurate.

    def calculate_losses(self, surrogate_loss, entropy, returns, value_pred):
        entropy_bonus = self.entropy_coefficient*entropy
        policy_loss = -(surrogate_loss+entropy_bonus).sum()
        value_loss = torch.nn.functional.smooth_l1_loss(returns, value_pred).sum() #helps to smoothen the loss function and makes it less sensitive to outliers.
        return policy_loss, value_loss

    def init_training(self):
        #create a set of buffers as empty arrays. To be used during training to store information
        states = []
        actions = []
        actions_log_probability = []
        values = []
        rewards = []
        done = False
        episode_reward = 0
        return states, actions, actions_log_probability, values, rewards, done, episode_reward

    def forward_pass(self):#this is just the training function (might just want to rename it)
        # # === DETAILED OBJECT ANALYSIS ===
        # import psutil
        # import gc

        # if not hasattr(self, '_episode_counter'):
        #     self._episode_counter = 0
        # self._episode_counter += 1

        # mem_mb = psutil.Process().memory_info().rss / 1024 / 1024

        # # Get ALL objects with "Network" in their type name
        # network_objects = [obj for obj in gc.get_objects() if 'network' in str(type(obj)).lower()]

        # print(f"\n=== EPISODE {self._episode_counter} OBJECT ANALYSIS ===")
        # print(f"Memory: {mem_mb:.1f}MB")
        # print(f"Total objects with 'network' in type: {len(network_objects)}")

        # # Count by exact type
        # type_counts = {}
        # for obj in network_objects:
        #     obj_type = str(type(obj))
        #     type_counts[obj_type] = type_counts.get(obj_type, 0) + 1

        # # Print breakdown
        # for obj_type, count in type_counts.items():
        #     print(f"  {obj_type}: {count}")

        # # Show actual PyPSA Network objects specifically
        # actual_networks = [obj for obj in gc.get_objects() if type(obj).__name__ == 'Network' and 'pypsa' in str(type(obj))]
        # print(f"Actual PyPSA Network objects: {len(actual_networks)}")

        # if len(actual_networks) <= 5:  # Only print if reasonable number
        #     for i, net in enumerate(actual_networks):
        #         print(f"  Network {i+1}: {id(net)} - {type(net)}")

        # network_id = id(self.env.network) if hasattr(self.env, 'network') else None
        # print(f"Current env.network ID: {network_id}")
        # print("=" * 50)
        # # === END ANALYSIS ===

        # Reset environment with seed
        if self.seed is not None:
            state, _ = self.env.reset(seed=self.seed)
        else:
            state, _ = self.env.reset()

        states, actions, actions_log_probability, values, rewards, done, episode_reward = self.init_training()

        # Add this line to track violations
        total_violations = 0

        # # Create fresh network for each episode to avoid memory corruption
        # fresh_network = create_pypsa_network()
        # self.env.network = fresh_network

        state, _ = self.env.reset()  # Gymnasium format returns (obs, info)

        self.model.train() # Set model to training mode

        while True:
            state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device)
            states.append(state_tensor)

            # Get action predictions and values
            action_mean, value_pred = self.model(state_tensor)



            # Split actor output into alpha and beta parameters
            action_dim = self.env.action_space.shape[0]
            alpha_raw, beta_raw = torch.split(action_mean, action_dim, dim=-1)

            # Ensure alpha, beta > 1 for well-behaved Beta distribution
            alpha = torch.nn.functional.softplus(alpha_raw) + 1.0
            beta = torch.nn.functional.softplus(beta_raw) + 1.0

            # Create Beta distribution for continuous actions in [0,1]
            dist = torch.distributions.Beta(alpha, beta)
            action = dist.sample()

            # No clamping needed - Beta distribution naturally outputs [0,1]
            action_clamped = action

            log_prob_action = dist.log_prob(action).sum(dim=-1)  # Sum over action dimensions

            # Step environment with numpy action
            action_np = action_clamped.detach().cpu().numpy().flatten()
            state, reward, terminated, truncated, info = self.env.step(action_np)
            done = terminated or truncated

            #accumulate violations for the epsiode
            total_violations += sum(info['constraint_violations'].values())

            actions.append(action_clamped)
            actions_log_probability.append(log_prob_action)
            values.append(value_pred)
            rewards.append(reward)
            episode_reward += reward

            if done:
                break

        states=torch.cat(states).to(self.device)#converts the list of individual states into a sinlem tensor that is necessary for later processing
        #Creates a single tensor with dimensions like (N, state_dim), where: N is the number of states collected in the episode; state_dim is the dimensionality of each state.
        #torch.cat() expects a sequence (e.g. list or tuple) of PyTorch tensors as input.
        actions=torch.cat(actions).to(self.device)
        #Note that, in the loop, both state and action are PyTorch tensors so that states and actions are both lists of PyTorch tensors
        actions_log_probability=torch.cat(actions_log_probability).to(self.device)
        values=torch.cat(values).squeeze(-1).to(self.device)# .squeeze removes a dimension of size 1 only from tensor at the specified position, in this case, -1, the last dimesion in the tensor. Note that .squeeze() does not do anything if the size of the dimension at the specified potision is not 1.
        # print(f"rewards NaNs: {torch.isnan(torch.tensor(rewards, dtype=torch.float32)).any()}")
        # print(f"values NaNs: {torch.isnan(torch.tensor(values, dtype=torch.float32)).any()}")
        returns = self.calculate_returns(rewards)
        advantages = self.calculate_advantages(returns, values)

        # print(f"Returns NaNs: {torch.isnan(returns).any()}")
        # print(f"advantages NaNs (after calculation): {torch.isnan(advantages).any()}")

        return episode_reward, states, actions, actions_log_probability, advantages, returns, total_violations


    def update_policy(self,
            states,
            actions,
            actions_log_probability_old,
            advantages,
            returns):
        #print(f"Returns NaNs: {torch.isnan(returns).any()}")
        total_policy_loss = 0
        total_value_loss = 0
        actions_log_probability_old = actions_log_probability_old.detach()
        actions=actions.detach()

        # print(f"Returns NaNs: {torch.isnan(returns).any()}")
        # print(f"advantages NaNs (after calculation): {torch.isnan(advantages).any()}")


        #detach() is used to remove the tensor from the computation graph, meaning no gradients will be calculated for that tensor when performing backpropagation.
        #In this context, it's used to ensure that the old actions and log probabilities do not participate in the gradient computation during the optimization of the policy, as we want to update the model based on the current policy rather than the old one.
        #print(type(states), type(actions),type(actions_log_probability_old), type(advantages), type(returns))
        training_results_dataset= TensorDataset(
                states,
                actions,
                actions_log_probability_old,
                advantages,
                returns) #TensorDataset class expects all the arguments passed to it to be tensors (or other compatible types like NumPy arrays, which will be automatically converted to tensor
        batch_dataset = DataLoader(
                training_results_dataset,
                batch_size=self.batch_size,
                shuffle=False)
        #creates a DataLoader instance in PyTorch, which is used to load the training_results_dataset in batches during training.
        #batch_size defines how many samples will be included in each batch. The dataset will be divided into batches of size BATCH_SIZE. The model will then process one batch at a time, rather than all of the data at once,
        #shuffle argument controls whether or not the data will be shuffled before being split into batches.
        #Because shuffle is false, dataloader will provide the batches in the order the data appears in training_results_dataset. In this case, the batches will be formed from consecutive entries in the dataset, and the observations will appear in the same sequence as they are stored in the dataset.
        for _ in range(self.PPO_steps):
            for batch_idx, (states,actions,actions_log_probability_old, advantages, returns) in enumerate(batch_dataset):
                #get new log prob of actions for all input states
                action_mean, value_pred = self.model(states)
                value_pred = value_pred.squeeze(-1)

                # For continuous actions with Beta distribution
                action_dim = self.env.action_space.shape[0]
                alpha_raw, beta_raw = torch.split(action_mean, action_dim, dim=-1)

                # Ensure alpha, beta > 1 for well-behaved Beta distribution
                alpha = torch.nn.functional.softplus(alpha_raw) + 1.0
                beta = torch.nn.functional.softplus(beta_raw) + 1.0

                probability_distribution_new = torch.distributions.Beta(alpha, beta)
                entropy = probability_distribution_new.entropy().sum(dim=-1)

                #estimate new log probabilities using old actions
                actions_log_probability_new = probability_distribution_new.log_prob(actions).sum(dim=-1)
                # # Check for NaN or Inf in log probabilities
                # if torch.isnan(actions_log_probability_old).any() or torch.isinf(actions_log_probability_old).any():
                #     print("NaN or Inf detected in actions_log_probability_old!")
                #     return  # You can return or handle this case as needed

                # if torch.isnan(actions_log_probability_new).any() or torch.isinf(actions_log_probability_new).any():
                #     print("NaN or Inf detected in actions_log_probability_new!")
                #     return  # You can return or handle this case as needed

                # print(f"actions_log_probability_old NaNs: {torch.isnan(actions_log_probability_old).any()}")
                # print(f"actions_log_probability_new NaNs: {torch.isnan(actions_log_probability_new).any()}")
                # print(f"advantages NaNs: {torch.isnan(advantages).any()}")

                surrogate_loss = self.calculate_surrogate_loss(
                    actions_log_probability_old,
                    actions_log_probability_new,
                    advantages
                )

                # print(f"Surrogate Loss NaNs: {torch.isnan(surrogate_loss).any()}")
                # print(f"Entropy NaNs: {torch.isnan(entropy).any()}")
                # print(f"Returns NaNs: {torch.isnan(returns).any()}")
                # print(f"Value Predictions NaNs: {torch.isnan(value_pred).any()}")

                policy_loss, value_loss = self.calculate_losses(
                    surrogate_loss,
                    entropy,
                    returns,
                    value_pred
                )
                self.optimizer.zero_grad() #clear existing gradietns in the optimizer (so that these don't propagate accross multiple .backward(). Ensures each optimization step uses only the gradients computed during the current batch.

                # Skip backward pass if loss is NaN
                if torch.isnan(policy_loss).any():
                    print("NaN detected in policy_loss - skipping backward pass!")
                    continue
                if torch.isnan(value_loss).any():
                    print("NaN detected in value_loss - skipping backward pass!")
                    continue

                policy_loss.backward() #computes gradients for policy_loss with respect to the agent's parameters
                # #Check for NaN gradients after policy_loss backward
                # for param in self.model.parameters():
                #     if param.grad is not None:  # Check if gradients exist for this parameter
                #         if torch.isnan(param.grad).any():
                #             print("NaN gradient detected in policy_loss!")
                # #             return
                value_loss.backward()
                # Check for NaN gradients after value_loss backwardor param in self.model.parameters():
                # for param in self.model.parameters():
                #     if param.grad is not None:  # Check if gradients exist for this parameter
                #         if torch.isnan(param.grad).any():
                #             print("NaN gradient detected in value_loss!")
                #             return

                self.optimizer.step()
                #The update step is based on the learning rate and other hyperparameters of the optimizer
                # The parameters of the agent are adjusted to reduce the policy and value losses.
                total_policy_loss += policy_loss.item() #accumulate the scalar value of the policy loss for logging/ analysis
                #policy_loss.item() extracts the numerical value of the loss tensor (detaching it from the computational graph).
                #This value is added to total_policy_loss to compute the cumulative loss over all batches in the current PPO step.
                #Result: tracks the total policy loss for the current training epoch
                # The loss over the whole dataset is the sum of the losses over all batches.
                #The training dataset is split into batches during the training process. Each batch represents a subset of the collected training data from one episode.
                # Loss calculation is performed for each batch (policy loss and value loss)
                # for each batch, gradients are calculated with respect to the total loss for that batch and the optimizer then updates the network parameters using these gradients.
                # this is because the surrogate loss is only calculated over a single batch of data
                #look at the formula for surrogate loss.
                # It is written in terms of an expectation ˆ Et[. . .] that indicates the empirical average over a finite batch of samples.
                # This means you have collected a set of data (time steps) from the environment, and you're averaging over these data points. The hat symbol implies you're approximating the true expectation with a finite sample of data from the environment. This empirical average can be computed as the mean of values from the sampled transitions
                # the expectation is taken over all the data you've collected
                #If you're training with multiple batches (i.e., collecting data in chunks), then you can think of the expectation as being computed over each batch.
                #The overall expectation can indeed be seen as the sum of expectations computed for each batch, but The expectation of the sum is generally not exactly equal to the sum of the expectations unless the samples are independent, but in practical reinforcement learning algorithms, it's typically a good enough approximation
                #For samples to be independent, the outcome of one sample must not provide any information about the outcome of another. Specifically, in the context of reinforcement learning, this means that the states, actions, rewards, and subsequent states observed in different time steps or different episodes should be independent of each other.
                total_value_loss += value_loss.item()
                #Notice that we are calculating an empirical average, which is already an approximation on the true value (the true expectation would be the average over an infinite amount of data, and the empirical average is the average over the finite amount of data that we have collected).
                #But furthermore, we are approximating even the empirical average istelf. The empirical average is the average over all our collected datal, but here we actually batch our data, calculate average over each batch and then sum these averages, which is not exaclty equal to the average of the sums (but is a decent approximation).
        return total_policy_loss / self.PPO_steps, total_value_loss / self.PPO_steps

    def train(self):
        train_rewards = []
        # test_rewards = []
        # policy_losses = []
        # value_losses = []
        #lens = []

        for episode in range(1, self.max_episodes + 1):
            # Perform a forward pass and collect experience
            train_reward, states, actions, actions_log_probability, advantages, returns, violations = self.forward_pass()

            # Update the policy using the experience collected
            policy_loss, value_loss = self.update_policy(
                states,
                actions,
                actions_log_probability,
                advantages,
                returns)
            # test_reward = self.evaluate()

            # # Visualize the environment if it supports rendering (currently this is done once each episode - might want to change to once every multiple of episodes)
            # if hasattr(self.env, "render") and callable(getattr(self.env, "render", None)):
            #   self.env.render()

            # Log the results
            # policy_losses.append(policy_loss)
            # value_losses.append(value_loss)
            train_rewards.append(train_reward)
            # # run these when back online
            # self.run["policy_loss"].log(policy_loss)
            # self.run["value_loss"].log(value_loss)
            self.run["train_reward"].log(train_reward)
            self.run["total_violation"].log(violations)

            # Calculate the mean of recent rewards and losses for display
            mean_train_rewards = np.mean(train_rewards[-self.n_trials:])
            #mean_test_rewards = np.mean(test_rewards[-self.n_trials:])
            # mean_abs_policy_loss = np.mean(np.abs(policy_losses[-self.n_trials:]))
            # mean_abs_value_loss = np.mean(np.abs(value_losses[-self.n_trials:]))

            # Print results at specified intervals
            if episode % self.print_interval == 0:
                print(f'Episode: {episode:3} | \
                    Train Rewards: {train_reward:3.1f} \
                    Violations: {violations}\
                    Mean Train Rewards: {mean_train_rewards:3.1f}' )
                    # \
                    # | Mean Abs Policy Loss: {mean_abs_policy_loss:2.2f} \
                    # | Mean Abs Value Loss: {mean_abs_value_loss:2.2f} ')



                                    # | Mean Test Rewards: {mean_test_rewards:3.1f} \
                                    #| "Episode Len: {np.mean(lens[-self.n_trials:])}



            # # Check if reward threshold is reached
            # if mean_test_rewards >= self.reward_threshold:
            #     print(f'Reached reward threshold in {episode} episodes')
            #     break
        # Check if the environment has a close method before calling it
        # if hasattr(self.env, "close") and callable(getattr(self.env, "close", None)):
        #   self.env.close() #Close environment visualisation after training is done.
        return train_rewards

def plot_train_rewards(train_rewards, reward_threshold):
    plt.figure(figsize=(12, 8))
    plt.plot(train_rewards, label='Training Reward')
    plt.xlabel('Episode', fontsize=20)
    plt.ylabel('Training Reward', fontsize=20)
    plt.hlines(reward_threshold, 0, len(train_rewards), color='y')
    plt.legend(loc='lower right')
    plt.grid()
    plt.show()

def plot_test_rewards(test_rewards, reward_threshold):
    plt.figure(figsize=(12, 8))
    plt.plot(test_rewards, label='Testing Reward')
    plt.xlabel('Episode', fontsize=20)
    plt.ylabel('Testing Reward', fontsize=20)
    plt.hlines(reward_threshold, 0, len(test_rewards), color='y')
    plt.legend(loc='lower right')
    plt.grid()
    plt.show()

def plot_losses(policy_losses, value_losses):
    plt.figure(figsize=(12, 8))
    plt.plot(value_losses, label='Value Losses')
    plt.plot(policy_losses, label='Policy Losses')
    plt.xlabel('Episode', fontsize=20)
    plt.ylabel('Loss', fontsize=20)
    plt.legend(loc='lower right')
    plt.grid()
    plt.show()

In [9]:
import ipdb
class EnvDispatchReplacement(EnvDispatchConstr):
    """
    Environment using the Replacement reward method instead of Summation.

    Inherits from Env2Gen1LoadConstr but modifies the reward calculation
    to implement the replacement method from the RL-OPF paper.
    """

    def __init__(self,network_file, constraint_penalty_factor=100, offset_k=2500, test_start_date='2013-12-01 00:00:00',
                 fixed_episode_length=None):
        """
        Initialize the replacement reward environment.

        Parameters:
        -----------
        network_file : str
            Path to the PyPSA network file
        episode_length : int, optional
            Length of episodes (defaults to total snapshots)
        constraint_penalty_factor : float
            Penalty factor for constraint violations
        offset_k : float
            Offset value for replacement reward method
        test_start_date : str
            Start date for test period (everything from this date onwards is test data)
        fixed_episode_length : int, optional
            Fixed episode length if specified, otherwise episodes are variable
        """
        # Call parent constructor - this will initialize all base attributes
        super().__init__(
            network_file=network_file,
            constraint_penalty_factor=constraint_penalty_factor,
            test_start_date=test_start_date,
            fixed_episode_length=fixed_episode_length
        )

        # Add replacement-specific attributes
        self.offset_k = offset_k
        self.reward_method = "replacement"

        # Store initialization parameters for offset calculation
        self.k_method = "mean"  # Default method for k calculation
        self.k_samples = 1000  # Default number of samples for k calculation

    def calculate_constrained_reward(self):
        """
        Calculate reward using replacement method with dynamic constraint checking.

        Replacement method:
        - If all constraints satisfied: return -J(s) + k
        - If constraints violated: return -P(s)
        """
        try:
            # Get base reward from objective function (negative for minimization)
            base_reward = self._calculate_reward()
            # Get constraint violations using the same logic as the parent class
            # But we need to implement it here directly instead of calling super()
            current_snapshot = self.network.snapshots[self.snapshot_idx]

            # Initialize constraint tracking
            constraint_results = {
                'all_satisfied': True,
                'violations': {},
                'total_violation': 0.0,
                'violations_by_group': {}
            }

            # 1. Check slack generator constraints
            slack_generators = self.network.generators[self.network.generators.control == "Slack"].index
            if not slack_generators.empty:
                for gen_name in slack_generators:
                    # Get actual power output after power flow
                    p_actual = self.network.generators_t.p.loc[current_snapshot, gen_name]

                    # Get limits
                    p_min = self.network.generators.loc[gen_name, 'p_min_pu'] * self.network.generators.loc[gen_name, 'p_nom']
                    p_max = self.network.generators.loc[gen_name, 'p_max_pu'] * self.network.generators.loc[gen_name, 'p_nom']

                    # Check lower bound
                    if p_actual < p_min:
                        violation = float(p_min - p_actual)
                        constraint_name = f"Generator-slack-p-lower[snapshot={current_snapshot},Generator={gen_name}]"
                        constraint_results['violations'][constraint_name] = violation
                        constraint_results['total_violation'] += violation
                        constraint_results['all_satisfied'] = False

                    # Check upper bound
                    if p_actual > p_max:
                        violation = float(p_actual - p_max)
                        constraint_name = f"Generator-slack-p-upper[snapshot={current_snapshot},Generator={gen_name}]"
                        constraint_results['violations'][constraint_name] = violation
                        constraint_results['total_violation'] += violation
                        constraint_results['all_satisfied'] = False

            # 2. Check line flow constraints (CORRECTED)
            for line_name in self.network.lines.index:
                # Get line parameters
                s_nom = self.network.lines.loc[line_name, 's_nom']
                s_max_pu = 1.0  # Default, or get from lines_t.s_max_pu if it exists

                # Calculate active power limit (this is what PyPSA's linear constraints check)
                s_max = s_max_pu * s_nom

                # Get active power flow from the linear power flow
                # In PyPSA's linear formulation, this is the 's' variable value
                p0 = abs(self.network.lines_t.p0.loc[current_snapshot, line_name])

                # Check if active power flow exceeds limit
                if p0 > s_max:
                    violation = float(p0 - s_max)
                    constraint_name = f"Line-fix-s-upper[snapshot={current_snapshot},Line={line_name}]"
                    constraint_results['violations'][constraint_name] = violation
                    constraint_results['total_violation'] += violation
                    constraint_results['all_satisfied'] = False

            # Apply replacement method
            if constraint_results['all_satisfied']:
                # All constraints satisfied: return optimization reward + offset k
                constrained_reward = base_reward + self.offset_k
            else:
                # Constraints violated: return only penalty (negative)
                total_violation = float(constraint_results['total_violation'])
                constrained_reward = -self.penalty_factor * total_violation

            # Ensure reward is a scalar
            if hasattr(constrained_reward, '__len__'):
                constrained_reward = float(constrained_reward)

            return constrained_reward, constraint_results

        except Exception as e:
            print(f"Error calculating replacement reward: {e}")
            # Fall back to base reward on error
            return self._calculate_reward(), {
                'all_satisfied': True,
                'violations': {},
                'total_violation': 0.0
            }

    def get_reward_method_info(self):
        """
        Get information about the reward method being used.

        Returns:
        --------
        dict: Information about the reward method
        """
        return {
            'method': 'replacement',
            'offset_k': self.offset_k,
            'k_method': self.k_method,
            'k_samples': self.k_samples,
            'penalty_factor': self.penalty_factor,
            'train_snapshots': self.train_snapshots,
            'test_snapshots': self.test_snapshots,
            'test_start_date': str(self.test_start_date),
            'fixed_episode_length': self.fixed_episode_length
        }

# network_file_path= "/Users/antoniagrindrod/Documents/pypsa-earth_project/pypsa-earth-RL/networks/elec_s_10_ec_lc1.0_1h.nc"
# input_dir="/Users/antoniagrindrod/Documents/pypsa-earth_project/pypsa-earth-RL/RL/var_constraint_map"
# replacement_reward_offset=calculate_offset_k_initialization(network_file=network_file_path, input_dir=input_dir)

In [10]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
# Simple Sweep Tracker - Only tracks successful completions
import json
import os
from datetime import datetime
import pandas as pd

class SimpleSweepTracker:
    def __init__(self, base_dir='/content/drive/MyDrive/Colab_Notebooks/sweep_results'):
        self.base_dir = base_dir
        self.config_file = os.path.join(base_dir, 'all_configs.json')
        self.completed_file = os.path.join(base_dir, 'completed_runs.csv')

        os.makedirs(base_dir, exist_ok=True)

    def setup_configs(self):
        """Generate and save all configurations - run once"""
        configs = self._generate_all_configs()

        with open(self.config_file, 'w') as f:
            json.dump(configs, f, indent=2)

        print(f"✓ Created {len(configs)} configurations")
        print(f"✓ Saved to: {self.config_file}")
        return len(configs)

    def _generate_all_configs(self):
        """Generate configs using sweep_params like your original code"""
        base_params = {
            "optimizer_name": "Adam",
            "MAX_EPISODES": 10000,
            "PRINT_INTERVAL": 100,
            "N_TRIALS": 8,
            "DROPOUT": 0,
            "network_file": "elec_s_10_ec_lc1.0_1h.nc",
            "optimization_result_file": "elec_s_10_ec_lc1.0_1h_Test_Objective.txt"
        }

        # Parameters to sweep (from your original code)
        sweep_params = {
            "LEARNING_RATE": [1e-4, 3e-4, 1e-3, 3e-3],
            "EPSILON": [0.1, 0.2, 0.3],
            "ENTROPY_COEFFICIENT": [0.01, 0.05, 0.1],
            "HIDDEN_DIMENSIONS": [32, 64, 128],
            "PPO_STEPS": [8, 16],
            "BATCH_SIZE": [128, 256],
            "DISCOUNT_FACTOR": [0.95, 0.99],
            "constraint_penalty_factor": [0, 25, 50, 100],
            "env_class": ["EnvDispatchConstr", "EnvDispatchReplacement"]
        }

        # Your priority configurations (these are good combinations to try first)
        priority_configs = [
            {"LEARNING_RATE": 1e-3, "EPSILON": 0.3, "ENTROPY_COEFFICIENT": 0.1,
             "HIDDEN_DIMENSIONS": 64, "PPO_STEPS": 16, "BATCH_SIZE": 256,
             "DISCOUNT_FACTOR": 0.99, "constraint_penalty_factor": 0, "episode_length": 4,
             "env_class": "EnvDispatchConstr"},
            {"LEARNING_RATE": 3e-4, "EPSILON": 0.2, "ENTROPY_COEFFICIENT": 0.05,
             "HIDDEN_DIMENSIONS": 64, "PPO_STEPS": 16, "BATCH_SIZE": 256,
             "DISCOUNT_FACTOR": 0.99, "constraint_penalty_factor": 0, "episode_length": 4,
             "env_class": "EnvDispatchConstr"},
            {"LEARNING_RATE": 1e-4, "EPSILON": 0.1, "ENTROPY_COEFFICIENT": 0.01,
             "HIDDEN_DIMENSIONS": 128, "PPO_STEPS": 16, "BATCH_SIZE": 256,
             "DISCOUNT_FACTOR": 0.99, "constraint_penalty_factor": 0, "episode_length": 4,
             "env_class": "EnvDispatchConstr"},
            {"LEARNING_RATE": 3e-3, "EPSILON": 0.3, "ENTROPY_COEFFICIENT": 0.1,
             "HIDDEN_DIMENSIONS": 32, "PPO_STEPS": 8, "BATCH_SIZE": 128,
             "DISCOUNT_FACTOR": 0.95, "constraint_penalty_factor": 0, "episode_length": 4,
             "env_class": "EnvDispatchConstr"},
            {"LEARNING_RATE": 1e-3, "EPSILON": 0.3, "ENTROPY_COEFFICIENT": 0.1,
             "HIDDEN_DIMENSIONS": 64, "PPO_STEPS": 16, "BATCH_SIZE": 256,
             "DISCOUNT_FACTOR": 0.99, "constraint_penalty_factor": 100, "episode_length": 4,
             "env_class": "EnvDispatchConstr"},
            {"LEARNING_RATE": 3e-4, "EPSILON": 0.2, "ENTROPY_COEFFICIENT": 0.05,
             "HIDDEN_DIMENSIONS": 64, "PPO_STEPS": 16, "BATCH_SIZE": 256,
             "DISCOUNT_FACTOR": 0.99, "constraint_penalty_factor": 100, "episode_length": 4,
             "env_class": "EnvDispatchConstr"},
            {"LEARNING_RATE": 1e-4, "EPSILON": 0.1, "ENTROPY_COEFFICIENT": 0.01,
             "HIDDEN_DIMENSIONS": 128, "PPO_STEPS": 16, "BATCH_SIZE": 256,
             "DISCOUNT_FACTOR": 0.99, "constraint_penalty_factor": 100, "episode_length": 4,
             "env_class": "EnvDispatchConstr"},
            {"LEARNING_RATE": 3e-3, "EPSILON": 0.3, "ENTROPY_COEFFICIENT": 0.1,
             "HIDDEN_DIMENSIONS": 32, "PPO_STEPS": 8, "BATCH_SIZE": 128,
             "DISCOUNT_FACTOR": 0.95, "constraint_penalty_factor": 100, "episode_length": 4,
             "env_class": "EnvDispatchConstr"},
        ]

        # Generate random configurations from sweep_params if you want more
        import random
        random.seed(42)  # For reproducible random configs

        num_random_configs = 5  # Add some random combinations
        random_configs = []

        for _ in range(num_random_configs):
            config = {param: random.choice(values) for param, values in sweep_params.items()}
            config["episode_length"] = 4  # Add any fixed params for random configs
            random_configs.append(config)

        # Combine priority and random configs
        all_param_configs = priority_configs + random_configs

        # Generate configs with seeds - SEED FIRST, then configs
        seeds = [42, 123, 7]
        all_configs = []
        run_id = 0

        # For each seed, go through all parameter configs
        for seed in seeds:
            for config_idx, param_config in enumerate(all_param_configs):
                run_config = {
                    **base_params,
                    **param_config,
                    "run_id": run_id,
                    "config_name": f"Config_{config_idx}_Seed_{seed}",
                    "config_idx": config_idx,
                    "seed": seed
                }
                all_configs.append(run_config)
                run_id += 1

        return all_configs

    def get_completed_runs(self):
        """Get list of successfully completed run IDs"""
        if not os.path.exists(self.completed_file):
            return []
        df = pd.read_csv(self.completed_file)
        return df['run_id'].tolist()

    def mark_completed(self, run_id, results):
        """Mark a run as completed and save results"""
        # Prepare the data
        completion_data = {
            'run_id': run_id,
            'completed_at': datetime.now().isoformat(),
            **results  # Add all the results
        }

        # Append to completed runs file
        new_row = pd.DataFrame([completion_data])

        if os.path.exists(self.completed_file):
            existing_df = pd.read_csv(self.completed_file)
            combined_df = pd.concat([existing_df, new_row], ignore_index=True)
        else:
            combined_df = new_row

        combined_df.to_csv(self.completed_file, index=False)
        print(f"✓ Marked run {run_id} as completed")

    def get_next_run(self):
        """Get the next run that hasn't been completed"""
        # Load all configs
        with open(self.config_file, 'r') as f:
            all_configs = json.load(f)

        # Get completed run IDs
        completed_ids = self.get_completed_runs()

        # Find first uncompleted run
        for config in all_configs:
            if config['run_id'] not in completed_ids:
                return config

        return None  # All done!

    def status(self):
        """Show simple status"""
        # Load configs
        with open(self.config_file, 'r') as f:
            total_configs = len(json.load(f))

        completed_ids = self.get_completed_runs()
        completed_count = len(completed_ids)
        remaining = total_configs - completed_count

        print(f"📊 Sweep Status:")
        print(f"   Total runs: {total_configs}")
        print(f"   Completed: {completed_count} ({completed_count/total_configs*100:.1f}%)")
        print(f"   Remaining: {remaining}")

        if remaining > 0:
            next_run = self.get_next_run()
            if next_run:
                print(f"   Next run: {next_run['run_id']} ({next_run['config_name']})")
        else:
            print("   🎉 All runs completed!")

        return {'total': total_configs, 'completed': completed_count, 'remaining': remaining}

    def show_recent_completions(self, n=5):
        """Show recently completed runs"""
        if not os.path.exists(tracker.completed_file):
            print("No completed runs yet")
            return

        df = pd.read_csv(tracker.completed_file)
        if len(df) == 0:
            print("No completed runs yet")
            return

        df['completed_at'] = pd.to_datetime(df['completed_at'])
        recent = df.sort_values('completed_at', ascending=False).head(n)

        print(f"🕒 Last {min(n, len(df))} completed runs:")
        for _, row in recent.iterrows():
            time_str = row['completed_at'].strftime('%m/%d %H:%M')
            print(f"   Run {row['run_id']}: {time_str}")

def run_single_experiment(run_id, tracker):
    """Run a single experiment"""
    # Get the config
    with open(tracker.config_file, 'r') as f:
        all_configs = json.load(f)

    config = next(c for c in all_configs if c['run_id'] == run_id)
    print(f"🚀 Starting run {run_id}: {config['config_name']}")

    try:
        # Your training code here (same as before)
        results = execute_training_simple(config)

        # Mark as completed
        tracker.mark_completed(run_id, results)
        print(f"✅ Run {run_id} completed successfully")
        return True

    except Exception as e:
        print(f"❌ Run {run_id} failed: {str(e)}")
        return False

def execute_training_simple(config):
    """Your training code - simplified version"""
    # Set up paths
    gdrive_base = '/content/drive/MyDrive/Colab_Notebooks/'
    network_file_path = os.path.join(gdrive_base, "networks_1_year_connected", config["network_file"])
    optimization_result_path = os.path.join(gdrive_base, "optimized_network", config["optimization_result_file"])

    # Load optimization result
    with open(optimization_result_path, 'r') as f:
        objective = float(f.read().strip())

    objective=-1*objective# multiply by negative one since comparing to reward found by RL agent

    # Set seeds
    seed = config["seed"]
    import random, numpy as np, torch
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # Calculate offset (only needed for replacement env)
    replacement_reward_offset = None
    if config["env_class"] == "EnvDispatchReplacement":
        replacement_reward_offset = calculate_offset_k_initialization(
            network_file=network_file_path
        )

    # Initialize Neptune
    API_TOKEN = "eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiI1ODQwZjA5OS05MDFmLTQ2MWYtYWJiMi0yMDkzYmEwNzgzMzEifQ=="
    PROJECT_NAME = "EnergyGridRL/elec-s-10-ec-lc10-1h-Dispatch"
    run = neptune.init_run(
    project=PROJECT_NAME,
    api_token=API_TOKEN,
    name=config['config_name'],
    tags=["hyperparameter_sweep"])

    # Log parameters
    for key, value in config.items():
        if key not in ['run_id', 'config_name', 'config_idx']:
            run[f"parameters/{key}"] = value

    if replacement_reward_offset is not None:
        run["replacement_reward_offset"] = replacement_reward_offset

    # Create environment based on env_class
    if config["env_class"] == "EnvDispatchConstr":
        env = EnvDispatchConstr(
            network_file=network_file_path,
            constraint_penalty_factor=config["constraint_penalty_factor"]
        )
    elif config["env_class"] == "EnvDispatchReplacement":
        env = EnvDispatchReplacement(
            network_file=network_file_path,
            constraint_penalty_factor=config["constraint_penalty_factor"],
            offset_k=replacement_reward_offset
        )
    else:
        raise ValueError(f"Unknown env_class: {config['env_class']}")

    env.seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    agent = PPO_agent(
        env=env, run=run, device=device,
        hidden_dimensions=config["HIDDEN_DIMENSIONS"],
        dropout=config["DROPOUT"],
        discount_factor=config["DISCOUNT_FACTOR"],
        optimizer_name=config["optimizer_name"],
        max_episodes=config["MAX_EPISODES"],
        print_interval=config["PRINT_INTERVAL"],
        PPO_steps=config["PPO_STEPS"],
        n_trials=config["N_TRIALS"],
        epsilon=config["EPSILON"],
        entropy_coefficient=config["ENTROPY_COEFFICIENT"],
        learning_rate=config["LEARNING_RATE"],
        batch_size=config["BATCH_SIZE"],
        seed=seed
    )

    # Train
    train_rewards = agent.train()

    # Evaluate
    test_results = evaluate_agent_on_test_data(agent, env, objective)

    # Log to Neptune
    run["training_results/final_reward"] = train_rewards[-1]
    run["training_results/mean_last_100_reward"] = np.mean(train_rewards[-100:])
    run["training_results/best_reward"] = np.max(train_rewards)

    for key, value in test_results.items():
        run[f"test_results/{key}"] = value

    # Save trained model to Neptune
    save_model_to_neptune(agent, run, config)

    run.stop()

    # Return key results for local tracking
    results = {
        'env_class': config["env_class"],
        'mape': test_results.get('mape'),
        'rl_total_objective': test_results.get('rl_total_objective'),
        'optimal_total_objective': test_results.get('optimal_total_objective'),
        'constraint_violation_percentage': test_results.get('constraint_violation_percentage'),
        'final_reward': train_rewards[-1],
        'mean_last_100_reward': np.mean(train_rewards[-100:]),
        'best_reward': np.max(train_rewards),
        'neptune_run_id': run.get_run_url().split('/')[-1],  # Get Neptune run ID for model retrieval
    }

    if replacement_reward_offset is not None:
        results['replacement_reward_offset'] = replacement_reward_offset

    return results

# === SIMPLE USAGE ===

def setup_sweep():
    """Run this once to set up all configs"""
    tracker = SimpleSweepTracker()
    tracker.setup_configs()
    tracker.status()
    return tracker

def run_next():
    """Run the next pending experiment"""
    tracker = SimpleSweepTracker()
    next_config = tracker.get_next_run()

    if next_config is None:
        print("🎉 All experiments completed!")
        return False

    success = run_single_experiment(next_config['run_id'], tracker)
    tracker.status()  # Show updated status
    return success

def check_status():
    """Check current status"""
    tracker = SimpleSweepTracker()
    tracker.status()
    tracker.show_recent_completions()

def run_specific(run_id):
    """Run a specific experiment by ID"""
    tracker = SimpleSweepTracker()
    return run_single_experiment(run_id, tracker)

def resume_sweep(max_runs=5):
    """Resume running experiments"""
    tracker = SimpleSweepTracker()
    completed = 0

    while completed < max_runs:
        next_config = tracker.get_next_run()
        if next_config is None:
            print("🎉 All experiments completed!")
            break

        success = run_single_experiment(next_config['run_id'], tracker)
        if success:
            completed += 1

        tracker.status()

    print(f"Completed {completed} runs in this session")
    return completed

def save_model_to_neptune(agent, neptune_run, config):
    """Save the trained model to Neptune"""
    import tempfile
    import torch
    import json

    # Create temporary files for the model components
    with tempfile.TemporaryDirectory() as temp_dir:
        # Save actor-critic network state dict
        # Assuming agent.actor_critic exists and is the combined model
        if hasattr(agent, 'actor_critic') and agent.actor_critic is not None:
             actor_critic_path = os.path.join(temp_dir, 'actor_critic.pt')
             torch.save(agent.actor_critic.state_dict(), actor_critic_path)
        elif hasattr(agent, 'model') and agent.model is not None:
            # If using agent.model as the combined model
            actor_critic_path = os.path.join(temp_dir, 'actor_critic.pt')
            torch.save(agent.model.state_dict(), actor_critic_path)
        else:
            print("Warning: Agent does not have a combined actor_critic or model attribute to save.")
            return


        # Save complete model info (for reconstruction)
        model_info = {
            'config': config,
            'state_space_dim': agent.env.observation_space.shape[0],
            'action_space_dim': agent.env.action_space.shape[0],
            'hidden_dimensions': config['HIDDEN_DIMENSIONS'],
            'dropout': config['DROPOUT'],
            'model_architecture': str(agent.model) if hasattr(agent, 'model') and agent.model is not None else "N/A" # Architecture as string
        }
        model_info_path = os.path.join(temp_dir, 'model_info.json')
        with open(model_info_path, 'w') as f:
            json.dump(model_info, f, indent=2)

        # Upload to Neptune
        neptune_run["model/actor_critic"].upload(actor_critic_path)
        neptune_run["model/model_info"].upload(model_info_path)
        neptune_run["model/saved_at"] = datetime.now().isoformat()

        print(f"✓ Model saved to Neptune for run {config['run_id']}")

def load_model_from_neptune(neptune_run_id, my_api=None):
    """Load a trained model from Neptune"""
    import neptune
    import tempfile
    import torch

    my_api = "eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiI1ODQwZjA5OS05MDFmLTQ2MWYtYWJiMi0yMDkzYmEwNzgzMzEifQ=="
    PROJECT_NAME = "EnergyGridRL/elec-s-10-ec-lc10-1h-Dispatch"

    run = neptune.init_run(
    project=PROJECT_NAME,
    api_token=my_api,
    run=neptune_run_id,
    mode="read-only")

    with tempfile.TemporaryDirectory() as temp_dir:
        # Download model files
        model_info_path = os.path.join(temp_dir, 'model_info.json')
        actor_critic_path = os.path.join(temp_dir, 'actor_critic.pt')

        run["model/model_info"].download(model_info_path)
        run["model/actor_critic"].download(actor_critic_path)

        # Load model info
        with open(model_info_path, 'r') as f:
            model_info = json.load(f)

        # Load state dict
        state_dict = torch.load(actor_critic_path, map_location='cpu')

        run.stop()

        return {
            'state_dict': state_dict,
            'model_info': model_info,
            'config': model_info['config']
        }
def reconstruct_agent_from_saved_model(model_data, network_file_path=None):
    """Reconstruct a PPO agent from saved model data"""
    config = model_data['config']

    # Reconstruct environment
    if network_file_path is None:
        gdrive_base = '/content/drive/MyDrive/Colab_Notebooks/'
        network_file_path = os.path.join(gdrive_base, "networks_1_year_connected", config["network_file"])

    if config["env_class"] == "EnvDispatchConstr":
        env = EnvDispatchConstr(
    network_file=network_file_path,  # Full path: "/content/drive/MyDrive/Colab_Notebooks/networks_1_year_connected/elec_s_10_ec_lc1.0_1h.nc"
    constraint_penalty_factor=config["constraint_penalty_factor"]
)
    elif config["env_class"] == "EnvDispatchReplacement":
        # You'd need to recalculate or save the offset_k
        replacement_reward_offset = calculate_offset_k_initialization(
            network_file=network_file_path
        )#CHANGE THIS SO LOAD CALCULATED VALUE
        env = EnvDispatchReplacement(
            network_file=network_file_path,
            constraint_penalty_factor=config["constraint_penalty_factor"],
            offset_k=replacement_reward_offset
        )

    # Create agent (without training)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    agent = PPO_agent(
        env=env,
        run=None,  # No Neptune logging needed
        device=device,
        hidden_dimensions=config["HIDDEN_DIMENSIONS"],
        dropout=config["DROPOUT"],
        discount_factor=config["DISCOUNT_FACTOR"],
        optimizer_name=config["optimizer_name"],
        max_episodes=1,  # Not training
        print_interval=1000,
        PPO_steps=config["PPO_STEPS"],
        n_trials=config["N_TRIALS"],
        epsilon=config["EPSILON"],
        entropy_coefficient=config["ENTROPY_COEFFICIENT"],
        learning_rate=config["LEARNING_RATE"],
        batch_size=config["BATCH_SIZE"],
        seed=config["seed"]
    )

    # Load the trained weights
    if 'state_dict' in model_data:
        agent.model.load_state_dict(model_data['state_dict']) # Assuming agent.model is used
    elif 'actor_critic_state_dict' in model_data:
        agent.model.load_state_dict(model_data['actor_critic_state_dict']) # Fallback if named differently
    else:
        print("Warning: Could not find state dictionary in model data.")

    # Set to evaluation mode
    agent.model.eval() # Assuming agent.model is the combined model

    print(f"✓ Agent reconstructed from saved model (Run {config['run_id']})")
    return agent, env

def get_best_models(top_k=5, metric='mape'):
    """Get the top K models based on a performance metric"""
    tracker = SimpleSweepTracker()

    if not os.path.exists(tracker.completed_file):
        print("No completed runs found")
        return []

    df = pd.read_csv(tracker.completed_file)

    if metric not in df.columns:
        print(f"Metric '{metric}' not found. Available metrics: {df.columns.tolist()}")
        return []

    # Sort by metric (assuming lower is better for mape, higher for rewards)
    if metric in ['mape', 'constraint_violation_percentage']:
        best_runs = df.nsmallest(top_k, metric)
    else:
        best_runs = df.nlargest(top_k, metric)

    print(f"🏆 Top {top_k} models by {metric}:")
    for i, (_, row) in enumerate(best_runs.iterrows(), 1):
        print(f"{i}. Run {row['run_id']}: {metric}={row[metric]:.4f} (Neptune: {row.get('neptune_run_id', 'N/A')})")

    return best_runs.to_dict('records')

def load_best_model(metric='mape', from_neptune=True):
    """Load the single best model"""
    best_models = get_best_models(top_k=1, metric=metric)
    if not best_models:
        return None

    best_model = best_models[0]

    if from_neptune and 'neptune_run_id' in best_model:
        print(f"Loading best model from Neptune (Run {best_model['run_id']})...")
        model_data = load_model_from_neptune(best_model['neptune_run_id'])
        return reconstruct_agent_from_saved_model(model_data)
    else:
        print("No model file information found")
        return None

In [12]:
# Set up all configurations - this creates the files
tracker = setup_sweep()

✓ Created 39 configurations
✓ Saved to: /content/drive/MyDrive/Colab_Notebooks/sweep_results/all_configs.json
📊 Sweep Status:
   Total runs: 39
   Completed: 0 (0.0%)
   Remaining: 39
   Next run: 0 (Config_0_Seed_42)


In [13]:
# See what the first experiment looks like
tracker = SimpleSweepTracker()
first_config = tracker.get_next_run()
print("First run configuration:")
for key, value in first_config.items():
    print(f"  {key}: {value}")

First run configuration:
  optimizer_name: Adam
  MAX_EPISODES: 10000
  PRINT_INTERVAL: 100
  N_TRIALS: 8
  DROPOUT: 0
  network_file: elec_s_10_ec_lc1.0_1h.nc
  optimization_result_file: elec_s_10_ec_lc1.0_1h_Test_Objective.txt
  LEARNING_RATE: 0.001
  EPSILON: 0.3
  ENTROPY_COEFFICIENT: 0.1
  HIDDEN_DIMENSIONS: 64
  PPO_STEPS: 16
  BATCH_SIZE: 256
  DISCOUNT_FACTOR: 0.99
  constraint_penalty_factor: 0
  episode_length: 4
  env_class: EnvDispatchConstr
  run_id: 0
  config_name: Config_0_Seed_42
  config_idx: 0
  seed: 42


In [None]:
# Run just the first experiment
success = run_next()

🚀 Starting run 0: Config_0_Seed_42




[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/EnergyGridRL/elec-s-10-ec-lc10-1h-Dispatch/e/EL1-40




Fixed ZA0 0 PHS: set max_hours to 8.0
Fixed ZA0 5 PHS: set max_hours to 8.0
Fixed ZA0 6 hydro: corrected max_hours from 3831.6270020496813 to 6.0
=== FIXING ARTIFICIAL LINES WITH REASONABLE CAPACITY ===
Found 3 artificial lines to fix:

🔧 Fixing: lines new ZA0 4 <-> ZA2 0 AC
    Connected buses: ZA0 4 ↔ ZA2 0
    Bus demands: ZA0 4: 15945.8 MW, ZA2 0: 452.6 MW
    s_nom: 0.0 → 47837.3 MW
    s_nom_extendable: → False

🔧 Fixing: lines new ZA0 0 <-> ZA1 0 AC
    Connected buses: ZA0 0 ↔ ZA1 0
    Bus demands: ZA0 0: 3513.0 MW, ZA1 0: 1386.9 MW
    s_nom: 0.0 → 10538.9 MW
    s_nom_extendable: → False

🔧 Fixing: lines new ZA0 0 <-> ZA3 0 AC
    Connected buses: ZA0 0 ↔ ZA3 0
    Bus demands: ZA0 0: 3513.0 MW, ZA3 0: 721.1 MW
    s_nom: 0.0 → 10538.9 MW
    s_nom_extendable: → False
Exact test start date not found. Using nearest: 2013-12-01 00:00:00
Using variable episode length, max: None
Initialized Beta parameters to produce uniform-like distribution
> [0;32m/tmp/ipython-input-67627182


divide by zero encountered in scalar divide


divide by zero encountered in scalar divide



> [0;32m/tmp/ipython-input-676271826.py[0m(612)[0;36mscale_action[0;34m()[0m
[0;32m    611 [0;31m                [0;31m# Ensure feasible range exists[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 612 [0;31m                [0;32mif[0m [0mp_set_min[0m [0;34m>[0m [0mp_set_max[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    613 [0;31m                    [0;31m# If no feasible range, clip p_dispatch and recalculate[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> p_set_min > p_set_max
np.True_
ipdb> bounds['eff_dispatch']
np.float64(0.9)
ipdb> min(p_dispatch_raw, p_nom * 0.5)
np.float64(300.0)
ipdb> p_dispatch_raw * store_coeff - p_dispatch_raw / bounds['eff_dispatch']
np.float64(-664.9222373962402)
ipdb> p_set_min_from_soc = (base_term - bounds['max_net_energy_per_dt']) / store_coeff                     



divide by zero encountered in scalar divide



ipdb> store_coeff
np.float64(0.0)
ipdb> print(self.network.storage_units.efficiency_store)
StorageUnit
ZA0 0 PHS      0.866025
ZA0 5 PHS      0.866025
ZA0 6 hydro    0.000000
Name: efficiency_store, dtype: float64


In [None]:
# See what happened
check_status()

# Look at the completed run details
tracker = SimpleSweepTracker()
print("\nCompleted run details:")

if os.path.exists(tracker.completed_file):
    df = pd.read_csv(tracker.completed_file)
    if not df.empty:
        print(df.iloc[0])  # Show first completed run
    else:
        print("Completed runs file is empty.")
else:
    print("Completed runs file not found.")