In [None]:
!pip install optuna
!pip install sympy==1.12

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
import numpy as np
import random
import gymnasium as gym
from gymnasium import spaces
from collections import deque
import time

# Adjacency list of the routing graph: each key is a node id and its value is
# the list of directly connected neighbour ids (every link appears in both
# directions).
network_topology = {
    0: [1, 2],
    1: [0, 3],
    2: [0, 4],
    3: [1, 4],
    4: [2, 3],
}

class RoutingEnv(gym.Env):
    """
    A stochastic network routing environment where packet success is determined
    by target node congestion and bottleneck bandwidth.

    One episode routes a single packet from a randomly drawn source node towards
    a randomly drawn (different) destination over ``network_topology``. At each
    step the agent supplies an index into ``[*neighbors_of_current_node, 'drop']``;
    any index past the end of that list is treated as ``'drop'``. The episode ends
    when the packet is dropped, misses its latency deadline, runs out of TTL, or
    reaches the destination (checked in that order).

    Observation (float32 vector of length
    ``2 * total_nodes + 3 + max_neighbors_sense``): one-hot current node, one-hot
    destination node, normalised TTL, local congestion, local bandwidth, and the
    congestion of up to ``max_neighbors_sense`` neighbours (zero padded).
    """
    def __init__(self, total_nodes=5, max_ttl=10, max_congestion=5, max_bandwidth=10,
                 base_link_success_rate=0.9, base_link_latency=1.0):
        """Store the configuration, build the spaces and initialise episode state."""
        super().__init__()
        self.total_nodes = total_nodes
        self.max_ttl = max_ttl
        self.max_congestion = max_congestion
        self.max_bandwidth = max_bandwidth
        self.base_link_success_rate = base_link_success_rate
        self.base_link_latency = base_link_latency
        self.network_topology = network_topology

        # Number of neighbour-congestion slots exposed in the observation.
        self.max_neighbors_sense = 3
        state_size = 2 * self.total_nodes + 3 + self.max_neighbors_sense

        self.observation_space = spaces.Box(low=0, high=1, shape=(state_size,), dtype=np.float32)
        # One slot per node plus one for 'drop'; step() maps unused slots to 'drop'.
        self.action_space = spaces.Discrete(self.total_nodes + 1)

        # Per-episode state; populated by reset().
        self.current_node = None
        self.destination_node = None
        self.ttl_remaining = None
        self.node_congestions = {}
        self.node_bandwidths = {}
        self.time_window_deadline = 0.0
        self.total_steps = 0
        self.accumulated_latency = 0.0
        self.min_path_bandwidth = float('inf')
        self.successful_hop_delays = []
        self.successful_hops_count = 0

        self.np_random, _ = gym.utils.seeding.np_random()

    def get_state_features(self, current_node, dest_node, ttl, congestions, bandwidths):
        """
        Build the observation vector for the given routing situation.

        Args:
            current_node: id of the node currently holding the packet.
            dest_node: id of the destination node.
            ttl: remaining hop budget (normalised by ``max_ttl``).
            congestions: mapping node id -> congestion level.
            bandwidths: mapping node id -> available bandwidth.

        Returns:
            1-D ``np.float32`` array laid out as described in the class docstring.
        """
        curr_vec = np.zeros(self.total_nodes)
        curr_vec[current_node] = 1
        dest_vec = np.zeros(self.total_nodes)
        dest_vec[dest_node] = 1

        ttl_norm = ttl / self.max_ttl
        local_cong = congestions[current_node] / self.max_congestion
        local_bw = bandwidths[current_node] / self.max_bandwidth

        # Congestion of the first `max_neighbors_sense` neighbours, zero padded
        # so the vector length does not depend on the node's degree.
        neighbors = self.network_topology.get(current_node, [])[:self.max_neighbors_sense]
        neighbor_cong_features = [congestions[n_id] / self.max_congestion for n_id in neighbors]
        neighbor_cong_features += [0.0] * (self.max_neighbors_sense - len(neighbor_cong_features))

        state_vector = np.concatenate([
            curr_vec, dest_vec, [ttl_norm],
            [local_cong], [local_bw],
            neighbor_cong_features
        ])
        return state_vector.astype(np.float32)

    def get_action_space(self, current_node_id):
        """Return the neighbours of ``current_node_id`` followed by the ``'drop'`` action."""
        return self.network_topology.get(current_node_id, []) + ['drop']

    def get_reward(self, current_node, destination_node, action, link_successful, is_terminal=False, terminal_reason=None):
        """
        Compute the reward for one transition.

        Returns -20 for a drop; on terminal steps 100 / -50 / -10 for
        "Goal Reached" / "Time Window Deadline Missed" / "TTL Expired"; otherwise
        -1 for a successful hop and -20 for a failed transmission.
        """
        if action == 'drop':
            return -20
        elif is_terminal:
            if terminal_reason == "Goal Reached": return 100
            if terminal_reason == "Time Window Deadline Missed": return -50
            if terminal_reason == "TTL Expired": return -10
        return -1 if link_successful else -20

    def reset(self, seed=None, options=None):
        """
        Start a new episode with random endpoints, TTL, node state and deadline.

        Returns:
            ``(observation, info)`` where ``info`` holds the sampled ``"deadline"``.
        """
        super().reset(seed=seed)
        self.current_node = self.np_random.integers(0, self.total_nodes)
        self.destination_node = self.np_random.integers(0, self.total_nodes)
        # Resample until source and destination differ.
        while self.destination_node == self.current_node:
            self.destination_node = self.np_random.integers(0, self.total_nodes)

        self.ttl_remaining = self.np_random.integers(max(1, self.max_ttl // 2), self.max_ttl + 1)

        for node in range(self.total_nodes):
            self.node_bandwidths[node] = self.np_random.uniform(0.7 * self.max_bandwidth, self.max_bandwidth)
            self.node_congestions[node] = self.np_random.uniform(0, self.max_congestion)

        # Deadline is drawn relative to a three-hop latency estimate.
        min_time_estimate = 3 * self.base_link_latency
        self.time_window_deadline = self.np_random.uniform(1.2 * min_time_estimate, 2.0 * min_time_estimate)

        self.total_steps = 0
        self.accumulated_latency = 0.0
        self.min_path_bandwidth = float('inf')
        self.successful_hop_delays = []
        self.successful_hops_count = 0

        return self.get_state_features(
            self.current_node, self.destination_node, self.ttl_remaining,
            self.node_congestions, self.node_bandwidths
        ), {"deadline": self.time_window_deadline}

    def step(self, action_index):
        """
        Apply one routing decision.

        Args:
            action_index: index into ``get_action_space(current_node)``; indices
                past the end of that list are interpreted as ``'drop'``.

        Returns:
            ``(observation, reward, terminated, truncated, info)``. ``truncated``
            is always ``False``; ``info`` is empty until the episode terminates,
            when it carries the episode statistics.
        """
        available_actions = self.get_action_space(self.current_node)
        self.total_steps += 1

        if action_index >= len(available_actions):
            action = 'drop'
        else:
            action = available_actions[action_index]

        initial_node = self.current_node
        reward, done, link_successful = 0, False, False
        next_node = self.current_node
        terminal_reason = None

        # Update global network dynamics
        for node in range(self.total_nodes):
            replenish = self.np_random.uniform(-0.1 * self.max_bandwidth, 0.5 * self.max_bandwidth)
            self.node_bandwidths[node] = max(0, min(self.max_bandwidth, self.node_bandwidths[node] + replenish))
            delta = self.np_random.uniform(-1.0, 1.0)
            self.node_congestions[node] = max(0, min(self.max_congestion, self.node_congestions[node] + delta))

        if action == 'drop':
            terminal_reason = "Dropped by Agent"
            done = True
            # The drop penalty has to be requested explicitly: without this call
            # the reward keeps its initial value of 0, which would make dropping
            # the packet immediately the best-paying action available.
            reward = self.get_reward(initial_node, self.destination_node, action, link_successful,
                                     is_terminal=True, terminal_reason=terminal_reason)
        else:
            neighbor_node = action
            target_cong = self.node_congestions[neighbor_node]
            target_bw = self.node_bandwidths[neighbor_node]

            cong_norm = target_cong / self.max_congestion
            bw_norm = target_bw / self.max_bandwidth

            # Congested / bandwidth-starved targets lower the chance the hop succeeds.
            qos_penalty = (cong_norm * 0.4) + ((1 - bw_norm) * 0.2)
            effective_success_rate = max(0.05, self.base_link_success_rate - qos_penalty)

            if self.np_random.random() < effective_success_rate:
                link_successful = True
                next_node = neighbor_node
                self.successful_hops_count += 1
                self.min_path_bandwidth = min(self.min_path_bandwidth, target_bw)
                current_delay = self.base_link_latency * (1.0 + (cong_norm * 2.0))

                # A successful hop consumes bandwidth and adds load at the target.
                self.node_bandwidths[neighbor_node] = max(0, self.node_bandwidths[neighbor_node] - self.np_random.uniform(0.1, 0.5))
                self.node_congestions[neighbor_node] = min(self.max_congestion, self.node_congestions[neighbor_node] + 0.5)
            else:
                # Failed transmissions cost a fixed double-latency delay.
                current_delay = self.base_link_latency * 2.0

            self.accumulated_latency += current_delay
            if link_successful:
                self.successful_hop_delays.append(current_delay)
            self.ttl_remaining -= 1

            # Deadline and TTL violations take precedence over arrival.
            if self.accumulated_latency > self.time_window_deadline:
                terminal_reason = "Time Window Deadline Missed"
                done = True
            elif self.ttl_remaining <= 0:
                terminal_reason = "TTL Expired"
                done = True
            elif next_node == self.destination_node:
                terminal_reason = "Goal Reached"
                done = True

            if not done:
                reward = self.get_reward(next_node, self.destination_node, action, link_successful)
                self.current_node = next_node
            else:
                reward = self.get_reward(initial_node, self.destination_node, action, link_successful, is_terminal=True, terminal_reason=terminal_reason)

        info = {}
        if done:
            info = {
                "terminal_reason": terminal_reason,
                "episode_success": (terminal_reason == "Goal Reached"),
                "total_steps": self.total_steps,
                "successful_hops": self.successful_hops_count,
                "rtt_latency": self.accumulated_latency,
                "throughput_bottleneck": self.min_path_bandwidth if terminal_reason == "Goal Reached" else 0.0,
                "throughput_proxy": (1.0 / self.accumulated_latency) if self.accumulated_latency > 0 else 0.0,
                "jitter_std_dev": np.std(self.successful_hop_delays) if len(self.successful_hop_delays) > 1 else 0.0
            }

        return self.get_state_features(self.current_node, self.destination_node, self.ttl_remaining, self.node_congestions, self.node_bandwidths), reward, done, False, info

    def render(self):
        """No-op: this environment has no visual rendering."""
        pass

    def close(self):
        """No-op: there are no resources to release."""
        pass

In [None]:
# --- SAC AGENT AND NETWORK DEFINITIONS ---

# State size for the default 5-node RoutingEnv:
#   2 * total_nodes (one-hot current node + one-hot destination)
#   + 3 (TTL, local congestion, local bandwidth)
#   + 3 (neighbour congestion slots, max_neighbors_sense) -> 16
INPUT_DIMS = 16
# Action size: total_nodes + 1 (Drop action) -> 6
N_ACTIONS = 6
# Default width of both hidden layers
FC_DIMS = 512

class ActorNetwork(nn.Module):
    """Policy network: two ReLU hidden layers followed by a linear head that
    outputs one unnormalised logit per action."""

    def __init__(self, input_dims=INPUT_DIMS, n_actions=N_ACTIONS, fc1_dims=FC_DIMS, fc2_dims=FC_DIMS):
        super().__init__()
        # Layer attribute names double as state_dict keys, so they stay fixed.
        self.fc1 = nn.Linear(input_dims, fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self.fc3 = nn.Linear(fc2_dims, n_actions)

    def forward(self, state):
        """Map a batch of states to action logits (softmax is left to the caller)."""
        hidden = state
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        return self.fc3(hidden)

class DuelingCriticNetwork(nn.Module):
    """Dueling Q-network: a shared two-layer trunk feeding a scalar state-value
    head and a per-action advantage head."""

    def __init__(self, input_dims=INPUT_DIMS, n_actions=N_ACTIONS, fc1_dims=FC_DIMS, fc2_dims=FC_DIMS):
        super().__init__()
        self.fc1 = nn.Linear(input_dims, fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self.V = nn.Linear(fc2_dims, 1)
        self.A = nn.Linear(fc2_dims, n_actions)

    def forward(self, state):
        """Return Q(s, a) for every action as V(s) plus the mean-centred advantage."""
        features = F.relu(self.fc2(F.relu(self.fc1(state))))
        value = self.V(features)
        advantage = self.A(features)
        # Subtracting the mean over dim 1 (the action dimension of a batched
        # input) makes the value / advantage split identifiable.
        centred_advantage = advantage - advantage.mean(dim=1, keepdim=True)
        return value + centred_advantage

class CombinedDuelingCritic(nn.Module):
    """Pair of independent dueling critics (``q1`` and ``q2``) evaluated on the
    same input."""

    def __init__(self, input_dims=INPUT_DIMS, n_actions=N_ACTIONS, fc1_dims=FC_DIMS, fc2_dims=FC_DIMS):
        super().__init__()
        layer_spec = (input_dims, n_actions, fc1_dims, fc2_dims)
        self.q1 = DuelingCriticNetwork(*layer_spec)
        self.q2 = DuelingCriticNetwork(*layer_spec)

    def forward(self, state):
        """Return the tuple ``(q1(state), q2(state))``."""
        return self.q1(state), self.q2(state)

class ReplayBuffer:
    """
    Bounded FIFO store of transitions.

    Backed by a ``collections.deque`` whose ``maxlen`` is the capacity, so the
    oldest transition is discarded automatically once the buffer is full.
    """

    def __init__(self, capacity):
        self.buffer = deque([], capacity)

    def __len__(self):
        return len(self.buffer)

    def add(self, transition):
        """Append one transition, evicting the oldest entry when at capacity."""
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random.

        The caller is expected to ensure the buffer holds at least
        ``batch_size`` items (``random.sample`` raises ``ValueError`` otherwise).
        """
        picked = random.sample(self.buffer, k=batch_size)
        return picked

class SACAgent:
    """
    Soft Actor-Critic agent for discrete action spaces.

    Holds a categorical policy (``ActorNetwork``), twin dueling critics with a
    Polyak-averaged target copy, a learnable entropy temperature ``alpha`` and a
    replay buffer of ``(state, action, reward, next_state, done)`` tuples.
    """

    def __init__(self, input_dims=INPUT_DIMS, n_actions=N_ACTIONS, alpha=0.2, tau=0.005, gamma=0.99, lr=3e-4,
                 buffer_capacity=1_000_000, batch_size=256, fc1_dims=FC_DIMS, fc2_dims=FC_DIMS,
                 target_entropy_ratio=0.98):
        """
        Args:
            input_dims: length of the observation vector.
            n_actions: number of discrete actions.
            alpha: initial entropy temperature.
            tau: Polyak coefficient for the target-critic update.
            gamma: discount factor.
            lr: learning rate shared by all optimizers.
            buffer_capacity: maximum number of stored transitions.
            batch_size: minibatch size used by ``learn``.
            fc1_dims, fc2_dims: hidden-layer widths of actor and critics.
            target_entropy_ratio: fraction of the maximum policy entropy
                ``log(n_actions)`` used as the temperature target.
        """
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.batch_size = batch_size
        self.n_actions = n_actions

        self.actor = ActorNetwork(input_dims, n_actions, fc1_dims, fc2_dims)
        self.critic = CombinedDuelingCritic(input_dims, n_actions, fc1_dims, fc2_dims)
        self.target_critic = CombinedDuelingCritic(input_dims, n_actions, fc1_dims, fc2_dims)

        # Reach the optimizers through the `torch` package so the class does
        # not depend on a separate `optim` alias being imported beforehand.
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr)
        self.critic_1_optimizer = torch.optim.Adam(self.critic.q1.parameters(), lr=lr)
        self.critic_2_optimizer = torch.optim.Adam(self.critic.q2.parameters(), lr=lr)

        # Hard-copy the critic weights into the target critic.
        self.update_target_networks(tau=1.0)

        # The target must sit strictly below the maximum entropy log(n_actions):
        # the policy entropy can never exceed that maximum, so targeting it
        # exactly makes the temperature gradient one-sided and alpha grows
        # without bound.
        self.target_entropy = target_entropy_ratio * -torch.log(torch.tensor(1.0 / self.n_actions))
        self.log_alpha = torch.tensor(np.log(alpha), requires_grad=True)
        self.alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=lr)
        self.alpha = self.log_alpha.exp()

        self.replay_buffer = ReplayBuffer(buffer_capacity)

    def update_target_networks(self, tau=None):
        """Polyak-average the critic into the target critic (``tau=1.0`` copies)."""
        if tau is None:
            tau = self.tau
        for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)

    def choose_action(self, observation, evaluate=False):
        """
        Pick an action index for a single observation.

        With ``evaluate=True`` the most probable action is returned; otherwise
        an action is sampled from the policy distribution.
        """
        # Stack into one ndarray first: building a tensor from a list of
        # ndarrays is slow. Action selection needs no autograd graph.
        state = torch.tensor(np.array([observation]), dtype=torch.float)
        with torch.no_grad():
            logits = self.actor(state)
            probs = F.softmax(logits, dim=-1)

        if evaluate:
            action = torch.argmax(probs).item()
        else:
            dist = Categorical(probs=probs)
            action = dist.sample().item()
        return action

    def learn(self):
        """
        Run one gradient step on critics, actor and temperature.

        Returns:
            ``(policy_loss, critic_loss, alpha_loss)`` as Python floats, or
            ``None`` when the buffer holds fewer than ``batch_size`` transitions.
        """
        if len(self.replay_buffer) < self.batch_size:
            return

        transitions = self.replay_buffer.sample(self.batch_size)
        states, actions, rewards, next_states, dones = zip(*transitions)

        states = torch.tensor(np.array(states), dtype=torch.float)
        actions = torch.tensor(np.array(actions), dtype=torch.long).unsqueeze(-1)
        rewards = torch.tensor(np.array(rewards), dtype=torch.float).unsqueeze(-1)
        next_states = torch.tensor(np.array(next_states), dtype=torch.float)
        dones = torch.tensor(np.array(dones), dtype=torch.float).unsqueeze(-1)

        # Critic updates
        with torch.no_grad():
            next_logits = self.actor(next_states)
            next_probs = F.softmax(next_logits, dim=-1)
            next_log_probs = F.log_softmax(next_logits, dim=-1)

            target_q1_next, target_q2_next = self.target_critic(next_states)
            min_target_q_next = torch.min(target_q1_next, target_q2_next)

            # V(s') = sum_a pi(a|s') * (Q(s', a) - alpha * log pi(a|s'))
            next_v_value = torch.sum(next_probs * (min_target_q_next - self.alpha.detach() * next_log_probs), dim=1, keepdim=True)
            target_q_value = rewards + (1 - dones) * self.gamma * next_v_value

        q1_current, q2_current = self.critic(states)
        q1_taken_action = q1_current.gather(1, actions)
        q2_taken_action = q2_current.gather(1, actions)

        critic_loss_1 = F.mse_loss(q1_taken_action, target_q_value)
        critic_loss_2 = F.mse_loss(q2_taken_action, target_q_value)
        critic_loss = critic_loss_1 + critic_loss_2

        self.critic_1_optimizer.zero_grad()
        self.critic_2_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_1_optimizer.step()
        self.critic_2_optimizer.step()

        # Actor updates
        logits = self.actor(states)
        probs = F.softmax(logits, dim=-1)
        log_probs = F.log_softmax(logits, dim=-1)

        with torch.no_grad():
            q1_current, q2_current = self.critic(states)
            min_q_current = torch.min(q1_current, q2_current)

        policy_loss = torch.sum(probs * (self.alpha.detach() * log_probs - min_q_current), dim=1).mean()

        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        # --- Alpha (entropy) updates ---
        # E[log pi(a|s)] (the negative entropy) is the probability-weighted sum
        # of log-probs; it is detached so only log_alpha receives a gradient.
        log_prob_expected = torch.sum(probs * log_probs, dim=1, keepdim=True)
        alpha_loss = -(self.log_alpha * (log_prob_expected + self.target_entropy).detach()).mean()

        self.alpha_optimizer.zero_grad()
        alpha_loss.backward()
        self.alpha_optimizer.step()
        self.alpha = self.log_alpha.exp()

        self.update_target_networks()
        return policy_loss.item(), critic_loss.item(), alpha_loss.item()

    def store_transition(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay buffer."""
        self.replay_buffer.add((state, action, reward, next_state, done))

    def save_models(self, path):
        """Write networks, optimizers and ``log_alpha`` to files prefixed by ``path``."""
        # Save combined critics
        torch.save(self.actor.state_dict(), f"{path}_actor.pth")
        torch.save(self.critic.state_dict(), f"{path}_critic.pth")
        torch.save(self.target_critic.state_dict(), f"{path}_target_critic.pth")
        # Save optimizers and alpha
        torch.save(self.actor_optimizer.state_dict(), f"{path}_actor_optimizer.pth")
        torch.save(self.critic_1_optimizer.state_dict(), f"{path}_critic1_optimizer.pth")
        torch.save(self.critic_2_optimizer.state_dict(), f"{path}_critic2_optimizer.pth")
        torch.save(self.alpha_optimizer.state_dict(), f"{path}_alpha_optimizer.pth")
        torch.save(self.log_alpha, f"{path}_log_alpha.pt")

    def load_models(self, path):
        """Restore everything written by ``save_models`` from the same ``path`` prefix."""
        # Load combined critics
        self.actor.load_state_dict(torch.load(f"{path}_actor.pth"))
        self.critic.load_state_dict(torch.load(f"{path}_critic.pth"))
        self.target_critic.load_state_dict(torch.load(f"{path}_target_critic.pth"))
        # Load optimizers and alpha
        self.actor_optimizer.load_state_dict(torch.load(f"{path}_actor_optimizer.pth"))
        self.critic_1_optimizer.load_state_dict(torch.load(f"{path}_critic1_optimizer.pth"))
        self.critic_2_optimizer.load_state_dict(torch.load(f"{path}_critic2_optimizer.pth"))
        self.alpha_optimizer.load_state_dict(torch.load(f"{path}_alpha_optimizer.pth"))
        # Copy the stored value into the existing tensor instead of rebinding
        # the attribute: alpha_optimizer holds a reference to this exact tensor,
        # so a rebind would leave it optimising an orphaned parameter.
        loaded_log_alpha = torch.load(f"{path}_log_alpha.pt")
        with torch.no_grad():
            self.log_alpha.copy_(loaded_log_alpha)
        self.alpha = self.log_alpha.exp()

In [None]:
import optuna
import torch
import numpy as np

def objective(trial):
    """
    Optuna objective: score one SAC hyperparameter configuration.

    A fresh agent is trained for 1000 episodes on a new ``RoutingEnv`` and then
    run greedily for 50 evaluation episodes; the mean evaluation return is the
    value Optuna optimises.
    """
    # Hyperparameter search space (keys match SACAgent keyword arguments).
    hyperparams = {
        "lr": trial.suggest_float("lr", 1e-5, 1e-3, log=True),
        "tau": trial.suggest_float("tau", 0.001, 0.01),
        "alpha": trial.suggest_float("alpha", 0.01, 0.5),
        "gamma": trial.suggest_float("gamma", 0.9, 0.999),
        "buffer_capacity": trial.suggest_int("buffer_capacity", 10_000, 1_000_000, step=10_000),
        "batch_size": trial.suggest_categorical("batch_size", [64, 128, 256, 512]),
        "fc1_dims": trial.suggest_categorical("fc1_dims", [128, 256, 512]),
        "fc2_dims": trial.suggest_categorical("fc2_dims", [128, 256, 512]),
    }

    env = RoutingEnv()
    agent = SACAgent(
        input_dims=env.observation_space.shape[0],
        n_actions=env.action_space.n,
        **hyperparams,
    )

    num_training_episodes = 1000
    max_steps = 100  # per-episode step cap, shared by training and evaluation

    print(f"Trial {trial.number}: Starting training with params: {trial.params}")

    # Training phase: stochastic policy, one learning step per environment
    # step once the buffer holds more than a batch.
    for _episode in range(num_training_episodes):
        state, _ = env.reset()
        for _step in range(max_steps):
            action = agent.choose_action(state, evaluate=False)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            agent.store_transition(state, action, reward, next_state, done)
            if len(agent.replay_buffer) > agent.batch_size:
                agent.learn()

            if done:
                break
            state = next_state

    # Evaluation phase: greedy policy, no learning.
    num_eval_episodes = 50
    eval_rewards = []

    print(f"Trial {trial.number}: Starting evaluation...")

    for _episode in range(num_eval_episodes):
        eval_state, _ = env.reset()
        episode_return = 0
        for _step in range(max_steps):
            eval_action = agent.choose_action(eval_state, evaluate=True)
            eval_state, eval_reward, term, trunc, _ = env.step(eval_action)
            episode_return += eval_reward
            if term or trunc:
                break
        eval_rewards.append(episode_return)

    avg_eval_reward = np.mean(eval_rewards)
    print(f"Trial {trial.number} Finished. Avg Reward: {avg_eval_reward:.2f}")

    return avg_eval_reward

In [None]:
import torch.optim as optim

# Create a study that maximises the value returned by `objective`
# (the mean evaluation reward of a trained agent).
study = optuna.create_study(direction="maximize")

# Run the optimization for a specified number of trials; each trial trains
# and evaluates one agent, so this call dominates the notebook's runtime.
n_trials = 50
print("Starting Optuna study...")
study.optimize(objective, n_trials=n_trials)
print("Optuna study finished.")

# Print the best trial's objective value and the hyperparameters that
# produced it.
print("\nBest trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")


In [None]:
import time
import numpy as np
from collections import deque

# --- TRAINING CONFIGURATION ---
num_episodes = 10000      # total training episodes
max_steps = 100           # step cap per training episode
report_interval = 50      # episodes between progress reports
eval_interval = 100       # episodes between greedy evaluations
num_eval_episodes = 20    # episodes averaged per evaluation
best_model_path = "best_sac_routing_model"

# Build the environment and an agent configured with the best Optuna trial.
env = RoutingEnv()
params = study.best_trial.params

agent = SACAgent(
    input_dims=env.observation_space.shape[0],
    n_actions=env.action_space.n,
    **{
        name: params[name]
        for name in (
            'lr', 'tau', 'alpha', 'gamma',
            'buffer_capacity', 'batch_size', 'fc1_dims', 'fc2_dims',
        )
    },
)

# Running statistics; the deques keep only the last `report_interval` episodes.
total_steps = 0
best_avg_eval_reward = -np.inf
episode_rewards = deque([], report_interval)
training_losses = deque([], report_interval)
sac_convergence_data = dict(episode=[], best_eval_reward=[])

print(f"Starting SAC training | Layers: {params['fc1_dims']}/{params['fc2_dims']} | Batch: {params['batch_size']}")

start_time = time.time()

for episode in range(num_episodes):
    state, _ = env.reset()
    done = False
    episode_reward = 0
    step_count = 0
    episode_losses = []

    # Roll out one training episode, learning online once the buffer holds
    # more than two batches of transitions.
    while step_count < max_steps and not done:
        action = agent.choose_action(state, evaluate=False)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        agent.store_transition(state, action, reward, next_state, done)

        buffer_is_warm = len(agent.replay_buffer) > agent.batch_size * 2
        if buffer_is_warm:
            p_loss, c_loss, _ = agent.learn()
            episode_losses.append(p_loss + c_loss)

        state = next_state
        episode_reward += reward
        step_count += 1
        total_steps += 1

    episode_rewards.append(episode_reward)
    if episode_losses:
        training_losses.append(np.mean(episode_losses))

    # Periodic Progress Report
    if (episode + 1) % report_interval == 0:
        avg_rew = np.mean(episode_rewards)
        avg_loss = np.mean(training_losses) if training_losses else 0.0
        print(f"Ep {episode + 1}/{num_episodes} | Steps: {total_steps} | Avg Reward: {avg_rew:.2f} | Avg Loss: {avg_loss:.4f}")

    # Evaluation and Model Checkpointing
    if (episode + 1) % eval_interval == 0:
        eval_rewards, eval_rtts = [], []

        for _ in range(num_eval_episodes):
            eval_state, _ = env.reset()
            eval_done = False
            eval_rew = 0
            # Greedy rollout; runs until the environment reports termination.
            while not eval_done:
                eval_action = agent.choose_action(eval_state, evaluate=True)
                eval_next, r, term, trunc, info = env.step(eval_action)
                eval_rew += r
                eval_state = eval_next
                eval_done = term or trunc
            # Only a truthy (non-zero) latency from the final step is recorded.
            final_rtt = info.get("rtt_latency")
            if final_rtt:
                eval_rtts.append(final_rtt)
            eval_rewards.append(eval_rew)

        current_avg_reward = np.mean(eval_rewards)
        avg_rtt = np.mean(eval_rtts) if eval_rtts else 0

        print(f"--- Eval at Ep {episode + 1} | Avg Reward: {current_avg_reward:.2f} | Avg RTT: {avg_rtt:.2f} ---")

        # Checkpoint whenever the evaluation average improves.
        if current_avg_reward > best_avg_eval_reward:
            best_avg_eval_reward = current_avg_reward
            agent.save_models(best_model_path)
            print(f"New Best Model Saved ({best_avg_eval_reward:.2f})")

        sac_convergence_data['episode'].append(episode + 1)
        sac_convergence_data['best_eval_reward'].append(best_avg_eval_reward)

print(f"\nTraining Complete | Time: {time.time() - start_time:.2f}s | Best Reward: {best_avg_eval_reward:.2f}")

# evaluate the model

In [None]:
# --- Test the trained SAC model ---

# Instantiate the environment
env = RoutingEnv()
input_dims = env.observation_space.shape[0]
n_actions = env.action_space.n

# --- Agent Initialization and Model Loading ---
# A checkpoint can only be loaded into networks with the same hidden-layer
# sizes it was trained with. Look for the tuned hyperparameters under either
# name used in this notebook (`best_sac_params`, or the `params` dict taken
# from the Optuna study) before falling back to fixed sizes.
test_fc_dims = None
for _params_name in ("best_sac_params", "params"):
    _candidate = globals().get(_params_name)
    if isinstance(_candidate, dict) and "fc1_dims" in _candidate and "fc2_dims" in _candidate:
        test_fc_dims = (_candidate["fc1_dims"], _candidate["fc2_dims"])
        break

if test_fc_dims is not None:
    agent_test = SACAgent(input_dims=input_dims, n_actions=n_actions,
                          fc1_dims=test_fc_dims[0], fc2_dims=test_fc_dims[1])
    print(f"Initialized agent for testing using {_params_name}.")
else:
    # Last resort: these sizes only work if the checkpoint was trained with them.
    print("best_sac_params/params not found. Initializing agent with default/observed FC dims (512, 128).")
    agent_test = SACAgent(input_dims=input_dims, n_actions=n_actions,
                          fc1_dims=512, fc2_dims=128)

best_model_path = "best_sac_routing_model"

try:
    agent_test.load_models(best_model_path)
except FileNotFoundError as err:
    print(f"Error: Model file not found at {best_model_path}. Please ensure the training was completed and the model was saved.")
    # Re-raise instead of calling exit() in a notebook environment, keeping
    # the original error as the cause.
    raise FileNotFoundError(f"Model file not found at {best_model_path}") from err
else:
    print(f"Successfully loaded SAC model from: {best_model_path}")

# --- Testing Loop Setup ---

num_test_episodes = 1000  # Number of episodes to test

# Per-episode results; index i of every list refers to test episode i.
test_rewards = []
test_rtt_latency = []
test_throughput_proxy = []
test_jitter_std_dev = []
test_successful_hops_count = []
test_is_success = []  # True only when the episode ended with "Goal Reached"

# Safety cap on steps per episode, derived from the environment's TTL
MAX_STEPS_LIMIT = env.max_ttl * 2 + 5

print("\nStarting SAC model testing...")

# --- Testing Loop ---

for episode in range(num_test_episodes):
    state, info = env.reset()
    done = False
    episode_reward = 0
    step_count = 0

    # Greedy rollout until the environment terminates or the cap is hit.
    while step_count < MAX_STEPS_LIMIT and not done:
        action = agent_test.choose_action(state, evaluate=True)
        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        episode_reward += reward
        state = next_state
        step_count += 1

    test_rewards.append(episode_reward)

    # Metrics come from the info dict of the terminating step; an episode cut
    # off by MAX_STEPS_LIMIT is recorded as a failure with NaN metrics.
    nan = float('nan')
    if done:
        is_success = info.get('terminal_reason') == "Goal Reached"
        episode_metrics = (
            is_success,
            info.get("successful_hops", nan),
            info.get("rtt_latency", nan),
            info.get("throughput_proxy", nan),
            info.get("jitter_std_dev", nan),
        )
    else:
        episode_metrics = (False, nan, nan, nan, nan)

    metric_sinks = (
        test_is_success,
        test_successful_hops_count,
        test_rtt_latency,
        test_throughput_proxy,
        test_jitter_std_dev,
    )
    for sink, metric in zip(metric_sinks, episode_metrics):
        sink.append(metric)

    # Print episode summary
    print(f"Test Episode {episode + 1}/{num_test_episodes}, Reward: {episode_reward:.2f}, Steps: {step_count}, Success: {test_is_success[-1]}")

# --- Calculate and Report Summary Metrics (Updated) ---

def _mean_or_nan(values):
    """Mean of ``values``, or NaN when there is nothing to average."""
    return np.mean(values) if len(values) > 0 else float('nan')


avg_test_reward = np.mean(test_rewards)
test_success_rate = np.mean(test_is_success) if test_is_success else 0.0

# Boolean mask selecting the episodes that reached the goal.
successful_episodes_indices = np.array(test_is_success, dtype=bool)

# Array views of the per-episode metrics so the mask can be applied.
test_rtt_latency_np = np.array(test_rtt_latency)
test_throughput_proxy_np = np.array(test_throughput_proxy)
test_jitter_std_dev_np = np.array(test_jitter_std_dev)
test_successful_hops_count_np = np.array(test_successful_hops_count)

# Metrics restricted to successful episodes.
successful_rtt = test_rtt_latency_np[successful_episodes_indices]
successful_throughput = test_throughput_proxy_np[successful_episodes_indices]
successful_jitter = test_jitter_std_dev_np[successful_episodes_indices]
successful_hops = test_successful_hops_count_np[successful_episodes_indices]

avg_test_rtt_latency = _mean_or_nan(successful_rtt)
avg_test_throughput_proxy = _mean_or_nan(successful_throughput)
avg_test_jitter_std_dev = _mean_or_nan(successful_jitter)
avg_test_successful_hops_count = _mean_or_nan(successful_hops)


print(f"\n--- SAC Model Testing Summary over {num_test_episodes} episodes ---")
print(f"Average Total Reward: {avg_test_reward:.2f}")
print(f"Success Rate: {test_success_rate:.2%}")

# Each line is printed only when its metric has at least one sample.
summary_rows = (
    (successful_rtt, f"Average RTT / Latency (Successful Episodes): {avg_test_rtt_latency:.2f}"),
    (successful_hops, f"Average Successful Hops Count (Successful Episodes): {avg_test_successful_hops_count:.2f}"),
    (successful_throughput, f"Average Throughput Proxy (Successful Episodes): {avg_test_throughput_proxy:.4f}"),
    (successful_jitter, f"Average Jitter (Std Dev of Delays) (Successful Episodes): {avg_test_jitter_std_dev:.4f}"),
)
for samples, summary_line in summary_rows:
    if len(samples) > 0:
        print(summary_line)

print("SAC model testing finished.")

## Encore RL



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import random

class EncoreRLAgent:
    """
    Epsilon-greedy deep Q-learning agent with a replay buffer and a target
    network that is re-synchronised every `target_update` learning steps.
    """

    def __init__(self, input_dims, n_actions, lr=1e-4, gamma=0.99, epsilon_start=1.0,
                 epsilon_end=0.01, epsilon_decay=0.995, target_update=100,
                 buffer_capacity=10000, batch_size=64, fc_dims=256):
        # Exploration schedule
        self.epsilon = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay

        # Learning hyperparameters and bookkeeping
        self.gamma = gamma
        self.target_update = target_update
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.learn_step_counter = 0

        # Online network first, then the target network (same architecture).
        self.q_network = self._build_q_net(input_dims, fc_dims, n_actions)
        self.target_q_network = self._build_q_net(input_dims, fc_dims, n_actions)

        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)
        self.replay_buffer = ReplayBuffer(buffer_capacity)

        # Start with both networks holding identical weights.
        self.update_target_network()

    @staticmethod
    def _build_q_net(in_features, hidden, out_features):
        """Return the MLP used for Q-values: two ReLU hidden layers plus a linear output."""
        return nn.Sequential(
            nn.Linear(in_features, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, out_features),
        )

    def update_target_network(self):
        """Copy the online network's weights into the target network."""
        online_weights = self.q_network.state_dict()
        self.target_q_network.load_state_dict(online_weights)

    def choose_action(self, observation):
        """
        Pick an action index for `observation`: a uniformly random action when
        the random draw does not exceed epsilon, otherwise the argmax of the
        online network's Q-values.
        """
        if not random.random() > self.epsilon:
            return random.randrange(self.n_actions)

        obs_batch = torch.tensor(np.array([observation]), dtype=torch.float)
        with torch.no_grad():
            q_values = self.q_network(obs_batch)
        return torch.argmax(q_values).item()

    def learn(self):
        """
        Run one gradient step on a sampled minibatch.

        Returns the scalar MSE loss, or None while the buffer holds fewer than
        `batch_size` transitions. Also advances the target-sync counter and
        decays epsilon (floored at `epsilon_end`).
        """
        if len(self.replay_buffer) < self.batch_size:
            return None

        batch = self.replay_buffer.sample(self.batch_size)
        state_b, action_b, reward_b, next_state_b, done_b = zip(*batch)

        state_t = torch.tensor(np.array(state_b), dtype=torch.float)
        next_state_t = torch.tensor(np.array(next_state_b), dtype=torch.float)
        # Column vectors so they line up with gather()/max(keepdim=True) outputs.
        action_t = torch.tensor(np.array(action_b), dtype=torch.long).unsqueeze(-1)
        reward_t = torch.tensor(np.array(reward_b), dtype=torch.float).unsqueeze(-1)
        done_t = torch.tensor(np.array(done_b), dtype=torch.float).unsqueeze(-1)

        # Q(s, a) for the actions that were actually taken.
        predicted_q = self.q_network(state_t).gather(1, action_t)

        # One-step Bellman target from the frozen target network.
        with torch.no_grad():
            best_next_q = self.target_q_network(next_state_t).max(1, keepdim=True)[0]
            bellman_target = reward_t + (1 - done_t) * self.gamma * best_next_q

        loss = F.mse_loss(predicted_q, bellman_target)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.learn_step_counter += 1
        sync_due = self.learn_step_counter % self.target_update == 0
        if sync_due:
            self.update_target_network()

        decayed = self.epsilon * self.epsilon_decay
        self.epsilon = max(self.epsilon_end, decayed)
        return loss.item()

    def store_transition(self, state, action, reward, next_state, done):
        """Append one (state, action, reward, next_state, done) tuple to the replay buffer."""
        transition = (state, action, reward, next_state, done)
        self.replay_buffer.add(transition)

    def save_models(self, path):
        """Write the online network, target network and optimizer state using `path` as filename prefix."""
        q_state = self.q_network.state_dict()
        target_state = self.target_q_network.state_dict()
        optimizer_state = self.optimizer.state_dict()
        torch.save(q_state, f"{path}_q_network.pth")
        torch.save(target_state, f"{path}_target_q_network.pth")
        torch.save(optimizer_state, f"{path}_optimizer.pth")

    def load_models(self, path):
        """Restore the three state dicts written by `save_models` from the same `path` prefix."""
        q_state = torch.load(f"{path}_q_network.pth")
        self.q_network.load_state_dict(q_state)
        target_state = torch.load(f"{path}_target_q_network.pth")
        self.target_q_network.load_state_dict(target_state)
        optimizer_state = torch.load(f"{path}_optimizer.pth")
        self.optimizer.load_state_dict(optimizer_state)

## Define Encore RL objective for Optuna


In [None]:
import numpy as np

def encore_rl_objective(trial):
    """
    Optuna objective for the Encore RL agent.

    Samples a hyperparameter set from `trial`, trains a fresh agent for 500
    episodes (at most 100 steps each) and returns the mean total reward over
    50 greedy evaluation episodes.
    """
    # Hyperparameter search space (dict values are evaluated top to bottom)
    hyperparams = {
        "lr": trial.suggest_float("lr", 1e-5, 1e-3, log=True),
        "gamma": trial.suggest_float("gamma", 0.9, 0.999),
        "epsilon_decay": trial.suggest_float("epsilon_decay", 0.99, 0.9999),
        "target_update": trial.suggest_int("target_update", 10, 1000),
        "buffer_capacity": trial.suggest_int("buffer_capacity", 10000, 1_000_000, step=10000),
        "batch_size": trial.suggest_categorical("batch_size", [32, 64, 128, 256]),
        "fc_dims": trial.suggest_categorical("fc_dims", [64, 128, 256, 512]),
    }

    env = RoutingEnv()

    # Fresh agent configured with the sampled hyperparameters
    agent = EncoreRLAgent(
        input_dims=env.observation_space.shape[0],
        n_actions=env.action_space.n,
        **hyperparams,
    )

    num_training_episodes = 500
    max_steps = 100

    print(f"Trial {trial.number}: Starting Encore RL training with params: {trial.params}")

    # Training: at most max_steps environment steps per episode
    for _episode in range(num_training_episodes):
        obs, _ = env.reset()
        for _step in range(max_steps):
            action = agent.choose_action(obs)
            next_obs, reward, terminated, truncated, _ = env.step(action)
            finished = terminated or truncated

            agent.store_transition(obs, action, reward, next_obs, finished)

            # Only update once the buffer holds more than one batch.
            if len(agent.replay_buffer) > agent.batch_size:
                agent.learn()

            obs = next_obs
            if finished:
                break

    # Evaluation
    num_eval_episodes = 50
    eval_rewards = []

    print(f"Trial {trial.number}: Starting Encore RL evaluation...")

    # Exploration is switched off for the whole evaluation phase and restored afterwards.
    original_epsilon = agent.epsilon
    agent.epsilon = 0.0

    for _episode in range(num_eval_episodes):
        obs, _ = env.reset()
        total = 0
        for _step in range(max_steps):
            action = agent.choose_action(obs)
            obs, reward, term, trunc, _ = env.step(action)
            total += reward
            if term or trunc:
                break
        eval_rewards.append(total)

    agent.epsilon = original_epsilon

    avg_eval_reward = np.mean(eval_rewards)
    print(f"Trial {trial.number} Finished. Average Reward: {avg_eval_reward:.2f}")

    return avg_eval_reward

## Run Optuna study for Encore RL



In [None]:
import optuna

# Hyperparameter search for Encore RL: maximise the value returned by
# encore_rl_objective over a fixed number of trials.
n_trials_encore = 50
study_encore = optuna.create_study(direction="maximize")

print("Starting Optuna study for Encore RL...")
study_encore.optimize(encore_rl_objective, n_trials=n_trials_encore)
print("Optuna study for Encore RL finished.")

# Report the best trial's objective value and its hyperparameters
print("\nBest trial for Encore RL:")
trial_encore = study_encore.best_trial
print(f"  Value: {trial_encore.value}")
print("  Best Hyperparameters: ")
for key in trial_encore.params:
    value = trial_encore.params[key]
    print(f"    {key}: {value}")

## Train Encore RL

In [None]:
import time
import numpy as np
from collections import deque

# --- ENCORE RL TRAINING CONFIGURATION ---
num_episodes = 10000
max_steps = 100          # per-episode step cap, applied to training AND evaluation
report_interval = 50     # episodes between progress printouts
eval_interval = 100      # episodes between greedy evaluations / checkpoints
num_eval_episodes = 20
best_model_path = "best_encore_rl_routing_model"

env = RoutingEnv()
best_params = study_encore.best_trial.params

# Initialize Encore RL Agent with the hyperparameters of the best Optuna trial
agent_encore = EncoreRLAgent(
    input_dims=env.observation_space.shape[0],
    n_actions=env.action_space.n,
    lr=best_params['lr'],
    gamma=best_params['gamma'],
    epsilon_start=1.0,
    epsilon_end=0.01,
    epsilon_decay=best_params['epsilon_decay'],
    target_update=best_params['target_update'],
    buffer_capacity=best_params['buffer_capacity'],
    batch_size=best_params['batch_size'],
    fc_dims=best_params['fc_dims']
)

# Logging Variables
total_steps = 0
best_avg_eval_reward = -np.inf
episode_rewards = deque(maxlen=report_interval)  # sliding window for the progress report
encore_convergence_data = {'episode': [], 'best_eval_reward': []}

print(f"Starting Encore RL training | Best Params: {best_params}")

start_time = time.time()

for episode in range(num_episodes):
    state, _ = env.reset()
    done = False
    episode_reward = 0
    step_count = 0

    while not done and step_count < max_steps:
        action = agent_encore.choose_action(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        agent_encore.store_transition(state, action, reward, next_state, done)

        # Wait for two batches' worth of transitions before the first update.
        if len(agent_encore.replay_buffer) > agent_encore.batch_size * 2:
             agent_encore.learn()

        state = next_state
        episode_reward += reward
        total_steps += 1
        step_count += 1

    episode_rewards.append(episode_reward)

    # Periodic Progress Report
    if (episode + 1) % report_interval == 0:
        avg_rew = np.mean(episode_rewards)
        print(f"Ep {episode + 1}/{num_episodes} | Total Steps: {total_steps} | Avg Reward: {avg_rew:.2f} | Epsilon: {agent_encore.epsilon:.4f}")

    # Evaluation and Model Checkpointing
    if (episode + 1) % eval_interval == 0:
        eval_rewards = []
        eval_rtts = []

        # Greedy Evaluation: exploration off; restored even if evaluation raises
        original_epsilon = agent_encore.epsilon
        agent_encore.epsilon = 0.0
        try:
            for _ in range(num_eval_episodes):
                eval_state, _ = env.reset()
                eval_done = False
                eval_rew = 0
                eval_steps = 0
                # Same step cap as training: a greedy policy that never reaches
                # a terminal state must not hang the run.
                while not eval_done and eval_steps < max_steps:
                    eval_action = agent_encore.choose_action(eval_state)
                    eval_next, r, term, trunc, info = env.step(eval_action)
                    eval_done = term or trunc
                    eval_state = eval_next
                    eval_rew += r
                    eval_steps += 1
                    if eval_done and info.get("rtt_latency"):
                        eval_rtts.append(info["rtt_latency"])
                eval_rewards.append(eval_rew)
        finally:
            agent_encore.epsilon = original_epsilon

        current_avg_reward = np.mean(eval_rewards)
        avg_rtt = np.mean(eval_rtts) if eval_rtts else 0

        print(f"--- Eval at Ep {episode + 1} | Avg Reward: {current_avg_reward:.2f} | Avg RTT: {avg_rtt:.2f} ---")

        # Checkpoint only on strict improvement of the evaluation reward
        if current_avg_reward > best_avg_eval_reward:
            best_avg_eval_reward = current_avg_reward
            agent_encore.save_models(best_model_path)
            print(f"New Best Encore RL Model Saved ({best_avg_eval_reward:.2f})")

        encore_convergence_data['episode'].append(episode + 1)
        encore_convergence_data['best_eval_reward'].append(best_avg_eval_reward)

print(f"\nEncore RL Training Complete | Time: {time.time() - start_time:.2f}s | Best Reward: {best_avg_eval_reward:.2f}")

In [None]:
import numpy as np

# --- ENCORE RL TEST CONFIGURATION ---
num_test_episodes = 1000
best_model_path = "best_encore_rl_routing_model"
env = RoutingEnv()

# Initialize Agent for Testing
# epsilon_start=0.0 keeps the agent on its greedy (argmax-Q) branch.
try:
    params = study_encore.best_trial.params
    agent_test = EncoreRLAgent(
        input_dims=env.observation_space.shape[0],
        n_actions=env.action_space.n,
        epsilon_start=0.0,  # Greedy testing
        epsilon_end=0.0,
        fc_dims=params['fc_dims']
    )
    print("Initialized Encore RL agent using best_params.")
except NameError:
    # No Optuna study in this session: fall back to a fixed hidden width.
    print("best_params not found. Using default FC dims (128).")
    agent_test = EncoreRLAgent(
        input_dims=env.observation_space.shape[0],
        n_actions=env.action_space.n,
        epsilon_start=0.0,
        fc_dims=128
    )

# Load Trained Weights
agent_test.load_models(best_model_path)
print(f"Loaded Encore RL model from: {best_model_path}")

# Metric Storage: one list per tracked quantity, one entry per episode
metrics = {
    name: []
    for name in ("rewards", "rtt", "throughput", "jitter", "hops", "success_flags")
}

# metrics key -> key of the final step's info dict it is read from
metric_sources = {
    "hops": "successful_hops",
    "rtt": "rtt_latency",
    "throughput": "throughput_proxy",
    "jitter": "jitter_std_dev",
}

max_steps = env.max_ttl * 2 + 5

print(f"\nStarting Encore RL Testing ({num_test_episodes} episodes)...")

for episode in range(num_test_episodes):
    state, _ = env.reset()
    done = False
    episode_reward = 0
    step_count = 0

    while step_count < max_steps and not done:
        action = agent_test.choose_action(state)
        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        episode_reward += reward
        step_count += 1
        state = next_state

    # Log results: success requires a finished episode whose reason is "Goal Reached"
    if done:
        is_success = info.get('terminal_reason') == "Goal Reached"
    else:
        is_success = False
    metrics["rewards"].append(episode_reward)
    metrics["success_flags"].append(is_success)

    # Store per-episode metrics (filtered later for success); NaN when absent
    for metric_name, info_key in metric_sources.items():
        metrics[metric_name].append(info.get(info_key, np.nan))

    if (episode + 1) % 100 == 0:
        print(f"Episode {episode + 1}/{num_test_episodes} | Success: {is_success}")

# --- FINAL SUMMARY REPORT ---
success_indices = np.array(metrics["success_flags"], dtype=bool)
avg_reward = np.mean(metrics["rewards"])
success_rate = np.mean(metrics["success_flags"])

print(f"\n--- Encore RL Testing Summary ---")
print(f"Average Total Reward: {avg_reward:.2f}")
print(f"Success Rate: {success_rate:.2%}")

if not success_indices.any():
    print("No successful episodes recorded.")
else:
    print(f"Average RTT (Successful): {np.nanmean(np.array(metrics['rtt'])[success_indices]):.2f}")
    print(f"Average Hops (Successful): {np.nanmean(np.array(metrics['hops'])[success_indices]):.2f}")
    print(f"Average Throughput Proxy (Successful): {np.nanmean(np.array(metrics['throughput'])[success_indices]):.4f}")
    print(f"Average Jitter (Successful): {np.nanmean(np.array(metrics['jitter'])[success_indices]):.4f}")

print("Encore RL testing finished.")

# Define PPO Agent


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
import numpy as np
import torch.optim as optim

class Actor(nn.Module):
    """Policy network: two ReLU hidden layers and a linear head with `n_actions` outputs."""

    def __init__(self, input_dims, n_actions, fc1_dims=256, fc2_dims=256):
        super().__init__()
        self.fc1 = nn.Linear(input_dims, fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self.policy = nn.Linear(fc2_dims, n_actions)

    def forward(self, state):
        """Return the un-normalised head outputs for `state`."""
        hidden = state
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        return self.policy(hidden)

class Critic(nn.Module):
    """State-value network: two ReLU hidden layers and a single-output linear head."""

    def __init__(self, input_dims, fc1_dims=256, fc2_dims=256):
        super().__init__()
        self.fc1 = nn.Linear(input_dims, fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self.value = nn.Linear(fc2_dims, 1)

    def forward(self, state):
        """Return the head output for `state`; the last dimension has size 1."""
        hidden = state
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        return self.value(hidden)

class PPOAgent:
    """
    Clipped-surrogate PPO agent with separate actor and critic networks,
    GAE advantages and an in-memory rollout buffer (`self.memory`).
    """

    def __init__(self, input_dims, n_actions, lr=3e-4, gamma=0.99, clip_epsilon=0.2,
                 ppo_epochs=10, minibatch_size=64, gae_lambda=0.95, fc1_dims=256, fc2_dims=256,
                 max_grad_norm=0.5):

        self.gamma = gamma
        self.clip_epsilon = clip_epsilon
        self.ppo_epochs = ppo_epochs
        self.minibatch_size = minibatch_size
        self.gae_lambda = gae_lambda
        self.max_grad_norm = max_grad_norm

        self.actor = Actor(input_dims, n_actions, fc1_dims, fc2_dims)
        self.critic = Critic(input_dims, fc1_dims, fc2_dims)

        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr)

        # Rollout buffer of (state, action, reward, next_state, done, log_prob) tuples
        self.memory = []

    def store_transition(self, state, action, reward, next_state, done, log_prob):
        """Append one transition, including the behaviour policy's log-prob, to the rollout buffer."""
        self.memory.append((state, action, reward, next_state, done, log_prob))

    def clear_memory(self):
        """Drop every stored transition."""
        self.memory = []

    def choose_action(self, observation):
        """
        Sample an action from the current policy.

        Returns:
            (action, log_prob): the action index as a Python int and the
            log-probability of that action as a one-element tensor.
        """
        state = torch.tensor(np.array([observation]), dtype=torch.float)
        with torch.no_grad():
            logits = self.actor(state)
            dist = Categorical(logits=logits)
            action = dist.sample()
        return action.item(), dist.log_prob(action)

    def learn(self):
        """
        Run `ppo_epochs` passes of minibatch PPO updates over the stored rollout,
        then clear the buffer.

        Returns:
            (mean_actor_loss, mean_critic_loss) averaged over the minibatch
            updates actually performed, or (None, None) when there is nothing
            to learn from (empty buffer or zero epochs).
        """
        if not self.memory:
            return None, None

        states, actions, rewards, next_states, dones, old_log_probs = zip(*self.memory)

        states = torch.tensor(np.array(states), dtype=torch.float)
        actions = torch.tensor(np.array(actions), dtype=torch.long)
        rewards = torch.tensor(np.array(rewards), dtype=torch.float)
        next_states = torch.tensor(np.array(next_states), dtype=torch.float)
        dones = torch.tensor(np.array(dones), dtype=torch.float)
        # choose_action hands back one-element tensors; flatten each to a scalar
        # so this is shape (N,). An (N, 1) tensor would broadcast against the
        # (B,) new log-probs below and turn the ratio into a (B, B) matrix.
        old_log_probs = torch.tensor([float(lp) for lp in old_log_probs], dtype=torch.float)

        # GAE Calculation
        with torch.no_grad():
            values = self.critic(states).squeeze(-1)
            next_values = self.critic(next_states).squeeze(-1)

        advantages = torch.zeros_like(rewards)
        last_gae = 0.0

        for t in reversed(range(len(rewards))):
            not_done = 1.0 - dones[t]
            # Bootstrap from the stored successor state: memory[t + 1] may
            # belong to a different episode when the caller cut an episode
            # short without a done flag.
            delta = rewards[t] + self.gamma * next_values[t] * not_done - values[t]
            last_gae = delta + self.gamma * self.gae_lambda * not_done * last_gae
            advantages[t] = last_gae

        returns = advantages + values

        num_samples = len(states)
        indices = np.arange(num_samples)
        total_actor_loss, total_critic_loss = 0.0, 0.0
        num_updates = 0

        for _ in range(self.ppo_epochs):
            np.random.shuffle(indices)
            for start_idx in range(0, num_samples, self.minibatch_size):
                idx = indices[start_idx:start_idx + self.minibatch_size]

                # Advantage Normalization (per minibatch; skipped for a single sample,
                # whose standard deviation is undefined)
                batch_adv = advantages[idx]
                if len(batch_adv) > 1:
                    batch_adv = (batch_adv - batch_adv.mean()) / (batch_adv.std() + 1e-8)

                # Critic: regress the value estimate onto the GAE returns
                new_values = self.critic(states[idx]).squeeze(-1)
                critic_loss = F.mse_loss(new_values, returns[idx])

                # Actor: clipped surrogate objective plus a small entropy bonus
                dist = Categorical(logits=self.actor(states[idx]))
                new_log_probs = dist.log_prob(actions[idx])
                ratio = torch.exp(new_log_probs - old_log_probs[idx])

                surr1 = ratio * batch_adv
                surr2 = torch.clamp(ratio, 1 - self.clip_epsilon, 1 + self.clip_epsilon) * batch_adv
                actor_loss = -torch.min(surr1, surr2).mean() - 0.01 * dist.entropy().mean()

                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm)
                self.actor_optimizer.step()

                self.critic_optimizer.zero_grad()
                critic_loss.backward()
                nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm)
                self.critic_optimizer.step()

                total_actor_loss += actor_loss.item()
                total_critic_loss += critic_loss.item()
                num_updates += 1

        self.clear_memory()

        # Average over the updates that actually ran. Deriving the count from
        # num_samples // minibatch_size is 0 for short rollouts and ignores
        # the trailing partial minibatch.
        if num_updates == 0:
            return None, None
        return total_actor_loss / num_updates, total_critic_loss / num_updates

    def save_models(self, path):
        """Write actor and critic weights using `path` as filename prefix."""
        torch.save(self.actor.state_dict(), f"{path}_actor.pth")
        torch.save(self.critic.state_dict(), f"{path}_critic.pth")

    def load_models(self, path):
        """Restore actor and critic weights written by `save_models` with the same prefix."""
        self.actor.load_state_dict(torch.load(f"{path}_actor.pth"))
        self.critic.load_state_dict(torch.load(f"{path}_critic.pth"))

# Optuna objective for PPO

In [None]:
import torch
import numpy as np

def ppo_objective(trial):
    """
    Optuna objective function to optimize PPO hyperparameters.

    Trains for 500 episodes and returns the average reward from 50 greedy
    evaluation episodes. Transitions accumulate across episodes and a policy
    update runs whenever at least one full minibatch has been collected.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        Mean total reward over the evaluation episodes.
    """
    # Hyperparameter Search Space
    lr = trial.suggest_float("lr", 1e-5, 1e-3, log=True)
    gamma = trial.suggest_float("gamma", 0.9, 0.999)
    clip_epsilon = trial.suggest_float("clip_epsilon", 0.1, 0.3)
    ppo_epochs = trial.suggest_int("ppo_epochs", 5, 20)
    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128, 256])
    gae_lambda = trial.suggest_float("gae_lambda", 0.9, 0.99)
    fc1_dims = trial.suggest_categorical("fc1_dims", [64, 128, 256, 512])
    fc2_dims = trial.suggest_categorical("fc2_dims", [64, 128, 256, 512])

    env = RoutingEnv()

    agent = PPOAgent(
        input_dims=env.observation_space.shape[0],
        n_actions=env.action_space.n,
        lr=lr,
        gamma=gamma,
        clip_epsilon=clip_epsilon,
        ppo_epochs=ppo_epochs,
        minibatch_size=batch_size,
        gae_lambda=gae_lambda,
        fc1_dims=fc1_dims,
        fc2_dims=fc2_dims
    )

    num_training_episodes = 500
    max_steps = 100

    print(f"Trial {trial.number}: Starting PPO training with params: {trial.params}")

    # Training Loop
    for _ in range(num_training_episodes):
        state, _ = env.reset()
        done = False
        step_count = 0

        while not done and step_count < max_steps:
            action, log_prob = agent.choose_action(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            agent.store_transition(state, action, reward, next_state, done, log_prob)

            state = next_state
            step_count += 1

        # An episode holds at most max_steps (100) transitions while batch_size
        # can be up to 256, so a per-episode update would hand learn() less
        # than one minibatch. Keep accumulating across episodes instead;
        # learn() empties the buffer once it has run.
        if len(agent.memory) >= batch_size:
             agent.learn()

    # Evaluation Loop
    num_eval_episodes = 50
    eval_rewards = []

    print(f"Trial {trial.number}: Starting PPO evaluation...")

    for _ in range(num_eval_episodes):
        eval_state, _ = env.reset()
        eval_done = False
        eval_episode_reward = 0
        eval_step_count = 0

        while not eval_done and eval_step_count < max_steps:
            # Greedy action selection: argmax over the actor's logits, no sampling
            with torch.no_grad():
                logits = agent.actor(torch.tensor(np.array([eval_state]), dtype=torch.float))
                eval_action = torch.argmax(logits).item()

            eval_next, eval_reward, term, trunc, _ = env.step(eval_action)
            eval_done = term or trunc
            eval_state = eval_next
            eval_episode_reward += eval_reward
            eval_step_count += 1

        eval_rewards.append(eval_episode_reward)

    avg_eval_reward = np.mean(eval_rewards)
    print(f"Trial {trial.number} Finished. Average Reward: {avg_eval_reward:.2f}")

    return avg_eval_reward

## Run Optuna Study for PPO

In [None]:
import optuna

# Hyperparameter search for PPO: maximise the value returned by
# ppo_objective over a fixed number of trials.
n_trials_ppo = 50
study_ppo = optuna.create_study(direction="maximize")

print("Starting Optuna study for PPO...")
study_ppo.optimize(ppo_objective, n_trials=n_trials_ppo)
print("Optuna study for PPO finished.")

# Report the best trial's objective value and its hyperparameters
print("\nBest trial for PPO:")
trial_ppo = study_ppo.best_trial
print(f"  Value: {trial_ppo.value}")
print("  Best Hyperparameters: ")
for key in trial_ppo.params:
    value = trial_ppo.params[key]
    print(f"    {key}: {value}")

## Train PPO with Best Hyperparameters


In [None]:
import time
import numpy as np
from collections import deque

# --- PPO TRAINING CONFIGURATION ---
num_episodes = 10000
max_steps = 100              # per-episode step cap, applied to training AND evaluation
ROLLOUT_BUFFER_SIZE = 2048   # transitions collected before each policy update
report_interval = 50         # episodes between progress printouts
eval_interval = 100          # episodes between greedy evaluations / checkpoints
num_eval_episodes = 20
best_model_path = "best_ppo_routing_model"

env = RoutingEnv()

# Load hyperparameters from Optuna or use defaults
try:
    params = study_ppo.best_trial.params
except NameError:
    params = {
        'lr': 3e-4, 'gamma': 0.99, 'clip_epsilon': 0.2, 'ppo_epochs': 10,
        'batch_size': 64, 'gae_lambda': 0.95, 'fc1_dims': 256, 'fc2_dims': 256
    }

agent_ppo = PPOAgent(
    input_dims=env.observation_space.shape[0],
    n_actions=env.action_space.n,
    lr=params['lr'],
    gamma=params['gamma'],
    clip_epsilon=params['clip_epsilon'],
    ppo_epochs=params['ppo_epochs'],
    minibatch_size=params['batch_size'],
    gae_lambda=params['gae_lambda'],
    fc1_dims=params['fc1_dims'],
    fc2_dims=params['fc2_dims']
)

# Tracking Variables
total_steps = 0
best_avg_eval_reward = -np.inf
episode_rewards = deque(maxlen=report_interval)  # sliding window for the progress report
actor_losses, critic_losses = [], []
# Best-so-far evaluation reward at each checkpoint, in the same layout as
# encore_convergence_data, so the convergence plot can include PPO.
ppo_convergence_data = {'episode': [], 'best_eval_reward': []}

print(f"Starting PPO training | Buffer Size: {ROLLOUT_BUFFER_SIZE} | Batch: {params['batch_size']}")

start_time = time.time()

for episode in range(num_episodes):
    state, _ = env.reset()
    done = False
    episode_reward = 0
    step_count = 0

    while not done and step_count < max_steps:
        action, log_prob = agent_ppo.choose_action(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        agent_ppo.store_transition(state, action, reward, next_state, done, log_prob)

        state = next_state
        episode_reward += reward
        total_steps += 1
        step_count += 1

        # Update policy only after the rollout buffer is full
        if len(agent_ppo.memory) >= ROLLOUT_BUFFER_SIZE:
            a_loss, c_loss = agent_ppo.learn()
            actor_losses.append(a_loss)
            critic_losses.append(c_loss)

    episode_rewards.append(episode_reward)

    # Periodic Progress Report
    if (episode + 1) % report_interval == 0:
        avg_rew = np.mean(episode_rewards)
        curr_al = actor_losses[-1] if actor_losses else 0.0
        print(f"Ep {episode+1}/{num_episodes} | Steps: {total_steps} | Avg Reward: {avg_rew:.2f} | Actor Loss: {curr_al:.4f}")

    # Evaluation and Model Checkpointing
    if (episode + 1) % eval_interval == 0:
        eval_rewards = []
        for _ in range(num_eval_episodes):
            eval_state, _ = env.reset()
            eval_done, eval_rew, eval_steps = False, 0, 0
            # Same step cap as training: a greedy policy that never reaches a
            # terminal state must not hang the run.
            while not eval_done and eval_steps < max_steps:
                with torch.no_grad():
                    logits = agent_ppo.actor(torch.tensor(np.array([eval_state]), dtype=torch.float))
                    eval_action = torch.argmax(logits).item()
                eval_next, r, term, trunc, _ = env.step(eval_action)
                eval_done = term or trunc
                eval_state = eval_next
                eval_rew += r
                eval_steps += 1
            eval_rewards.append(eval_rew)

        current_avg_reward = np.mean(eval_rewards)
        print(f"--- Eval at Ep {episode + 1} | Avg Reward: {current_avg_reward:.2f} ---")

        # Checkpoint only on strict improvement of the evaluation reward
        if current_avg_reward > best_avg_eval_reward:
            best_avg_eval_reward = current_avg_reward
            agent_ppo.save_models(best_model_path)
            print(f"New Best PPO Model Saved ({best_avg_eval_reward:.2f})")

        ppo_convergence_data['episode'].append(episode + 1)
        ppo_convergence_data['best_eval_reward'].append(best_avg_eval_reward)

print(f"\nPPO Training Complete | Time: {time.time() - start_time:.2f}s")

## Evaluate PPO


In [None]:
import torch
import numpy as np

# --- PPO TEST CONFIGURATION ---
num_test_episodes = 1000
best_model_path = "best_ppo_routing_model"
env = RoutingEnv()

# Initialize Agent for Testing
# Only the layer widths matter here: they must match the saved checkpoint.
try:
    params = study_ppo.best_trial.params
    agent_test = PPOAgent(
        input_dims=env.observation_space.shape[0],
        n_actions=env.action_space.n,
        fc1_dims=params['fc1_dims'],
        fc2_dims=params['fc2_dims']
    )
    print("Initialized PPO agent using best_params.")
except NameError:
    # No Optuna study in this session: fall back to fixed layer widths.
    print("best_params not found. Using default FC dims (512, 128).")
    agent_test = PPOAgent(
        input_dims=env.observation_space.shape[0],
        n_actions=env.action_space.n,
        fc1_dims=512, fc2_dims=128
    )

# Load Trained Weights
agent_test.load_models(best_model_path)
print(f"Loaded PPO model from: {best_model_path}")

# Metric Storage: one list per tracked quantity, one entry per episode
metrics = {
    name: []
    for name in ("rewards", "rtt", "throughput", "jitter", "hops", "success_flags")
}

# metrics key -> key of the final step's info dict it is read from
metric_sources = {
    "hops": "successful_hops",
    "rtt": "rtt_latency",
    "throughput": "throughput_proxy",
    "jitter": "jitter_std_dev",
}

max_steps = env.max_ttl * 2 + 5

print(f"\nStarting PPO Testing ({num_test_episodes} episodes)...")

for episode in range(num_test_episodes):
    state, _ = env.reset()
    done = False
    episode_reward = 0
    step_count = 0

    while step_count < max_steps and not done:
        # Greedy inference: argmax over the actor's logits, no sampling
        with torch.no_grad():
            logits = agent_test.actor(torch.tensor(np.array([state]), dtype=torch.float))
            action = torch.argmax(logits).item()

        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        episode_reward += reward
        step_count += 1
        state = next_state

    # Log results: success requires a finished episode whose reason is "Goal Reached"
    if done:
        is_success = info.get('terminal_reason') == "Goal Reached"
    else:
        is_success = False
    metrics["rewards"].append(episode_reward)
    metrics["success_flags"].append(is_success)

    # Store per-episode metrics (will be filtered for success in summary); NaN when absent
    for metric_name, info_key in metric_sources.items():
        metrics[metric_name].append(info.get(info_key, np.nan))

    if (episode + 1) % 100 == 0:
        print(f"Episode {episode + 1}/{num_test_episodes} | Success: {is_success}")

# --- FINAL SUMMARY REPORT ---
success_indices = np.array(metrics["success_flags"], dtype=bool)
avg_reward = np.mean(metrics["rewards"])
success_rate = np.mean(metrics["success_flags"])

print(f"\n--- PPO Model Testing Summary ---")
print(f"Average Total Reward: {avg_reward:.2f}")
print(f"Success Rate: {success_rate:.2%}")

if not success_indices.any():
    print("No successful episodes recorded.")
else:
    print(f"Average RTT (Successful): {np.nanmean(np.array(metrics['rtt'])[success_indices]):.2f}")
    print(f"Average Hops (Successful): {np.nanmean(np.array(metrics['hops'])[success_indices]):.2f}")
    print(f"Average Throughput Proxy (Successful): {np.nanmean(np.array(metrics['throughput'])[success_indices]):.4f}")
    print(f"Average Jitter (Successful): {np.nanmean(np.array(metrics['jitter'])[success_indices]):.4f}")

print("PPO model testing finished.")

In [None]:
import matplotlib.pyplot as plt

# --- CONVERGENCE VISUALIZATION ---
# Plots the best-so-far evaluation reward against the training episode for
# every agent whose training cell recorded convergence data in this session.

plt.figure(figsize=(10, 6))

# (legend label, name of the module-level dict holding 'episode' and
# 'best_eval_reward' lists). Names are looked up defensively so that a
# training cell that was skipped, or that does not record this data, does not
# abort the whole plot with a NameError.
convergence_sources = (
    ('SAC', 'sac_convergence_data'),
    ('Encore RL', 'encore_convergence_data'),
    ('PPO', 'ppo_convergence_data'),
)

plotted_any = False
for label, var_name in convergence_sources:
    data = globals().get(var_name)
    if not data or not data.get('episode'):
        print(f"No convergence data for {label}; skipping.")
        continue
    plt.plot(data['episode'], data['best_eval_reward'], label=label, linewidth=2)
    plotted_any = True

# Plot Styling
plt.xlabel('Episode', fontsize=12)
plt.ylabel('Best Average Evaluation Reward', fontsize=12)
plt.title('Agent Convergence Comparison', fontsize=14)
if plotted_any:
    # legend() has nothing to show (and warns) when no labelled curve was drawn
    plt.legend()
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()

plt.show()

# GPSR

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# --- GPSR BASELINE SETUP ---

# Planar (x, y) position for each node ID. Passed as `coords` to
# gpsr_choose_action, which compares Euclidean distances to the destination.
# NOTE(review): the role labels below presumably assume traffic from node 0 to
# node 4 — confirm against how RoutingEnv picks source/destination.
node_coordinates = {
    0: (0, 50),    # Source
    1: (25, 75),   # Top Path
    2: (25, 25),   # Bottom Path
    3: (50, 50),   # Middle Hop
    4: (100, 50)   # Destination
}

def gpsr_choose_action(current_node, destination_node, topology, coords):
    """
    GPSR greedy forwarding: pick the neighbor strictly closest to the destination.

    Args:
        current_node: ID of the node currently holding the packet.
        destination_node: ID of the node the packet is addressed to.
        topology: Mapping of node ID -> iterable of neighbor node IDs.
        coords: Mapping of node ID -> (x, y) position.

    Returns:
        The ID of the neighbor with the smallest Euclidean distance to the
        destination, provided it is strictly closer than ``current_node``
        itself (on ties the neighbor listed first wins). Returns the string
        'drop' when the packet is already at the destination, when the
        position of the destination or of the current node is unknown, or
        when no neighbor makes progress (local minimum; no perimeter-mode
        recovery is attempted).
    """
    if current_node == destination_node:
        return 'drop'
    # Without both endpoints' positions there is no distance to improve on.
    if destination_node not in coords or current_node not in coords:
        return 'drop'

    dest_pos = coords[destination_node]

    def distance_to_destination(node_id):
        node_pos = coords[node_id]
        return np.sqrt((node_pos[0] - dest_pos[0])**2 + (node_pos[1] - dest_pos[1])**2)

    best_neighbor = 'drop'
    # Start from our own distance so only neighbors that make progress qualify.
    best_distance = distance_to_destination(current_node)

    for neighbor in topology.get(current_node, []):
        if neighbor not in coords:
            # A neighbor with no known position cannot be ranked; ignore it.
            continue
        candidate_distance = distance_to_destination(neighbor)
        if candidate_distance < best_distance:
            best_distance = candidate_distance
            best_neighbor = neighbor

    return best_neighbor

# --- GPSR TESTING LOOP ---
# Runs the greedy GPSR policy in RoutingEnv and records per-episode results.

env = RoutingEnv()
num_test_episodes = 1000
max_steps = env.max_ttl * 2 + 5  # hard cap on steps per episode

# Stored under `gpsr_metrics` because the comparison cell only picks up
# per-agent results by that name; `metrics` stays as an alias so the summary
# report that follows keeps working.
gpsr_metrics = {
    "rewards": [],
    "success_flags": [],
    "rtt": [],
    "hops": []
}
metrics = gpsr_metrics

print(f"Starting GPSR Testing ({num_test_episodes} episodes)...")

for episode in range(num_test_episodes):
    state, _ = env.reset()
    done = False
    episode_reward = 0
    step_count = 0
    info = {}  # never fall back on a previous episode's info after the loop

    # Derive IDs from One-Hot state vector
    current_node = np.argmax(state[:env.total_nodes])
    destination_node = np.argmax(state[env.total_nodes:2*env.total_nodes])

    while not done and step_count < max_steps:
        # GPSR Decision
        chosen_node = gpsr_choose_action(current_node, destination_node, env.network_topology, node_coordinates)

        # Map Node ID to Env Action Index
        # NOTE(review): env.get_action_space is defined outside this cell; the
        # fallback index len(actions) is presumably the env's drop action - confirm.
        actions = env.get_action_space(current_node)
        action_idx = actions.index(chosen_node) if chosen_node in actions else len(actions)

        next_state, reward, terminated, truncated, info = env.step(action_idx)
        done = terminated or truncated

        # Only advance our local position when the env reports the hop went through.
        if info.get('link_successful'):
            current_node = np.argmax(next_state[:env.total_nodes])

        episode_reward += reward
        step_count += 1

    # Store results
    # An episode cut off by the step cap (done still False) never counts as a success.
    is_success = bool(done) and info.get('terminal_reason') == "Goal Reached"
    gpsr_metrics["rewards"].append(episode_reward)
    gpsr_metrics["success_flags"].append(is_success)
    gpsr_metrics["rtt"].append(info.get("rtt_latency", np.nan))
    gpsr_metrics["hops"].append(info.get("successful_hops", np.nan))

# --- SUMMARY REPORT ---
success_indices = np.array(metrics["success_flags"], dtype=bool)
avg_reward = np.mean(metrics["rewards"])
success_rate = np.mean(metrics["success_flags"])


def _gpsr_success_mean(metric_key):
    """Return the NaN-ignoring mean of one metric over the successful episodes."""
    return np.nanmean(np.array(metrics[metric_key])[success_indices])


# Headline numbers cover every episode.
print(f"\n--- GPSR Baseline Summary ---")
print(f"Average Reward: {avg_reward:.2f}")
print(f"Success Rate: {success_rate:.2%}")

# RTT and hop count are only reported over episodes that reached the goal.
if success_indices.any():
    print(f"Avg RTT (Successful): {_gpsr_success_mean('rtt'):.2f}")
    print(f"Avg Hops (Successful): {_gpsr_success_mean('hops'):.2f}")

print("GPSR testing finished.")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

def aggregate_metrics(agent_name, metrics_dict):
    """
    Flatten one agent's raw test metrics into long-format rows for plotting.

    Args:
        agent_name: Label written to the 'Agent' field of every row.
        metrics_dict: Mapping that may contain 'success_flags', 'rtt',
            'throughput' and 'rewards', each a per-episode sequence.

    Returns:
        A list of ``{'Agent', 'Metric', 'Value'}`` dicts, in this order:
        RTT and throughput of successful episodes (NaNs removed), a single
        'Success Rate' row in percent, then one 'Total Reward' row per
        episode. When no success flags are available the success-dependent
        rows are omitted, because there is no way to tell which episodes
        succeeded.

    Raises:
        IndexError: If 'success_flags' is non-empty but its length differs
            from the 'rtt' / 'throughput' sequence it is applied to.
    """
    rows = []

    def add_rows(metric_label, values):
        rows.extend({'Agent': agent_name, 'Metric': metric_label, 'Value': v} for v in values)

    success_flags = np.asarray(metrics_dict.get('success_flags', []), dtype=bool)

    # An empty mask cannot index a non-empty metric array, so everything that
    # depends on success information is gated on having flags at all.
    if success_flags.size > 0:
        # 1. RTT and Throughput (Successful Episodes Only)
        for key, metric_label in (('rtt', 'RTT / Latency'), ('throughput', 'Throughput')):
            if key in metrics_dict:
                successful = np.asarray(metrics_dict[key], dtype=float)[success_flags]
                add_rows(metric_label, successful[~np.isnan(successful)])

        # 2. Success Rate (a single percentage point for the bar chart)
        add_rows('Success Rate', [np.mean(success_flags) * 100])

    # 3. Total Rewards (All Episodes)
    if 'rewards' in metrics_dict:
        add_rows('Total Reward', metrics_dict['rewards'])

    return rows

# --- Data Collection ---
all_plot_data = []

# (plot label, name of the notebook variable expected to hold that agent's
# test metrics). Agents whose variable does not exist are left out.
agent_metric_sources = [
    ('SAC', 'sac_metrics'),
    ('Encore RL', 'encore_rl_metrics'),
    ('PPO', 'ppo_metrics'),
    ('GPSR', 'gpsr_metrics'),
]

notebook_vars = globals()
for agent_label, var_name in agent_metric_sources:
    if var_name in notebook_vars:
        all_plot_data.extend(aggregate_metrics(agent_label, notebook_vars[var_name]))

# Pin the columns so the frame keeps its schema even when no agent produced
# any rows; otherwise df_plot['Metric'] raises KeyError downstream.
df_plot = pd.DataFrame(all_plot_data, columns=['Agent', 'Metric', 'Value'])

# --- Plotting ---
# One panel per metric: a bar chart for the single success-rate value per
# agent, violin plots for the per-episode distributions.
metrics_to_show = ['Throughput', 'RTT / Latency', 'Success Rate', 'Total Reward']
fig, axes = plt.subplots(1, 4, figsize=(20, 6))

# A frame built from zero rows may have no columns at all.
has_metric_column = 'Metric' in df_plot.columns

for ax, metric in zip(axes, metrics_to_show):
    subset = df_plot[df_plot['Metric'] == metric] if has_metric_column else df_plot

    if subset.empty:
        # Nothing recorded for this metric: label the panel instead of
        # handing seaborn an empty frame.
        ax.text(0.5, 0.5, 'No data', ha='center', va='center', transform=ax.transAxes)
    elif metric == 'Success Rate':
        # Bar plot for categorical comparison.
        # hue mirrors x because seaborn deprecates palette without hue;
        # dodge=False keeps each bar centred on its category.
        sns.barplot(x='Agent', y='Value', hue='Agent', data=subset, ax=ax,
                    palette='viridis', dodge=False)
        ax.set_ylabel('Success Rate (%)')
        ax.set_ylim(0, 100)
    else:
        # Violin plot for distribution comparison
        sns.violinplot(x='Agent', y='Value', hue='Agent', data=subset, ax=ax,
                       inner='box', palette='viridis', dodge=False)
        ax.set_ylabel(f'{metric} Distribution')

    # The hue legend would only repeat the x-axis labels.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()

    ax.set_title(f'Agent {metric}')
    ax.set_xlabel('Agent')

plt.tight_layout()
plt.show()