In [None]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import numpy as np
import time

# Compatibility shim: recreate the np.bool8 alias when this NumPy build does not expose it.
if not hasattr(np, 'bool8'):
    np.bool8 = np.bool_

# --- Training configuration ---
episodes = 1000        # number of training episodes
hidden_size = 128      # width of the hidden layer in both networks
learning_rate = 1e-3   # Adam step size used for actor and critic
gamma = 0.99           # discount factor applied to future rewards

# --- Environment and the sizes of its observation / action spaces ---
env = gym.make("CartPole-v1")
state_dim, action_dim = env.observation_space.shape[0], env.action_space.n

# Actor Network
class Actor(nn.Module):
    """Policy network that maps a state vector to action probabilities.

    Layers: Linear(state_dim -> hidden_size), ReLU, Linear(hidden_size -> action_dim),
    Softmax over the last dimension.
    """

    def __init__(self):
        super().__init__()
        hidden_layer = nn.Linear(state_dim, hidden_size)
        output_layer = nn.Linear(hidden_size, action_dim)
        # One Sequential stored under `fc`, so parameter names stay fc.0.* and fc.2.*
        self.fc = nn.Sequential(hidden_layer, nn.ReLU(), output_layer, nn.Softmax(dim=-1))

    def forward(self, state):
        """Return the action probabilities the network assigns for `state`."""
        action_probs = self.fc(state)
        return action_probs

# Critic Network
class Critic(nn.Module):
    """State-value network that maps a state vector to a single scalar estimate.

    Layers: Linear(state_dim -> hidden_size), ReLU, Linear(hidden_size -> 1).
    """

    def __init__(self):
        super().__init__()
        layers = [
            nn.Linear(state_dim, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, state):
        """Return the value estimate for `state` (last dimension has size 1)."""
        state_value = self.fc(state)
        return state_value

# Initialize networks and optimizers
# Build the actor first, then the critic, each with its own Adam optimizer.
actor, critic = Actor(), Critic()
actor_optimizer, critic_optimizer = (
    optim.Adam(network.parameters(), lr=learning_rate) for network in (actor, critic)
)

# Training loop
start = time.time()
for episode in range(episodes):
    # Uses the classic gym API as written here: reset() returns only the observation
    # and step() returns a 4-tuple (obs, reward, done, info).
    state = torch.FloatTensor(env.reset())
    log_probs = []
    values = []
    rewards = []
    done = False
    total_reward = 0

    # ---- Roll out one full episode with the current policy ----
    while not done:
        # Sample an action from the actor's categorical distribution
        probs = actor(state)
        dist = Categorical(probs)
        action = dist.sample()

        # Keep the graph-attached log-probability and value estimate for the update below
        log_prob = dist.log_prob(action)
        value = critic(state)

        # Advance the environment
        next_state, reward, done, _ = env.step(action.item())
        next_state = torch.FloatTensor(next_state)

        log_probs.append(log_prob)
        values.append(value)
        rewards.append(reward)
        total_reward += reward

        state = next_state

    # ---- Discounted Monte-Carlo returns (append + reverse is O(T); insert(0, ...) was O(T^2)) ----
    returns = []
    G = 0
    for r in reversed(rewards):
        G = r + gamma * G
        returns.append(G)
    returns.reverse()

    returns = torch.FloatTensor(returns)
    # Each critic output has shape (1,), so cat already yields shape (T,).
    # The previous .squeeze() turned a one-step episode into a 0-d tensor,
    # which no longer matched `returns` in mse_loss.
    values = torch.cat(values)
    log_probs = torch.stack(log_probs)
    advantage = returns - values.detach()

    # Actor loss (policy gradient weighted by the detached advantage)
    actor_loss = -(log_probs * advantage).mean()

    # Critic loss (regress value estimates onto the observed returns)
    critic_loss = nn.functional.mse_loss(values, returns)

    # Update actor
    actor_optimizer.zero_grad()
    actor_loss.backward()
    actor_optimizer.step()

    # Update critic
    critic_optimizer.zero_grad()
    critic_loss.backward()
    critic_optimizer.step()

    # Optional progress logging (disabled in the original run):
    # if (episode + 1) % 10 == 0:
    #     print(f"Episode {episode+1}, Total Reward: {total_reward}")

env.close()
print("Time taken:",time.time()-start)

  deprecation(
  deprecation(


Time taken: 326.27266669273376


In [None]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

# Hyperparameters
# Optimisation settings
learning_rate, lambda_lr = 1e-3, 1e-2   # Adam step sizes: networks / Lagrange multiplier
gamma = 0.99                            # discount factor
episodes = 1000
hidden_size = 128

# Episode cost budget that the multiplier update compares against
constraint_threshold = 2.0

# CartPole environment and its space sizes
env = gym.make("CartPole-v1")
state_dim, action_dim = env.observation_space.shape[0], env.action_space.n

# Networks
class Actor(nn.Module):
    """Policy network: state vector in, softmax action probabilities out."""

    def __init__(self):
        super().__init__()
        layers = [
            nn.Linear(state_dim, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, action_dim),
            nn.Softmax(dim=-1),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, state):
        """Return the probability assigned to each action for `state`."""
        action_probs = self.model(state)
        return action_probs

class Critic(nn.Module):
    """Value network: state vector in, one scalar estimate out."""

    def __init__(self):
        super().__init__()
        hidden_layer = nn.Linear(state_dim, hidden_size)
        value_head = nn.Linear(hidden_size, 1)
        self.model = nn.Sequential(hidden_layer, nn.ReLU(), value_head)

    def forward(self, state):
        """Return the value estimate for `state` (last dimension has size 1)."""
        state_value = self.model(state)
        return state_value

# Initialize
# Networks (actor is built before the critic) and one Adam optimizer per network
actor, critic = Actor(), Critic()
actor_optimizer, critic_optimizer = (
    optim.Adam(network.parameters(), lr=learning_rate) for network in (actor, critic)
)

# Dual variable for the cost constraint: a trainable scalar starting at 0,
# updated by its own Adam optimizer with step size lambda_lr.
lambda_c = torch.tensor(0.0, requires_grad=True)
lambda_optimizer = optim.Adam(params=[lambda_c], lr=lambda_lr)

# Training
for episode in range(episodes):
    # Uses the classic gym API as written here: reset() returns only the observation
    # and step() returns a 4-tuple (obs, reward, done, info).
    state = torch.FloatTensor(env.reset())
    log_probs, values, rewards, costs = [], [], [], []
    done = False
    total_reward, total_cost = 0, 0

    # ---- Roll out one full episode with the current policy ----
    while not done:
        # Select action
        probs = actor(state)
        dist = Categorical(probs)
        action = dist.sample()

        # Step in env
        next_state, reward, done, _ = env.step(action.item())
        next_state = torch.FloatTensor(next_state)

        # Constraint cost: |x| of the state the action was taken in (deviation of x from 0)
        x = state[0].item()
        cost = abs(x)

        # Store data
        log_probs.append(dist.log_prob(action))
        values.append(critic(state))
        rewards.append(reward)
        costs.append(cost)
        total_reward += reward
        total_cost += cost

        state = next_state

    # ---- Discounted reward and cost returns (append + reverse keeps this O(T)) ----
    returns, G = [], 0
    for r in reversed(rewards):
        G = r + gamma * G
        returns.append(G)
    returns.reverse()

    cost_returns, C = [], 0
    for c in reversed(costs):
        C = c + gamma * C
        cost_returns.append(C)
    cost_returns.reverse()

    returns = torch.FloatTensor(returns)
    cost_returns = torch.FloatTensor(cost_returns)
    # cat already yields shape (T,); the previous .squeeze() collapsed one-step
    # episodes to a 0-d tensor that no longer matched `returns`.
    values = torch.cat(values)
    log_probs = torch.stack(log_probs)

    # Advantages: critic baseline for reward, mean-return baseline for cost
    advantage = returns - values.detach()
    cost_advantage = cost_returns - cost_returns.mean()

    # Policy loss (Lagrangian); lambda is detached so the actor step never updates it
    actor_loss = -(log_probs * (advantage - lambda_c.detach() * cost_advantage)).mean()

    # Critic loss
    critic_loss = nn.functional.mse_loss(values, returns)

    # Optimize actor and critic
    actor_optimizer.zero_grad()
    actor_loss.backward()
    actor_optimizer.step()

    critic_optimizer.zero_grad()
    critic_loss.backward()
    critic_optimizer.step()

    # Update the multiplier: minimising -lambda * violation is gradient ascent on lambda
    constraint_violation = total_cost - constraint_threshold
    lambda_loss = -lambda_c * constraint_violation

    lambda_optimizer.zero_grad()
    lambda_loss.backward()
    lambda_optimizer.step()

    # Ensure lambda is non-negative
    lambda_c.data.clamp_(0)

    # (The dangling `rew_store =` statement was removed: it was a SyntaxError
    # that prevented the whole cell from parsing.)

    # Logging
    if (episode + 1) % 10 == 0:
        print(f"Episode {episode+1}, Reward: {total_reward:.1f}, Cost: {total_cost:.2f}, Lambda: {lambda_c.item():.3f}")

env.close()

  deprecation(
  deprecation(


Episode 10, Reward: 16.0, Cost: 1.77, Lambda: 0.000
Episode 20, Reward: 15.0, Cost: 1.37, Lambda: 0.000
Episode 30, Reward: 16.0, Cost: 1.35, Lambda: 0.000
Episode 40, Reward: 11.0, Cost: 0.52, Lambda: 0.000
Episode 50, Reward: 51.0, Cost: 3.56, Lambda: 0.000
Episode 60, Reward: 19.0, Cost: 0.35, Lambda: 0.005
Episode 70, Reward: 23.0, Cost: 1.33, Lambda: 0.009
Episode 80, Reward: 30.0, Cost: 0.57, Lambda: 0.000
Episode 90, Reward: 25.0, Cost: 0.71, Lambda: 0.001
Episode 100, Reward: 32.0, Cost: 0.53, Lambda: 0.000
Episode 110, Reward: 18.0, Cost: 0.54, Lambda: 0.015
Episode 120, Reward: 91.0, Cost: 10.48, Lambda: 0.008
Episode 130, Reward: 63.0, Cost: 5.05, Lambda: 0.059
Episode 140, Reward: 23.0, Cost: 0.40, Lambda: 0.120
Episode 150, Reward: 31.0, Cost: 1.54, Lambda: 0.172
Episode 160, Reward: 52.0, Cost: 2.51, Lambda: 0.220
Episode 170, Reward: 72.0, Cost: 7.36, Lambda: 0.286
Episode 180, Reward: 61.0, Cost: 3.07, Lambda: 0.345
Episode 190, Reward: 44.0, Cost: 1.07, Lambda: 0.397
E

In [None]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import numpy as np
import pandas as pd
# Compatibility shim: recreate the np.bool8 alias when this NumPy build does not expose it.
if not hasattr(np, 'bool8'):
    np.bool8 = np.bool_

# --- Training configuration ---
episodes = 1000
hidden_size = 128
learning_rate = 1e-3
gamma = 0.99
lambda_fixed = 50   # fixed Lagrange multiplier
b = 100             # constraint buffer

# --- Environment and its space sizes ---
env = gym.make("CartPole-v1")
state_dim, action_dim = env.observation_space.shape[0], env.action_space.n

# Actor Network
class Actor(nn.Module):
    """Policy network producing softmax action probabilities from a state vector."""

    def __init__(self):
        super().__init__()
        hidden_layer = nn.Linear(state_dim, hidden_size)
        output_layer = nn.Linear(hidden_size, action_dim)
        self.model = nn.Sequential(hidden_layer, nn.ReLU(), output_layer, nn.Softmax(dim=-1))

    def forward(self, state):
        """Return the action probabilities for `state`."""
        action_probs = self.model(state)
        return action_probs

# Critic for reward
class CriticReward(nn.Module):
    """Value network used for the reward signal: state vector in, one scalar out."""

    def __init__(self):
        super().__init__()
        layers = [
            nn.Linear(state_dim, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, state):
        """Return the reward-value estimate for `state`."""
        reward_value = self.model(state)
        return reward_value

# Critic for constraint
class CriticConstraint(nn.Module):
    """Value network used for the constraint cost: state vector in, one scalar out."""

    def __init__(self):
        super().__init__()
        hidden_layer = nn.Linear(state_dim, hidden_size)
        cost_head = nn.Linear(hidden_size, 1)
        self.model = nn.Sequential(hidden_layer, nn.ReLU(), cost_head)

    def forward(self, state):
        """Return the cost-value estimate for `state`."""
        cost_value = self.model(state)
        return cost_value

# Initialize networks
# Networks are created in the order actor, reward critic, constraint critic
actor, critic_r, critic_c = Actor(), CriticReward(), CriticConstraint()

# One Adam optimizer per network, all with the same learning rate
actor_optimizer, critic_r_optimizer, critic_c_optimizer = (
    optim.Adam(network.parameters(), lr=learning_rate)
    for network in (actor, critic_r, critic_c)
)

# Per-episode totals collected during training
rew_vect, cost_vect = [], []
# Training loop
def discounted_returns(rewards_list):
    """Return the discounted return-to-go for every step as a float tensor.

    Defined once ahead of the loop (it used to be redefined every episode) and
    built with append + reverse so the work is O(T) rather than O(T^2).
    """
    returns, G = [], 0
    for r in reversed(rewards_list):
        G = r + gamma * G
        returns.append(G)
    returns.reverse()
    return torch.FloatTensor(returns)


for episode in range(episodes):
    # Uses the classic gym API as written here: reset() returns only the observation
    # and step() returns a 4-tuple (obs, reward, done, info).
    state = torch.FloatTensor(env.reset())
    log_probs = []
    rewards, reward_values, constraint_values = [], [], []
    constraint_costs = []
    total_reward, total_cost = 0, 0
    done = False

    # ---- Roll out one full episode with the current policy ----
    while not done:
        # Actor chooses action
        probs = actor(state)
        dist = Categorical(probs)
        action = dist.sample()

        next_state, reward, done, _ = env.step(action.item())
        next_state = torch.FloatTensor(next_state)

        # Constraint cost: |cart x position| of the state the action was taken in
        x_pos = state[0].item()
        cost = abs(x_pos)

        # Save values
        rewards.append(reward)
        constraint_costs.append(cost)
        log_probs.append(dist.log_prob(action))
        reward_values.append(critic_r(state))
        constraint_values.append(critic_c(state))

        total_reward += reward
        total_cost += cost

        state = next_state

    returns_r = discounted_returns(rewards)
    returns_c = discounted_returns(constraint_costs)

    # cat already yields shape (T,). The previous .squeeze() produced a 0-d tensor
    # for one-step episodes, which cannot be iterated and mismatched the returns.
    values_r = torch.cat(reward_values)
    values_c = torch.cat(constraint_values)
    log_probs = torch.stack(log_probs)

    # Advantages against each critic's (detached) estimate
    advantage_r = returns_r - values_r.detach()
    advantage_c = returns_c - values_c.detach()

    # Per-step rule: where V_r / lambda > V_c - b follow the reward advantage,
    # otherwise follow the negated cost advantage (penalise the constraint).
    use_reward = (values_r.detach() / lambda_fixed) > (values_c.detach() - b)
    chosen_advantages = torch.where(use_reward, advantage_r, -advantage_c)

    # Policy loss
    actor_loss = -(log_probs * chosen_advantages).mean()

    # Value losses
    critic_r_loss = nn.functional.mse_loss(values_r, returns_r)
    critic_c_loss = nn.functional.mse_loss(values_c, returns_c)

    # Update actor
    actor_optimizer.zero_grad()
    actor_loss.backward()
    actor_optimizer.step()

    # Update critics
    critic_r_optimizer.zero_grad()
    critic_r_loss.backward()
    critic_r_optimizer.step()

    critic_c_optimizer.zero_grad()
    critic_c_loss.backward()
    critic_c_optimizer.step()

    rew_vect.append(total_reward)
    cost_vect.append(total_cost)

    # Logging
    if (episode + 1) % 10 == 0:
        print(f"Ep {episode+1} | Reward: {total_reward:.1f} | Cost: {total_cost:.2f} | Actor Loss: {actor_loss.item():.3f}")

env.close()

# Export the per-episode reward / cost totals
datafr = {'vf': rew_vect, 'cost': cost_vect}
df = pd.DataFrame(datafr)
df.to_excel('tvf_and_tcf_data.xlsx')

  deprecation(
  deprecation(


Ep 10 | Reward: 38.0 | Cost: 2.64 | Actor Loss: 11.858
Ep 20 | Reward: 18.0 | Cost: 1.01 | Actor Loss: 5.943
Ep 30 | Reward: 8.0 | Cost: 0.20 | Actor Loss: 2.347
Ep 40 | Reward: 20.0 | Cost: 1.73 | Actor Loss: 5.790
Ep 50 | Reward: 14.0 | Cost: 0.79 | Actor Loss: 4.955
Ep 60 | Reward: 20.0 | Cost: 0.51 | Actor Loss: 5.691
Ep 70 | Reward: 12.0 | Cost: 0.97 | Actor Loss: 2.472
Ep 80 | Reward: 19.0 | Cost: 0.56 | Actor Loss: 5.011
Ep 90 | Reward: 11.0 | Cost: 0.46 | Actor Loss: 1.347
Ep 100 | Reward: 36.0 | Cost: 2.58 | Actor Loss: 9.630
Ep 110 | Reward: 16.0 | Cost: 0.66 | Actor Loss: 2.977
Ep 120 | Reward: 69.0 | Cost: 5.01 | Actor Loss: 17.977
Ep 130 | Reward: 12.0 | Cost: 0.44 | Actor Loss: 1.344
Ep 140 | Reward: 14.0 | Cost: 0.71 | Actor Loss: 1.619
Ep 150 | Reward: 13.0 | Cost: 0.88 | Actor Loss: 3.067
Ep 160 | Reward: 21.0 | Cost: 0.84 | Actor Loss: 4.659
Ep 170 | Reward: 43.0 | Cost: 1.42 | Actor Loss: 9.352
Ep 180 | Reward: 85.0 | Cost: 12.17 | Actor Loss: 17.692
Ep 190 | Reward:

In [None]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, clear_output
import time
import pandas as pd
from copy import deepcopy

# Compatibility fix for numpy
# Compatibility shim: recreate the np.bool8 alias when this NumPy build does not expose it.
if not hasattr(np, 'bool8'):
    np.bool8 = np.bool_

# ========== Hyperparameters ==========
episodes = 1000
hidden_dim = 256
learning_rate = 1e-3
gamma = 0.99
lambda_fixed = 20   # Lagrange multiplier (held fixed)
b = 200.0           # cost threshold buffer
perturb_eps = 1     # upper bound of the uniform noise added to observations

# ========== Environment ==========
env = gym.make("CartPole-v1")
state_dim, action_dim = env.observation_space.shape[0], env.action_space.n

# ========== Neural Networks ==========

class Actor(nn.Module):
    """Policy network: Linear -> ReLU -> Linear -> Softmax over the last dimension."""

    def __init__(self):
        super().__init__()
        layers = [
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
            nn.Softmax(dim=-1),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, state):
        """Return the action probabilities for `state`."""
        action_probs = self.model(state)
        return action_probs

class ValueCritic(nn.Module):
    """Scalar value network: Linear(state_dim -> hidden_dim) -> ReLU -> Linear(hidden_dim -> 1)."""

    def __init__(self):
        super().__init__()
        hidden_layer = nn.Linear(state_dim, hidden_dim)
        value_head = nn.Linear(hidden_dim, 1)
        self.model = nn.Sequential(hidden_layer, nn.ReLU(), value_head)

    def forward(self, state):
        """Return the value estimate for `state` (last dimension has size 1)."""
        estimate = self.model(state)
        return estimate

# ========== Networks & Optimizers ==========
# Networks are created in the order actor, reward critic, cost critic
actor, reward_critic, cost_critic = Actor(), ValueCritic(), ValueCritic()

# One Adam optimizer per network, all with the same learning rate
actor_optim, reward_optim, cost_optim = (
    optim.Adam(network.parameters(), lr=learning_rate)
    for network in (actor, reward_critic, cost_critic)
)

# ========== Utility Functions ==========

def add_uniform_noise(state, eps=0.05):
    """Return `state` plus independent per-element noise drawn from U[0, eps).

    The noise is one-sided (never negative) and has the same shape as `state`.
    A new array is returned; the input is not modified.
    """
    perturbation = np.random.uniform(low=0, high=eps, size=state.shape)
    perturbed_state = state + perturbation
    return perturbed_state

def discount(values, gamma=0.99):
    """Compute the discounted return-to-go for every position of `values`.

    Args:
        values: sequence of per-step numbers (rewards or costs), earliest first.
        gamma: discount factor applied once per step.

    Returns:
        A float32 tensor `out` of the same length, where
        out[t] = values[t] + gamma * out[t + 1] (and out[-1] = values[-1]).
        An empty input yields an empty tensor.
    """
    result = []
    G = 0
    # Accumulate back-to-front, then flip once: O(n) rather than the O(n^2)
    # cost of inserting at the head of the list on every step.
    for v in reversed(values):
        G = v + gamma * G
        result.append(G)
    result.reverse()
    return torch.tensor(result, dtype=torch.float32)

# ========== Training Loop ==========

dataF = {'cost': [], 'reward': []}
last_50_actor_params = []  # rolling window of the most recent actor state_dicts (max 50)

for ep in range(episodes):
    # Uses the classic gym API as written here: reset() returns only the observation
    # and step() returns a 4-tuple (obs, reward, done, info).
    # The agent only ever sees the noise-perturbed observation.
    state = env.reset()
    state = add_uniform_noise(np.array(state), perturb_eps)
    state = torch.FloatTensor(state)

    log_probs = []
    rewards = []
    costs = []
    reward_values = []
    cost_values = []

    total_reward = 0
    total_cost = 0
    done = False

    # ---- Roll out one full episode with the current policy ----
    while not done:
        probs = actor(state)
        dist = Categorical(probs)
        action = dist.sample()

        next_state, reward, done, _ = env.step(action.item())

        next_state = add_uniform_noise(np.array(next_state), perturb_eps)
        next_state = torch.FloatTensor(next_state)

        # Distance-based cost, computed from the (perturbed) first state component
        cost = abs(state[0].item())

        # Save transitions
        log_probs.append(dist.log_prob(action))
        rewards.append(reward)
        costs.append(cost)
        reward_values.append(reward_critic(state))
        cost_values.append(cost_critic(state))

        total_reward += reward
        total_cost += cost
        state = next_state

    # Discounted returns
    reward_returns = discount(rewards, gamma)
    cost_returns = discount(costs, gamma)

    # cat already yields shape (T,); the previous .squeeze() collapsed one-step
    # episodes to a 0-d tensor that no longer matched the returns in mse_loss.
    reward_values = torch.cat(reward_values)
    cost_values = torch.cat(cost_values)
    log_probs = torch.stack(log_probs)

    adv_r = reward_returns - reward_values.detach()
    adv_c = cost_returns - cost_values.detach()

    # Per-step rule on the returns: where R > lambda * (C - b) follow the reward
    # advantage, otherwise follow the negated cost advantage. The comparison is
    # done in float64 to match the original Python-float (.item()) comparison.
    use_reward = reward_returns.double() > lambda_fixed * (cost_returns.double() - b)
    chosen_adv = torch.where(use_reward, adv_r, -adv_c)

    # ===== Losses =====
    actor_loss = -(log_probs * chosen_adv).mean()
    reward_loss = nn.functional.mse_loss(reward_values, reward_returns)
    cost_loss = nn.functional.mse_loss(cost_values, cost_returns)

    # ===== Backprop =====
    actor_optim.zero_grad()
    actor_loss.backward()
    actor_optim.step()

    reward_optim.zero_grad()
    reward_loss.backward()
    reward_optim.step()

    cost_optim.zero_grad()
    cost_loss.backward()
    cost_optim.step()

    dataF['cost'].append(total_cost)
    dataF['reward'].append(total_reward)

    # === Store weights for last 50 actor policies ===
    if len(last_50_actor_params) >= 50:
        last_50_actor_params.pop(0)
    last_50_actor_params.append(deepcopy(actor.state_dict()))

    # Print progress
    if (ep + 1) % 50 == 0:
        print(f"Ep {ep+1} | Reward: {total_reward:.1f} | Cost: {total_cost:.2f} | Actor Loss: {actor_loss.item():.3f}")

# ========== After Training ==========

# Save data
# Release the environment and export the per-episode cost / reward totals
env.close()
df = pd.DataFrame(dataF)
df.to_excel('tvf_and_tcf_data_with_uncertainity.xlsx')

# Persist the final weights of all three networks
torch.save(actor.state_dict(), 'actor.pth')
torch.save(reward_critic.state_dict(), 'reward_critic.pth')
torch.save(cost_critic.state_dict(), 'cost_critic.pth')

# Element-wise mean of the stored actor snapshots: start from a copy of the
# oldest one, add the others in order, then divide by the snapshot count.
num_snapshots = len(last_50_actor_params)
avg_actor_state_dict = deepcopy(last_50_actor_params[0])
for key in avg_actor_state_dict.keys():
    for later_snapshot in last_50_actor_params[1:]:
        avg_actor_state_dict[key] += later_snapshot[key]
    avg_actor_state_dict[key] /= num_snapshots

# Load the averaged weights into a fresh actor and save it alongside the others
avg_actor = Actor()
avg_actor.load_state_dict(avg_actor_state_dict)
torch.save(avg_actor.state_dict(), 'actor_avg_last50.pth')

print("Training complete. Models saved.")

  deprecation(
  deprecation(


Ep 50 | Reward: 15.0 | Cost: 5.35 | Actor Loss: 2.542
Ep 100 | Reward: 38.0 | Cost: 16.01 | Actor Loss: 6.159
Ep 150 | Reward: 58.0 | Cost: 23.99 | Actor Loss: 6.403
Ep 200 | Reward: 59.0 | Cost: 31.69 | Actor Loss: 2.326
Ep 250 | Reward: 89.0 | Cost: 30.39 | Actor Loss: 8.328
Ep 300 | Reward: 62.0 | Cost: 30.69 | Actor Loss: 0.165
Ep 350 | Reward: 132.0 | Cost: 55.64 | Actor Loss: 5.368
Ep 400 | Reward: 207.0 | Cost: 155.55 | Actor Loss: 10.301
Ep 450 | Reward: 172.0 | Cost: 178.40 | Actor Loss: 2.758
Ep 500 | Reward: 122.0 | Cost: 87.71 | Actor Loss: 2.368
Ep 550 | Reward: 157.0 | Cost: 181.60 | Actor Loss: -2.114
Ep 600 | Reward: 198.0 | Cost: 204.50 | Actor Loss: 8.053
Ep 650 | Reward: 149.0 | Cost: 106.70 | Actor Loss: 2.331
Ep 700 | Reward: 362.0 | Cost: 184.20 | Actor Loss: 9.488
Ep 750 | Reward: 290.0 | Cost: 154.38 | Actor Loss: 6.947
Ep 800 | Reward: 393.0 | Cost: 214.97 | Actor Loss: 6.375
Ep 850 | Reward: 191.0 | Cost: 92.51 | Actor Loss: 2.864
Ep 900 | Reward: 146.0 | Cost

In [None]:
######  Vanilla model
import time

import gym
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
# Compatibility shim: recreate the np.bool8 alias when this NumPy build does not expose it.
if not hasattr(np, 'bool8'):
    np.bool8 = np.bool_

# ======================
# Hyperparameters
# ======================
episodes = 1000
hidden_dim = 128
lr = 1e-3                    # Adam step size for the three networks
gamma = 0.99                 # discount factor
constraint_threshold = 100   # b in constraint C(s) ≤ b
dual_lr = 5e-3               # learning rate for λ

# Per-episode totals for the vanilla run
data_van = {'cost': [], 'reward': []}

# ======================
# Environment
# ======================
env = gym.make("CartPole-v1")
state_dim, action_dim = env.observation_space.shape[0], env.action_space.n

# ======================
# 🧠 Neural Networks
# ======================
class Actor(nn.Module):
    """Policy network: state vector in, softmax action probabilities out."""

    def __init__(self):
        super().__init__()
        hidden_layer = nn.Linear(state_dim, hidden_dim)
        output_layer = nn.Linear(hidden_dim, action_dim)
        # Stored under `policy`, so parameter names stay policy.0.* and policy.2.*
        self.policy = nn.Sequential(hidden_layer, nn.ReLU(), output_layer, nn.Softmax(dim=-1))

    def forward(self, state):
        """Return the action probabilities for `state`."""
        action_probs = self.policy(state)
        return action_probs

class Critic(nn.Module):
    """Scalar value network: Linear(state_dim -> hidden_dim) -> ReLU -> Linear(hidden_dim -> 1)."""

    def __init__(self):
        super().__init__()
        layers = [
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        ]
        self.value = nn.Sequential(*layers)

    def forward(self, state):
        """Return the value estimate for `state` (last dimension has size 1)."""
        estimate = self.value(state)
        return estimate

# ======================
# 📦 Initialize
# ======================
# Networks are created in the order actor, reward critic, cost critic
actor, value_critic, cost_critic = Actor(), Critic(), Critic()

# One Adam optimizer per network, all with the same learning rate
opt_actor, opt_value, opt_cost = (
    optim.Adam(network.parameters(), lr=lr)
    for network in (actor, value_critic, cost_critic)
)

# Dual variable (Lagrange multiplier): a plain tensor with no gradient tracking, starting at 10
lambda_dual = torch.tensor(10.0)

# ======================
# 🚀 Training Loop
# ======================
def discounted(x):
    """Return the discounted return-to-go for every step of `x` as a float tensor.

    Defined once ahead of the loop (it used to be redefined every episode) and
    built with append + reverse so the work is O(T) rather than O(T^2).
    """
    ret, g = [], 0
    for r in reversed(x):
        g = r + gamma * g
        ret.append(g)
    ret.reverse()
    return torch.FloatTensor(ret)


start_time = time.time()
for ep in range(episodes):
    # Uses the classic gym API as written here: reset() returns only the observation
    # and step() returns a 4-tuple (obs, reward, done, info).
    state = torch.FloatTensor(env.reset())

    log_probs, rewards, costs = [], [], []
    values, cost_values = [], []

    done = False
    total_reward = 0.0
    total_cost = 0.0

    # ---- Roll out one full episode with the current policy ----
    while not done:
        probs = actor(state)
        dist = Categorical(probs)
        action = dist.sample()

        next_state, reward, done, _ = env.step(action.item())
        next_state = torch.FloatTensor(next_state)

        # Constraint cost: cart's distance from center
        cost = abs(state[0].item())

        # Store
        log_probs.append(dist.log_prob(action))
        rewards.append(reward)
        costs.append(cost)
        values.append(value_critic(state))
        cost_values.append(cost_critic(state))

        total_reward += reward
        total_cost += cost

        state = next_state

    # ---- Discounted returns and critic estimates ----
    R = discounted(rewards)
    C = discounted(costs)
    # cat already yields shape (T,); the previous .squeeze() collapsed one-step
    # episodes to a 0-d tensor that no longer matched the returns in mse_loss.
    V = torch.cat(values)
    CV = torch.cat(cost_values)
    log_probs = torch.stack(log_probs)

    # ---- Advantages ----
    A_r = R - V.detach()
    A_c = C - CV.detach()

    # ---- Policy loss (primal-dual) ----
    # The original referenced an undefined name `b` here (NameError); the
    # hyperparameter comment identifies `constraint_threshold` as that b.
    # NOTE(review): subtracting the threshold adds the same constant to every
    # step's weight — confirm this offset is intended for the actor update.
    actor_loss = -(log_probs * (A_r - lambda_dual * (A_c - constraint_threshold))).mean()

    # ---- Critic losses ----
    value_loss = nn.functional.mse_loss(V, R)
    cost_loss = nn.functional.mse_loss(CV, C)

    # ---- Optimize ----
    opt_actor.zero_grad()
    actor_loss.backward()
    opt_actor.step()

    opt_value.zero_grad()
    value_loss.backward()
    opt_value.step()

    opt_cost.zero_grad()
    cost_loss.backward()
    opt_cost.step()

    # ---- Update λ (dual ascent), kept non-negative ----
    constraint_violation = (C.mean().item() - constraint_threshold)
    lambda_dual += dual_lr * constraint_violation
    lambda_dual = torch.clamp(lambda_dual, min=0.0)

    # Record per-episode totals so the optional export below has data to write
    data_van['cost'].append(total_cost)
    data_van['reward'].append(total_reward)

    # ---- Logging ----
    if (ep + 1) % 10 == 0:
        print(f"[Ep {ep+1}] Reward: {total_reward:.1f}, Cost: {total_cost:.2f}, λ: {lambda_dual.item():.3f}, Actor Loss: {actor_loss.item():.3f}")

# Release the environment, as the other training cells do
env.close()
print("Time taken", time.time() - start_time)

# Optional export (disabled). Kept as comments: a bare string literal in the
# last position of a cell would be echoed as the cell's output.
# df = pd.DataFrame(data_van)
# df.to_excel('tvf_and_tcf_data_vanilla.xlsx')
# # Save the actor and critic models
# torch.save(actor.state_dict(), 'actor_vanilla.pth')
# torch.save(value_critic.state_dict(), 'reward_critic_vanilla.pth')
# torch.save(cost_critic.state_dict(), 'cost_critic_vanilla.pth')

  deprecation(
  deprecation(


NameError: name 'b' is not defined

In [None]:
!wget https://github.com/PKU-Alignment/safety-gymnasium/archive/refs/heads/main.zip
!unzip main.zip
%cd safety-gymnasium-main
!pip install -e .

--2025-07-10 18:08:11--  https://github.com/PKU-Alignment/safety-gymnasium/archive/refs/heads/main.zip
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://codeload.github.com/PKU-Alignment/safety-gymnasium/zip/refs/heads/main [following]
--2025-07-10 18:08:11--  https://codeload.github.com/PKU-Alignment/safety-gymnasium/zip/refs/heads/main
Resolving codeload.github.com (codeload.github.com)... 140.82.114.10
Connecting to codeload.github.com (codeload.github.com)|140.82.114.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘main.zip’

main.zip                [           <=>      ] 551.53M  9.22MB/s    in 28s     

2025-07-10 18:08:39 (19.9 MB/s) - ‘main.zip’ saved [578325046]

Archive:  main.zip
bfa1c945aafcd65a6b568f95d63ed9b2670046ba
   creating: safety-gymnasium-main/
  inflating: sa

In [None]:
!pip install mujoco

Collecting mujoco
  Downloading mujoco-3.3.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (44 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting glfw (from mujoco)
  Downloading glfw-2.9.0-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38.p39.p310.p311.p312.p313-none-manylinux_2_28_x86_64.whl.metadata (5.4 kB)
Downloading mujoco-3.3.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m41.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading glfw-2.9.0-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38.p39.p310.p311.p312.p313-none-manylinux_2_28_x86_64.whl (243 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m243.5/243.5 kB[0m [31m13.4 MB/s[0

In [None]:
# NOTE(review): this runs setup.py without any command argument, and the recorded
# run failed with "ModuleNotFoundError: No module named 'setuptools'". The editable
# pip install of safety-gymnasium in the earlier cell presumably already covers the
# install step — confirm whether this cell is still needed.
!python setup.py

Traceback (most recent call last):
  File "/content/safety-gymnasium-main/setup.py", line 21, in <module>
    from setuptools import setup
ModuleNotFoundError: No module named 'setuptools'


In [None]:
# (Disabled) Install Python 3.8 via apt:
# # Install Python 3.8
# !sudo apt-get update -y
# !sudo apt-get install python3.8 python3.8-dev python3.8-distutils python3.8-gdbm python3.8-venv -y

# Register /usr/bin/python3.8 as an alternative for python3 (priority 1), then choose one.
# NOTE(review): `--config` prompts for a selection; in the recorded run option 1
# (python3.10) was chosen rather than 3.8 — confirm which interpreter is intended.
# NOTE(review): this changes what `!python3` resolves to in shell commands; whether it
# affects the interpreter running the notebook's Python cells is not shown here — verify.
!sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1
!sudo update-alternatives --config python3

# Verify which version `python3` now resolves to
!python3 --version

There are 3 choices for the alternative python3 (providing /usr/bin/python3).

  Selection    Path                 Priority   Status
------------------------------------------------------------
  0            /usr/bin/python3.11   2         auto mode
  1            /usr/bin/python3.10   1         manual mode
  2            /usr/bin/python3.11   2         manual mode
* 3            /usr/bin/python3.8    1         manual mode

Press <enter> to keep the current choice[*], or type selection number: 1
update-alternatives: using /usr/bin/python3.10 to provide /usr/bin/python3 (python3) in manual mode
Python 3.10.12


In [None]:
# Scratch check of the built-in max(); the result is not used by any other cell shown here.
max(2,3)

3

In [None]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import numpy as np
import pandas as pd
from copy import deepcopy

# Compatibility shim: recreate the np.bool8 alias when this NumPy build does not expose it.
if not hasattr(np, 'bool8'):
    np.bool8 = np.bool_

# === Hyperparameters ===
episodes = 1000
hidden_dim = 256
learning_rate = 1e-3
gamma = 0.99
lambda_fixed = 20.0   # multiplier used in the per-step advantage selection rule
b = 200.0             # cost threshold used by that rule and by best-actor tracking
perturb_eps = 1.0     # upper bound of the uniform noise added to observations

# === Environment ===
env = gym.make("CartPole-v1")
state_dim, action_dim = env.observation_space.shape[0], env.action_space.n

# === Actor and Critic Networks ===
class Actor(nn.Module):
    """Policy network: Linear -> ReLU -> Linear -> Softmax over the last dimension."""

    def __init__(self):
        super().__init__()
        hidden_layer = nn.Linear(state_dim, hidden_dim)
        output_layer = nn.Linear(hidden_dim, action_dim)
        self.model = nn.Sequential(hidden_layer, nn.ReLU(), output_layer, nn.Softmax(dim=-1))

    def forward(self, state):
        """Return the action probabilities for `state`."""
        action_probs = self.model(state)
        return action_probs

class ValueCritic(nn.Module):
    """Scalar value network shared by the reward and cost critics."""

    def __init__(self):
        super().__init__()
        layers = [
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, state):
        """Return the value estimate for `state` (last dimension has size 1)."""
        estimate = self.model(state)
        return estimate

# === Initialize Networks and Optimizers ===
# Networks are created in the order actor, reward critic, cost critic
actor, reward_critic, cost_critic = Actor(), ValueCritic(), ValueCritic()

# One Adam optimizer per network, all with the same learning rate
actor_optim, reward_optim, cost_optim = (
    optim.Adam(network.parameters(), lr=learning_rate)
    for network in (actor, reward_critic, cost_critic)
)

# === Utilities ===
def add_uniform_noise(state, eps=0.05):
    """Perturb `state` with per-element noise sampled from U[0, eps).

    The noise is non-negative and matches the shape of `state`; the result is
    a new array and the input is left untouched.
    """
    return state + np.random.uniform(low=0, high=eps, size=state.shape)

def discount(values, gamma):
    """Compute the discounted return-to-go for every position of `values`.

    Args:
        values: sequence of per-step numbers (rewards or costs), earliest first.
        gamma: discount factor applied once per step.

    Returns:
        A float32 tensor `out` of the same length, where
        out[t] = values[t] + gamma * out[t + 1] (and out[-1] = values[-1]).
        An empty input yields an empty tensor.
    """
    result = []
    G = 0
    # Accumulate back-to-front, then flip once: O(n) rather than the O(n^2)
    # cost of inserting at the head of the list on every step.
    for v in reversed(values):
        G = v + gamma * G
        result.append(G)
    result.reverse()
    return torch.tensor(result, dtype=torch.float32)

# === Tracking ===
dataF = {'cost': [], 'reward': []}
last_50_actor_params = []  # rolling window of the most recent actor state_dicts (max 50)

# Highest episode reward seen so far among episodes whose total cost stayed below b,
# together with a copy of the actor weights at the end of that episode.
best_reward = float('-inf')
best_actor_state_dict = None

# === Training Loop ===
for ep in range(episodes):
    # Uses the classic gym API as written here: reset() returns only the observation
    # and step() returns a 4-tuple (obs, reward, done, info).
    # The agent only ever sees the noise-perturbed observation.
    state = env.reset()
    state = add_uniform_noise(np.array(state), perturb_eps)
    state = torch.FloatTensor(state)

    log_probs = []
    rewards = []
    costs = []
    reward_values = []
    cost_values = []

    total_reward = 0
    total_cost = 0
    done = False

    # ---- Roll out one full episode with the current policy ----
    while not done:
        probs = actor(state)
        dist = Categorical(probs)
        action = dist.sample()

        next_state, reward, done,_ = env.step(action.item())
        next_state = add_uniform_noise(np.array(next_state), perturb_eps)
        next_state = torch.FloatTensor(next_state)

        # Cost is the magnitude of the (perturbed) first state component
        cost = abs(state[0].item())

        log_probs.append(dist.log_prob(action))
        rewards.append(reward)
        costs.append(cost)
        reward_values.append(reward_critic(state))
        cost_values.append(cost_critic(state))

        total_reward += reward
        total_cost += cost
        state = next_state

    # Discounted returns
    reward_returns = discount(rewards, gamma)
    cost_returns = discount(costs, gamma)

    # cat already yields shape (T,); the previous .squeeze() collapsed one-step
    # episodes to a 0-d tensor that no longer matched the returns in mse_loss.
    reward_values = torch.cat(reward_values)
    cost_values = torch.cat(cost_values)
    log_probs = torch.stack(log_probs)

    adv_r = reward_returns - reward_values.detach()
    adv_c = cost_returns - cost_values.detach()

    # Per-step rule on the returns: where R > lambda * (C - b) follow the reward
    # advantage, otherwise follow the negated cost advantage. The comparison is
    # done in float64 to match the original Python-float (.item()) comparison.
    use_reward = reward_returns.double() > lambda_fixed * (cost_returns.double() - b)
    chosen_adv = torch.where(use_reward, adv_r, -adv_c)

    # Losses
    actor_loss = -(log_probs * chosen_adv).mean()
    reward_loss = nn.functional.mse_loss(reward_values, reward_returns)
    cost_loss = nn.functional.mse_loss(cost_values, cost_returns)

    # Backprop
    actor_optim.zero_grad()
    actor_loss.backward()
    actor_optim.step()

    reward_optim.zero_grad()
    reward_loss.backward()
    reward_optim.step()

    cost_optim.zero_grad()
    cost_loss.backward()
    cost_optim.step()

    # Logging
    dataF['cost'].append(total_cost)
    dataF['reward'].append(total_reward)

    # Store for averaging
    if len(last_50_actor_params) >= 50:
        last_50_actor_params.pop(0)
    last_50_actor_params.append(deepcopy(actor.state_dict()))

    # === Track Best Actor ===
    # Note: the snapshot is taken after this episode's update has been applied.
    if total_cost < b and total_reward > best_reward:
        best_reward = total_reward
        best_actor_state_dict = deepcopy(actor.state_dict())

    # Display
    if (ep + 1) % 50 == 0:
        print(f"Ep {ep+1} | Reward: {total_reward:.1f} | Cost: {total_cost:.2f} | Actor Loss: {actor_loss.item():.3f} | Best Reward (under cost): {best_reward:.1f}")

# === Save Averaged Actor (last 50 episodes) ===
# Element-wise mean of the stored actor snapshots: start from a copy of the
# oldest one, add the others in order, then divide by the snapshot count.
num_snapshots = len(last_50_actor_params)
avg_actor_state_dict = deepcopy(last_50_actor_params[0])
for key in avg_actor_state_dict:
    for later_snapshot in last_50_actor_params[1:]:
        avg_actor_state_dict[key] += later_snapshot[key]
    avg_actor_state_dict[key] /= num_snapshots

avg_actor = Actor()
avg_actor.load_state_dict(avg_actor_state_dict)

# === Save All Models ===
env.close()
df = pd.DataFrame(dataF)
df.to_excel('tvf_and_tcf_data_with_uncertainity.xlsx')

torch.save(actor.state_dict(), 'actor.pth')                       # Final actor
torch.save(avg_actor.state_dict(), 'actor_avg_last50.pth')       # Averaged actor
torch.save(reward_critic.state_dict(), 'reward_critic.pth')
torch.save(cost_critic.state_dict(), 'cost_critic.pth')

# The training loop tracks the best under-budget actor but it was never written
# to disk. It stays None when no episode met the cost condition, so guard the save.
if best_actor_state_dict is not None:
    torch.save(best_actor_state_dict, 'actor_best.pth')            # Best actor under the cost budget

  deprecation(
  deprecation(


Ep 50 | Reward: 38.0 | Cost: 20.08 | Actor Loss: 8.786 | Best Reward (under cost): 78.0
Ep 100 | Reward: 22.0 | Cost: 10.30 | Actor Loss: 1.177 | Best Reward (under cost): 140.0
Ep 150 | Reward: 91.0 | Cost: 55.34 | Actor Loss: 9.554 | Best Reward (under cost): 140.0
Ep 200 | Reward: 55.0 | Cost: 25.28 | Actor Loss: 0.984 | Best Reward (under cost): 151.0
Ep 250 | Reward: 28.0 | Cost: 12.24 | Actor Loss: -4.704 | Best Reward (under cost): 182.0
Ep 300 | Reward: 131.0 | Cost: 74.82 | Actor Loss: 8.327 | Best Reward (under cost): 305.0
Ep 350 | Reward: 180.0 | Cost: 132.37 | Actor Loss: 11.242 | Best Reward (under cost): 305.0
Ep 400 | Reward: 253.0 | Cost: 325.88 | Actor Loss: 9.158 | Best Reward (under cost): 305.0
Ep 450 | Reward: 69.0 | Cost: 26.86 | Actor Loss: -2.054 | Best Reward (under cost): 305.0
Ep 500 | Reward: 57.0 | Cost: 28.74 | Actor Loss: -8.086 | Best Reward (under cost): 305.0
Ep 550 | Reward: 36.0 | Cost: 15.23 | Actor Loss: -13.062 | Best Reward (under cost): 311.0
E