In [None]:
#!pip install torch torchvision torchaudio gymnasium pandas numpy matplotlib tqdm mpmath==1.2.1

In [None]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import matplotlib.pyplot as plt


In [None]:
# Read the raw training table from disk.
data = pd.read_csv("train_data.csv")

# The timestamp column, when present, is not used as a numeric feature.
data = data.drop(columns=['timestamp'], errors='ignore')

# Force every column to a numeric dtype: unparseable values become NaN,
# and all NaNs are then replaced by 0.
data = data.apply(pd.to_numeric, errors='coerce')
data = data.fillna(0)

print("Shape:", data.shape)
data.head()


In [None]:
class TrafficEnv(gym.Env):
    """Replay-style Gymnasium environment over a pre-recorded traffic table.

    Every step advances one row in ``data``. The observation is the current
    row without its last column, and the reward is derived from the row's
    ``waiting_time``, ``queue_length`` and ``emergency_detected`` values
    (each treated as 0 when the column is absent).

    NOTE(review): the action never influences which row comes next; it only
    affects the emergency bonus in ``compute_reward``.
    """

    # Episodes start at least this many rows before the end of the table
    # whenever the table is long enough to allow it.
    MIN_EPISODE_ROWS = 50

    def __init__(self, data):
        """Store the table and declare the observation/action spaces.

        Args:
            data: pandas DataFrame of numeric columns; the last column is
                excluded from observations.

        Raises:
            ValueError: if ``data`` has fewer than 2 rows, because ``step``
                always reads the row after the starting row.
        """
        super().__init__()
        if len(data) < 2:
            raise ValueError("TrafficEnv needs at least 2 rows of data")
        self.data = data.reset_index(drop=True)
        self.max_index = len(data) - 1
        self.current_step = 0

        # Define spaces
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf,
                                            shape=(len(data.columns)-1,), dtype=np.float32)
        self.action_space = spaces.Discrete(4)  # 4 signal phases

    def reset(self, seed=None, options=None):
        """Start a new episode at a random row and return ``(obs, info)``."""
        super().reset(seed=seed)
        # Draw from the environment's own generator (seeded by the call
        # above) so reset(seed=...) is reproducible; the global np.random
        # state is unaffected by `seed`.
        # max(1, ...) keeps the range non-empty for tables with <= 51 rows.
        upper = max(1, self.max_index - self.MIN_EPISODE_ROWS)
        self.current_step = int(self.np_random.integers(0, upper))
        obs = self.data.iloc[self.current_step, :-1].to_numpy(dtype=np.float32)
        info = {}
        return obs, info

    def step(self, action):
        """Advance one row and return ``(obs, reward, terminated, truncated, info)``.

        ``terminated`` becomes True once the last row of the table is
        reached; ``truncated`` is always False.
        """
        self.current_step += 1
        done = self.current_step >= self.max_index

        row = self.data.iloc[self.current_step]
        # Series.get already falls back to the default for a missing label.
        wait_time = row.get('waiting_time', 0)
        queue_len = row.get('queue_length', 0)
        emergency = row.get('emergency_detected', 0)

        reward = self.compute_reward(wait_time, queue_len, emergency, action)
        next_obs = row.iloc[:-1].to_numpy(dtype=np.float32)
        truncated = False
        info = {}

        return next_obs, reward, done, truncated, info

    def compute_reward(self, wait_time, queue_len, emergency_detected, action):
        """Penalise waiting time and queue length; reward action 0 during an emergency."""
        reward = - (0.7 * wait_time + 0.3 * queue_len)
        if emergency_detected and action == 0:  # example: NS-green helps emergency
            reward += 20
        return reward

    def render(self):
        """Rendering is not implemented."""
        pass


In [None]:
'''def compute_reward(wait_time, queue_len, emergency_detected):

    reward = - (0.7 * wait_time + 0.3 * queue_len)
    if emergency_detected:
        reward += 20  # positive reward for prioritizing emergency
    return reward'''


In [None]:
class PPOAgent(nn.Module):
    """Actor-critic network pair with its own Adam optimizer.

    The actor maps a state to a probability vector over ``action_dim``
    discrete actions; the critic maps a state to a scalar value estimate.
    The optimizer is a plain attribute, so ``state_dict()`` checkpoints
    contain the network weights only, not the optimizer state.
    """

    def __init__(self, state_dim, action_dim, lr=3e-4):
        """Build both networks and one Adam optimizer over all parameters.

        Args:
            state_dim: Length of the observation vector.
            action_dim: Number of discrete actions.
            lr: Learning rate for Adam.
        """
        super().__init__()
        self.actor = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, action_dim),
            nn.Softmax(dim=-1)
        )
        self.critic = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
        self.optimizer = optim.Adam(self.parameters(), lr=lr)

    def act(self, state):
        """Sample an action for ``state`` from the current policy.

        Returns:
            tuple: ``(action, log_prob)`` where ``action`` is a Python int
            and ``log_prob`` is a tensor detached from the autograd graph.
        """
        state = torch.as_tensor(state, dtype=torch.float32)
        # The returned log-prob is the "old policy" term of the PPO ratio and
        # must be a constant. If it stays attached to the graph, the ratio
        # exp(new - old) has zero gradient when both terms come from the same
        # parameters, and the actor never learns from the surrogate loss.
        with torch.no_grad():
            probs = self.actor(state)
            dist = torch.distributions.Categorical(probs)
            action = dist.sample()
            log_prob = dist.log_prob(action)
        return action.item(), log_prob

    def evaluate(self, state, action):
        """Score ``action`` in ``state`` under the current policy, with gradients.

        Returns:
            tuple: ``(log_prob, value, entropy)`` - the log-probability of
            ``action``, the critic's output for ``state``, and the entropy
            of the action distribution.
        """
        probs = self.actor(state)
        dist = torch.distributions.Categorical(probs)
        log_prob = dist.log_prob(action)
        entropy = dist.entropy()
        return log_prob, self.critic(state), entropy


In [None]:
# Build the environment over the loaded table and a fresh (untrained) agent.
# state_dim is the column count minus one, matching the observation shape
# declared by TrafficEnv.
env = TrafficEnv(data)
agent = PPOAgent(state_dim=len(data.columns)-1, action_dim=4)


import torch
import json
import os
from tqdm import tqdm

def train_ppo(
    episodes=200,
    gamma=0.99,
    clip_epsilon=0.2,
    save_path="ppo_traffic_model_latest.pt",
    rewards_path="reward_log.json"
):
    """Train the notebook-global ``agent`` on the notebook-global ``env``.

    One gradient step is taken after every environment transition, using the
    one-step TD error as the advantage in a clipped-surrogate loss plus a
    value loss and an entropy bonus.

    Args:
        episodes: Number of additional episodes to run.
        gamma: Discount factor of the TD target.
        clip_epsilon: Clipping range of the probability ratio.
        save_path: File that receives ``agent.state_dict()``.
        rewards_path: JSON file with per-episode totals. If it exists, its
            contents are loaded first and then extended.

    Returns:
        list: The COMPLETE reward history - entries loaded from
        ``rewards_path`` followed by one total per episode run here.

    Note:
        Only the reward log is resumed; the weights are whatever the global
        ``agent`` currently holds.
    """
    reward_history = []

    # Resume from saved rewards if file exists
    if os.path.exists(rewards_path):
        with open(rewards_path, "r") as f:
            reward_history = json.load(f)
        print(f"Loaded {len(reward_history)} previous rewards from '{rewards_path}'")

    def _save_progress():
        # Persist the current weights and the full reward log.
        torch.save(agent.state_dict(), save_path)
        with open(rewards_path, "w") as f:
            json.dump(reward_history, f)

    # Continue the episode numbering where the loaded log left off.
    first_episode = len(reward_history)
    for episode in tqdm(range(first_episode, first_episode + episodes)):
        state, _ = env.reset()
        done = False
        total_reward = 0

        while not done:
            action, log_prob = agent.act(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            # PPO update
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            next_state_tensor = torch.FloatTensor(next_state).unsqueeze(0)
            reward_tensor = torch.tensor(reward, dtype=torch.float32)

            value = agent.critic(state_tensor)
            # The TD target is a constant: no gradient flows through V(s'),
            # and a terminal state contributes no future value.
            with torch.no_grad():
                next_value = agent.critic(next_state_tensor)
            bootstrap = 0.0 if terminated else 1.0
            advantage = reward_tensor + gamma * bootstrap * next_value - value

            new_log_prob, _, entropy = agent.evaluate(
                state_tensor, torch.tensor([action], dtype=torch.long)
            )

            # The sampling-time log-prob must be a constant. Left attached to
            # the graph, new_log_prob - log_prob has zero gradient (same
            # parameters, same state) and the actor gets no learning signal.
            ratio = (new_log_prob - log_prob.detach()).exp()
            surr1 = ratio * advantage.detach()
            surr2 = torch.clamp(ratio, 1 - clip_epsilon, 1 + clip_epsilon) * advantage.detach()
            loss = -torch.min(surr1, surr2) + 0.5 * advantage.pow(2) - 0.01 * entropy

            agent.optimizer.zero_grad()
            loss.mean().backward()
            agent.optimizer.step()

            # Plain float keeps the history JSON-serialisable.
            total_reward += float(reward)
            state = next_state

        reward_history.append(total_reward)
        print(f"Episode {episode + 1} | Total Reward: {total_reward:.2f}")

        # Save model & rewards every 10 episodes
        if (episode + 1) % 10 == 0:
            _save_progress()
            print(f"💾 Progress saved — {episode + 1} episodes complete")

    print("Training complete!")
    _save_progress()

    return reward_history


# Run 500 training episodes; `rewards` receives the list returned by train_ppo.
rewards = train_ppo(episodes=500)


In [None]:
# Snapshot the current agent weights under a milestone-specific filename.
torch.save(agent.state_dict(), "ppo_traffic_model_500.pt")
print("Model saved successfully!")


In [None]:
# Status message only; this cell does not write any files itself.
print("Models saved: ppo_traffic_model_latest.pt, reward_log.json")

In [None]:
# Plot the per-episode totals held in `rewards`.
plt.plot(rewards)
plt.xlabel("Episode")
plt.ylabel("Total Reward")
plt.title("PPO Training Progress for Traffic Signal Control")
plt.show()


In [None]:
import pandas as pd
# Plot a rolling mean of `rewards` over a 20-episode window.
plt.plot(pd.Series(rewards).rolling(20).mean(), color='blue', linewidth=2)
plt.title("Smoothed Rewards (20-episode Moving Average)")
plt.xlabel("Episode")
plt.ylabel("Total Reward (Smoothed)")
plt.show()


In [None]:
# NOTE(review): stable_baselines3 is installed here but never imported in the
# visible notebook - confirm whether this install is still needed.
!pip install stable_baselines3

# Re-initialize the PPOAgent model structure
loaded_agent = PPOAgent(state_dim=len(data.columns)-1, action_dim=4)

# Load the state dictionary into the agent
loaded_agent.load_state_dict(torch.load("ppo_traffic_model_500.pt"))
loaded_agent.eval() # Set the model to evaluation mode

# NOTE(review): `loaded_agent` is a separate object from the global `agent`
# that train_ppo uses; no visible cell reads it afterwards.
print("Model loaded successfully!")

In [None]:
# Continue training the current global `agent` for another 500 episodes.
print("Continuing training for 500 more episodes...")
new_rewards = train_ppo(episodes=500)
# train_ppo() reloads the saved reward log and returns the COMPLETE history
# (earlier + new episodes). Extending `rewards` with it would duplicate every
# earlier episode, so the running list is replaced instead.
rewards = list(new_rewards)
print("Training complete.")

In [None]:
import pandas as pd
# Plot a rolling mean of `rewards` over a 20-episode window.
plt.plot(pd.Series(rewards).rolling(20).mean(), color='blue', linewidth=2)
plt.title("Smoothed Rewards (20-episode Moving Average)")
plt.xlabel("Episode")
plt.ylabel("Total Reward (Smoothed)")
plt.show()

In [None]:
# Snapshot the current agent weights under a milestone-specific filename.
torch.save(agent.state_dict(), "ppo_traffic_model_1000.pt")


In [None]:
# Status message only; this cell does not write any files itself.
print("Models saved: ppo_traffic_model_latest.pt, reward_log.json, ppo_traffic_model_1000.pt")

In [None]:
# Plot the per-episode totals held in `rewards`.
plt.plot(rewards)
plt.xlabel("Episode")
plt.ylabel("Total Reward")
plt.title("PPO Training Progress for Traffic Signal Control (Extended)")
plt.show()

In [None]:
# NOTE(review): stable_baselines3 is installed here but never imported in the
# visible notebook - confirm whether this install is still needed.
!pip install stable_baselines3

# Re-initialize the PPOAgent model structure
loaded_agent = PPOAgent(state_dim=len(data.columns)-1, action_dim=4)

# Load the state dictionary into the agent
loaded_agent.load_state_dict(torch.load("ppo_traffic_model_1000.pt"))
loaded_agent.eval() # Set the model to evaluation mode

# NOTE(review): `loaded_agent` is a separate object from the global `agent`
# that train_ppo uses; no visible cell reads it afterwards.
print("Model loaded successfully!")

In [None]:
# Rebuild the environment and replace the global `agent` with a fresh,
# untrained one (weights are loaded into it in a later cell).
env = TrafficEnv(data)
agent = PPOAgent(state_dim=len(data.columns)-1, action_dim=4)


import torch
import json
import os
from tqdm import tqdm

def train_ppo(
    episodes=200,
    gamma=0.99,
    clip_epsilon=0.2,
    save_path="ppo_traffic_model_latest.pt",
    rewards_path="reward_log.json"
):
    """Train the notebook-global ``agent`` on the notebook-global ``env``.

    One gradient step is taken after every environment transition, using the
    one-step TD error as the advantage in a clipped-surrogate loss plus a
    value loss and an entropy bonus.

    Args:
        episodes: Number of additional episodes to run.
        gamma: Discount factor of the TD target.
        clip_epsilon: Clipping range of the probability ratio.
        save_path: File that receives ``agent.state_dict()``.
        rewards_path: JSON file with per-episode totals. If it exists, its
            contents are loaded first and then extended.

    Returns:
        list: The COMPLETE reward history - entries loaded from
        ``rewards_path`` followed by one total per episode run here.

    Note:
        Only the reward log is resumed; the weights are whatever the global
        ``agent`` currently holds.
    """
    reward_history = []

    # Resume from saved rewards if file exists
    if os.path.exists(rewards_path):
        with open(rewards_path, "r") as f:
            reward_history = json.load(f)
        print(f"✅ Loaded {len(reward_history)} previous rewards from '{rewards_path}'")

    def _save_progress():
        # Persist the current weights and the full reward log.
        torch.save(agent.state_dict(), save_path)
        with open(rewards_path, "w") as f:
            json.dump(reward_history, f)

    # Continue the episode numbering where the loaded log left off.
    first_episode = len(reward_history)
    for episode in tqdm(range(first_episode, first_episode + episodes)):
        state, _ = env.reset()
        done = False
        total_reward = 0

        while not done:
            action, log_prob = agent.act(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            # PPO update
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            next_state_tensor = torch.FloatTensor(next_state).unsqueeze(0)
            reward_tensor = torch.tensor(reward, dtype=torch.float32)

            value = agent.critic(state_tensor)
            # The TD target is a constant: no gradient flows through V(s'),
            # and a terminal state contributes no future value.
            with torch.no_grad():
                next_value = agent.critic(next_state_tensor)
            bootstrap = 0.0 if terminated else 1.0
            advantage = reward_tensor + gamma * bootstrap * next_value - value

            new_log_prob, _, entropy = agent.evaluate(
                state_tensor, torch.tensor([action], dtype=torch.long)
            )

            # The sampling-time log-prob must be a constant. Left attached to
            # the graph, new_log_prob - log_prob has zero gradient (same
            # parameters, same state) and the actor gets no learning signal.
            ratio = (new_log_prob - log_prob.detach()).exp()
            surr1 = ratio * advantage.detach()
            surr2 = torch.clamp(ratio, 1 - clip_epsilon, 1 + clip_epsilon) * advantage.detach()
            loss = -torch.min(surr1, surr2) + 0.5 * advantage.pow(2) - 0.01 * entropy

            agent.optimizer.zero_grad()
            loss.mean().backward()
            agent.optimizer.step()

            # Plain float keeps the history JSON-serialisable.
            total_reward += float(reward)
            state = next_state

        reward_history.append(total_reward)
        print(f"Episode {episode + 1} | Total Reward: {total_reward:.2f}")

        # Save model & rewards every 10 episodes
        if (episode + 1) % 10 == 0:
            _save_progress()
            print(f"💾 Progress saved — {episode + 1} episodes complete")

    print("Training complete!")
    _save_progress()

    return reward_history

In [None]:
# Load the 1000-episode checkpoint into the global `agent` used by train_ppo.
agent.load_state_dict(torch.load("ppo_traffic_model_1000.pt"))


In [None]:
''' rewards: rewards+ new_rewards+rewards_more'''

In [None]:
# Continue training the current global `agent` for another 500 episodes.
print("Continuing training for 500 more episodes...")
new_rewards = train_ppo(episodes=500)
# train_ppo() reloads the saved reward log and returns the COMPLETE history
# (earlier + new episodes). Extending `rewards` with it would duplicate every
# earlier episode, so the running list is replaced instead.
rewards = list(new_rewards)
print("Training complete.")

In [None]:
# Snapshot the current agent weights under a milestone-specific filename.
torch.save(agent.state_dict(), "ppo_traffic_model_1500.pt")


In [None]:
# Status message only; this cell does not write any files itself.
print("Models saved: ppo_traffic_model_latest.pt, reward_log.json, ppo_traffic_model_1500.pt")

In [None]:
import pandas as pd
# Plot a rolling mean of `new_rewards` (the list returned by the latest
# train_ppo call) over a 20-episode window.
plt.plot(pd.Series(new_rewards).rolling(20).mean(), color='blue', linewidth=2)
plt.title("Smoothed Rewards (20-episode Moving Average)")
plt.xlabel("Episode")
plt.ylabel("Total Reward (Smoothed)")
plt.show()


In [None]:
#all_rewards=rewards+ new_rewards+rewards_more

In [None]:
# `all_rewards` is never assigned anywhere in the notebook (the cell that
# would build it is commented out), so plotting it raised NameError. The two
# plt.title calls also targeted the same axes, so only the last one showed.
# Plot the one series that exists and label it with its actual length.
plt.plot(rewards)
plt.title(f"Raw Rewards (Noisy)-{len(rewards)}")
plt.xlabel("Episode")
plt.ylabel("Total Reward")
plt.show()


In [None]:
import pandas as pd
# Plot a rolling mean of `rewards` over a 20-episode window.
plt.plot(pd.Series(rewards).rolling(20).mean(), color='blue', linewidth=2)
plt.title("Smoothed Rewards (20-Episode Moving Average)")


In [None]:
# NOTE(review): stable_baselines3 is installed here but never imported in the
# visible notebook - confirm whether this install is still needed.
!pip install stable_baselines3

# Re-initialize the PPOAgent model structure
loaded_agent = PPOAgent(state_dim=len(data.columns)-1, action_dim=4)

# Load the state dictionary into the agent
loaded_agent.load_state_dict(torch.load("ppo_traffic_model_1500.pt"))
loaded_agent.eval() # Set the model to evaluation mode

# NOTE(review): `loaded_agent` is a separate object from the global `agent`
# that train_ppo uses; no visible cell reads it afterwards.
print("Model loaded successfully!")

In [None]:
# Rebuild the environment and replace the global `agent` with a fresh,
# untrained one (weights are loaded into it in the next cell).
env = TrafficEnv(data)
agent = PPOAgent(state_dim=len(data.columns)-1, action_dim=4)



In [None]:
# Load the 1500-episode checkpoint into the global `agent` used by train_ppo.
agent.load_state_dict(torch.load("ppo_traffic_model_1500.pt"))
print("Loaded model from 1500-episode checkpoint")


In [None]:
# Show the learning rate of every optimizer parameter group.
print("Current learning rate:")
for g in agent.optimizer.param_groups:
    print(g['lr'])


In [None]:
# Lower the learning rate of the existing optimizer in place, for every
# parameter group.
for g in agent.optimizer.param_groups:
    g['lr'] = 1e-4  # or 5e-5 if training is still noisy

print("Learning rate successfully changed!")


In [None]:
# Show the learning rate of every optimizer parameter group to confirm the change.
print("Current learning rate:")
for g in agent.optimizer.param_groups:
    print(g['lr'])

In [None]:
import os # Add this line
import json # Add this line if not already imported

def train_ppo(
    episodes=200,
    gamma=0.99,
    clip_epsilon=0.2,
    save_path="ppo_traffic_model_latest.pt",
    rewards_path="reward_log.json"
):
    """Train the notebook-global ``agent`` on the notebook-global ``env``.

    One gradient step is taken after every environment transition, using the
    one-step TD error as the advantage in a clipped-surrogate loss plus a
    value loss and an entropy bonus.

    Args:
        episodes: Number of additional episodes to run.
        gamma: Discount factor of the TD target.
        clip_epsilon: Clipping range of the probability ratio.
        save_path: File that receives ``agent.state_dict()``.
        rewards_path: JSON file with per-episode totals. If it exists, its
            contents are loaded first and then extended.

    Returns:
        list: The COMPLETE reward history - entries loaded from
        ``rewards_path`` followed by one total per episode run here.

    Note:
        Only the reward log is resumed; the weights are whatever the global
        ``agent`` currently holds.
    """
    reward_history = []

    # Resume from saved rewards if file exists
    if os.path.exists(rewards_path):
        with open(rewards_path, "r") as f:
            reward_history = json.load(f)
        print(f"✅ Loaded {len(reward_history)} previous rewards from '{rewards_path}'")

    def _save_progress():
        # Persist the current weights and the full reward log.
        torch.save(agent.state_dict(), save_path)
        with open(rewards_path, "w") as f:
            json.dump(reward_history, f)

    # Continue the episode numbering where the loaded log left off.
    first_episode = len(reward_history)
    for episode in tqdm(range(first_episode, first_episode + episodes)):
        state, _ = env.reset()
        done = False
        total_reward = 0

        while not done:
            action, log_prob = agent.act(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            # PPO update
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            next_state_tensor = torch.FloatTensor(next_state).unsqueeze(0)
            reward_tensor = torch.tensor(reward, dtype=torch.float32)

            value = agent.critic(state_tensor)
            # The TD target is a constant: no gradient flows through V(s'),
            # and a terminal state contributes no future value.
            with torch.no_grad():
                next_value = agent.critic(next_state_tensor)
            bootstrap = 0.0 if terminated else 1.0
            advantage = reward_tensor + gamma * bootstrap * next_value - value

            new_log_prob, _, entropy = agent.evaluate(
                state_tensor, torch.tensor([action], dtype=torch.long)
            )

            # The sampling-time log-prob must be a constant. Left attached to
            # the graph, new_log_prob - log_prob has zero gradient (same
            # parameters, same state) and the actor gets no learning signal.
            ratio = (new_log_prob - log_prob.detach()).exp()
            surr1 = ratio * advantage.detach()
            surr2 = torch.clamp(ratio, 1 - clip_epsilon, 1 + clip_epsilon) * advantage.detach()
            loss = -torch.min(surr1, surr2) + 0.5 * advantage.pow(2) - 0.01 * entropy

            agent.optimizer.zero_grad()
            loss.mean().backward()
            agent.optimizer.step()

            # Plain float keeps the history JSON-serialisable.
            total_reward += float(reward)
            state = next_state

        reward_history.append(total_reward)
        print(f"Episode {episode + 1} | Total Reward: {total_reward:.2f}")

        # Save model & rewards every 10 episodes
        if (episode + 1) % 10 == 0:
            _save_progress()
            print(f"💾 Progress saved — {episode + 1} episodes complete")

    print("Training complete!")
    _save_progress()

    return reward_history

In [None]:
# Training with new Learning Rate:

In [None]:
# Continue training the current global `agent` for another 500 episodes with
# the learning rate set on its optimizer in the cells above.
print("Continuing training for 500 more episodes with DIFFERENT learning rate:")
newlr_rewards = train_ppo(episodes=500)
# train_ppo() reloads the saved reward log and returns the COMPLETE history
# (earlier + new episodes). Extending `rewards` with it would duplicate every
# earlier episode, so the running list is replaced instead.
rewards = list(newlr_rewards)
print("Training complete.")

In [None]:
# Snapshot the current agent weights under a milestone-specific filename.
torch.save(agent.state_dict(), "ppo_traffic_model_2000.pt")


In [None]:
# Status message only; this cell does not write any files itself.
print("Models saved: ppo_traffic_model_latest.pt, reward_log.json, ppo_traffic_model_2000.pt")

In [None]:
import pandas as pd
# Plot a rolling mean of `newlr_rewards` over a 20-episode window.
plt.plot(pd.Series(newlr_rewards).rolling(20).mean(), color='blue', linewidth=2)
plt.title("Smoothed Rewards (20-episode Moving Average)")
plt.xlabel("Episode")
plt.ylabel("Total Reward (Smoothed)")
plt.show()

In [None]:
# Continue training the current global `agent` for another 500 episodes.
print("Continuing training for 500 more episodes with DIFFERENT learning rate:")
newlr2_rewards = train_ppo(episodes=500)
# train_ppo() reloads the saved reward log and returns the COMPLETE history
# (earlier + new episodes). Extending `rewards` with it would duplicate every
# earlier episode, so the running list is replaced instead.
rewards = list(newlr2_rewards)
print("Training complete.")

In [None]:
# Snapshot the current agent weights under a milestone-specific filename.
torch.save(agent.state_dict(), "ppo_traffic_model_2500.pt")


In [None]:
# Status message only; this cell does not write any files itself.
print("Models saved: ppo_traffic_model_latest.pt, reward_log.json, ppo_traffic_model_2500.pt")

In [None]:
import pandas as pd
# Plot a rolling mean of `newlr2_rewards` over a 20-episode window.
plt.plot(pd.Series(newlr2_rewards).rolling(20).mean(), color='blue', linewidth=2)
plt.title("Smoothed Rewards (20-episode Moving Average)")
plt.xlabel("Episode")
plt.ylabel("Total Reward (Smoothed)")
plt.show()

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import json, os
from tqdm import tqdm


env = TrafficEnv(data)

# Load agent with same architecture
agent = PPOAgent(state_dim=len(data.columns)-1, action_dim=4)

# Resume from the 2500-episode checkpoint. The path is relative because that
# is how the checkpoint is written elsewhere in this notebook
# (torch.save(..., "ppo_traffic_model_2500.pt")). A hard-coded "/content/..."
# path only resolves when the working directory is /content; anywhere else
# the check fails and training silently starts from random weights.
model_path = "ppo_traffic_model_2500.pt"
if os.path.exists(model_path):
    agent.load_state_dict(torch.load(model_path))
    print("✔ Loaded existing trained model:", model_path)
else:
    print("⚠ No previous model found — training will start fresh.")


# 2Ô∏è‚É£ PPO Evaluation function (NO EXPLORATION)
def evaluate(agent, env, episodes=10):
    """Return the mean episode reward of the greedy policy (no exploration).

    Args:
        agent: Object whose ``actor`` maps a float32 state tensor to action
            probabilities.
        env: Environment with Gymnasium-style ``reset()`` and ``step()``.
        episodes: Number of evaluation episodes to average over.

    Returns:
        The mean of the per-episode reward totals.
    """
    episode_totals = []
    for _ in range(episodes):
        s, _ = env.reset()
        done = False
        total = 0
        while not done:
            # Take the most probable action directly from the actor.
            # agent.act() samples from the distribution, which would make
            # this a stochastic rollout and not an evaluation.
            with torch.no_grad():
                probs = agent.actor(torch.as_tensor(s, dtype=torch.float32))
            a = int(torch.argmax(probs, dim=-1).item())
            s, r, terminated, truncated, _ = env.step(a)
            done = terminated or truncated
            total += r
        episode_totals.append(total)
    return np.mean(episode_totals)


# 3Ô∏è‚É£ TRAINING FUNCTION (NEW)
def train_ppo_next_phase(
    episodes=1000,
    gamma=0.99,
    clip_epsilon=0.2,
    save_path="ppo_traffic_model_latest.pt",
    rewards_path="reward_log.json"
):
    """Continue training the global ``agent`` on the global ``env`` with LR decay.

    Same per-transition update as ``train_ppo``, plus: every 100 episodes the
    agent is scored with ``evaluate`` and that score drives a
    ``ReduceLROnPlateau`` scheduler (halve the LR after 5 evaluations without
    improvement, floor 1e-6). A new scheduler is created on every call.

    Args:
        episodes: Number of additional episodes to run.
        gamma: Discount factor of the TD target.
        clip_epsilon: Clipping range of the probability ratio.
        save_path: File that receives ``agent.state_dict()``.
        rewards_path: JSON file with per-episode totals. If it exists, its
            contents are loaded first and then extended.

    Returns:
        list: The COMPLETE reward history - entries loaded from
        ``rewards_path`` followed by one total per episode run here.
    """
    # Load previous reward log
    reward_history = []
    if os.path.exists(rewards_path):
        with open(rewards_path, "r") as f:
            reward_history = json.load(f)
        print(f"✔ Loaded {len(reward_history)} previous reward entries.")

    # LR scheduler — reduces LR when improvement plateaus
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        agent.optimizer,
        mode='max',
        factor=0.5,     # Halve the LR
        patience=5,     # Wait 5 eval cycles
        min_lr=1e-6
    )

    def _save_progress():
        # Persist the current weights and the full reward log.
        torch.save(agent.state_dict(), save_path)
        with open(rewards_path, "w") as f:
            json.dump(reward_history, f)

    print("\n Starting Next-Phase Training...\n")

    for ep in tqdm(range(episodes)):
        state, _ = env.reset()
        done = False
        total_reward = 0

        while not done:
            action, log_prob = agent.act(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            next_state_tensor = torch.FloatTensor(next_state).unsqueeze(0)
            reward_tensor = torch.tensor(reward, dtype=torch.float32)

            value = agent.critic(state_tensor)
            # The TD target is a constant: no gradient flows through V(s'),
            # and a terminal state contributes no future value.
            with torch.no_grad():
                next_value = agent.critic(next_state_tensor)
            bootstrap = 0.0 if terminated else 1.0
            advantage = reward_tensor + gamma * bootstrap * next_value - value

            new_log_prob, _, entropy = agent.evaluate(
                state_tensor, torch.tensor([action], dtype=torch.long)
            )

            # The sampling-time log-prob must be a constant. Left attached to
            # the graph, new_log_prob - log_prob has zero gradient (same
            # parameters, same state) and the actor gets no learning signal.
            ratio = (new_log_prob - log_prob.detach()).exp()
            surr1 = ratio * advantage.detach()
            surr2 = torch.clamp(ratio, 1 - clip_epsilon, 1 + clip_epsilon) * advantage.detach()
            loss = -torch.min(surr1, surr2) + 0.5 * advantage.pow(2) - 0.01 * entropy

            agent.optimizer.zero_grad()
            loss.mean().backward()
            agent.optimizer.step()

            # Plain float keeps the history JSON-serialisable.
            total_reward += float(reward)
            state = next_state

        reward_history.append(total_reward)
        print(f"Episode {len(reward_history)} | Reward: {total_reward:.2f}")

        # Every 100 episodes → evaluate & adjust LR
        if (ep + 1) % 100 == 0:
            eval_reward = evaluate(agent, env, episodes=5)
            print(f"🧪 Evaluation Reward after {len(reward_history)} eps: {eval_reward:.2f}")

            scheduler.step(eval_reward)

            # Save progress
            _save_progress()
            print("💾 Autosaved model + rewards.")

    # final save
    _save_progress()

    print("\n🎉 Training Phase Complete!")
    return reward_history


# 4Ô∏è‚É£ Run training
# Run 1000 more episodes; `new_rewards` receives the list returned by
# train_ppo_next_phase.
new_rewards = train_ppo_next_phase(episodes=1000)

In [None]:
# Snapshot the current agent weights under a milestone-specific filename.
torch.save(agent.state_dict(), "ppo_traffic_model_3500.pt")


In [None]:
# Status message only; this cell does not write any files itself.
print("Models saved: ppo_traffic_model_latest.pt, reward_log.json, ppo_traffic_model_3500.pt")

In [None]:
import pandas as pd
# Plot the rewards returned by the train_ppo_next_phase run above. The cell
# previously plotted `newlr2_rewards`, a leftover from the earlier 2500-episode
# run, so it re-drew the old curve and never showed the new results.
plt.plot(pd.Series(new_rewards).rolling(20).mean(), color='blue', linewidth=2)
plt.title("Smoothed Rewards (20-episode Moving Average)")
plt.xlabel("Episode")
plt.ylabel("Total Reward (Smoothed)")
plt.show()

In [None]:
# TUNING the model:

In [None]:
class TrafficEnv(gym.Env):
    """Replay-style Gymnasium environment over a pre-recorded traffic table.

    Every step advances one row in ``data``. The observation is the current
    row without its last column, and the reward is derived from the row's
    ``waiting_time``, ``queue_length`` and ``emergency_detected`` values
    (each treated as 0 when the column is absent). This version scales
    waiting time and queue length before combining them.

    NOTE(review): in this version neither the next row nor the reward depends
    on the action, so every policy earns the same return on a given episode.
    Confirm whether the ``action == 0`` condition on the emergency bonus was
    dropped on purpose.
    """

    # Episodes start at least this many rows before the end of the table
    # whenever the table is long enough to allow it.
    MIN_EPISODE_ROWS = 50

    def __init__(self, data):
        """Store the table and declare the observation/action spaces.

        Args:
            data: pandas DataFrame of numeric columns; the last column is
                excluded from observations.

        Raises:
            ValueError: if ``data`` has fewer than 2 rows, because ``step``
                always reads the row after the starting row.
        """
        super().__init__()
        if len(data) < 2:
            raise ValueError("TrafficEnv needs at least 2 rows of data")
        self.data = data.reset_index(drop=True)
        self.max_index = len(data) - 1
        self.current_step = 0

        # Define spaces
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf,
                                            shape=(len(data.columns)-1,), dtype=np.float32)
        self.action_space = spaces.Discrete(4)  # 4 signal phases

    def reset(self, seed=None, options=None):
        """Start a new episode at a random row and return ``(obs, info)``."""
        super().reset(seed=seed)
        # Draw from the environment's own generator (seeded by the call
        # above) so reset(seed=...) is reproducible; the global np.random
        # state is unaffected by `seed`.
        # max(1, ...) keeps the range non-empty for tables with <= 51 rows.
        upper = max(1, self.max_index - self.MIN_EPISODE_ROWS)
        self.current_step = int(self.np_random.integers(0, upper))
        obs = self.data.iloc[self.current_step, :-1].to_numpy(dtype=np.float32)
        info = {}
        return obs, info

    def step(self, action):
        """Advance one row and return ``(obs, reward, terminated, truncated, info)``.

        ``terminated`` becomes True once the last row of the table is
        reached; ``truncated`` is always False.
        """
        self.current_step += 1
        done = self.current_step >= self.max_index

        row = self.data.iloc[self.current_step]
        # Series.get already falls back to the default for a missing label.
        wait_time = row.get('waiting_time', 0)
        queue_len = row.get('queue_length', 0)
        emergency = row.get('emergency_detected', 0)

        reward = self.compute_reward(wait_time, queue_len, emergency, action)
        next_obs = row.iloc[:-1].to_numpy(dtype=np.float32)
        truncated = False
        info = {}

        return next_obs, reward, done, truncated, info

    def compute_reward(self, wait_time, queue_len, emergency_detected, action):
        """Penalise scaled waiting time and queue length; add 10 on an emergency row.

        ``action`` is accepted for interface compatibility but is not used.
        """
        wait_time = wait_time / 100
        queue_len = queue_len / 50
        reward = - (0.6 * wait_time + 0.4 * queue_len)
        if emergency_detected:
            reward += 10
        return reward

    def render(self):
        """Rendering is not implemented."""
        pass


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import json, os
from tqdm import tqdm


env = TrafficEnv(data)

# Load agent with same architecture
agent = PPOAgent(state_dim=len(data.columns)-1, action_dim=4)

# Resume from the 3500-episode checkpoint. The path is relative because that
# is how the checkpoint is written elsewhere in this notebook
# (torch.save(..., "ppo_traffic_model_3500.pt")). A hard-coded "/content/..."
# path only resolves when the working directory is /content; anywhere else
# the check fails and training silently starts from random weights.
model_path = "ppo_traffic_model_3500.pt"
if os.path.exists(model_path):
    agent.load_state_dict(torch.load(model_path))
    print("✔ Loaded existing trained model:", model_path)
else:
    print("⚠ No previous model found — training will start fresh.")


# 2Ô∏è‚É£ PPO Evaluation function (NO EXPLORATION)
def evaluate(agent, env, episodes=10):
    """Return the mean episode reward of the greedy policy (no exploration).

    Args:
        agent: Object whose ``actor`` maps a float32 state tensor to action
            probabilities.
        env: Environment with Gymnasium-style ``reset()`` and ``step()``.
        episodes: Number of evaluation episodes to average over.

    Returns:
        The mean of the per-episode reward totals.
    """
    episode_totals = []
    for _ in range(episodes):
        s, _ = env.reset()
        done = False
        total = 0
        while not done:
            # Take the most probable action directly from the actor.
            # agent.act() samples from the distribution, which would make
            # this a stochastic rollout and not an evaluation.
            with torch.no_grad():
                probs = agent.actor(torch.as_tensor(s, dtype=torch.float32))
            a = int(torch.argmax(probs, dim=-1).item())
            s, r, terminated, truncated, _ = env.step(a)
            done = terminated or truncated
            total += r
        episode_totals.append(total)
    return np.mean(episode_totals)


# 3Ô∏è‚É£ TRAINING FUNCTION (NEW)
def train_ppo_next_phase(
    episodes=1000,
    gamma=0.99,
    clip_epsilon=0.2,
    save_path="ppo_traffic_model_latest.pt",
    rewards_path="reward_log.json"
):
    """Continue training the global ``agent`` on the global ``env`` with LR decay.

    Same per-transition update as ``train_ppo``, plus: every 100 episodes the
    agent is scored with ``evaluate`` and that score drives a
    ``ReduceLROnPlateau`` scheduler (halve the LR after 5 evaluations without
    improvement, floor 1e-6). A new scheduler is created on every call.

    Args:
        episodes: Number of additional episodes to run.
        gamma: Discount factor of the TD target.
        clip_epsilon: Clipping range of the probability ratio.
        save_path: File that receives ``agent.state_dict()``.
        rewards_path: JSON file with per-episode totals. If it exists, its
            contents are loaded first and then extended.

    Returns:
        list: The COMPLETE reward history - entries loaded from
        ``rewards_path`` followed by one total per episode run here.
    """
    # Load previous reward log
    reward_history = []
    if os.path.exists(rewards_path):
        with open(rewards_path, "r") as f:
            reward_history = json.load(f)
        print(f"✔ Loaded {len(reward_history)} previous reward entries.")

    # LR scheduler — reduces LR when improvement plateaus
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        agent.optimizer,
        mode='max',
        factor=0.5,     # Halve the LR
        patience=5,     # Wait 5 eval cycles
        min_lr=1e-6
    )

    def _save_progress():
        # Persist the current weights and the full reward log.
        torch.save(agent.state_dict(), save_path)
        with open(rewards_path, "w") as f:
            json.dump(reward_history, f)

    print("\n Starting Next-Phase Training...\n")

    for ep in tqdm(range(episodes)):
        state, _ = env.reset()
        done = False
        total_reward = 0

        while not done:
            action, log_prob = agent.act(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            next_state_tensor = torch.FloatTensor(next_state).unsqueeze(0)
            reward_tensor = torch.tensor(reward, dtype=torch.float32)

            value = agent.critic(state_tensor)
            # The TD target is a constant: no gradient flows through V(s'),
            # and a terminal state contributes no future value.
            with torch.no_grad():
                next_value = agent.critic(next_state_tensor)
            bootstrap = 0.0 if terminated else 1.0
            advantage = reward_tensor + gamma * bootstrap * next_value - value

            new_log_prob, _, entropy = agent.evaluate(
                state_tensor, torch.tensor([action], dtype=torch.long)
            )

            # The sampling-time log-prob must be a constant. Left attached to
            # the graph, new_log_prob - log_prob has zero gradient (same
            # parameters, same state) and the actor gets no learning signal.
            ratio = (new_log_prob - log_prob.detach()).exp()
            surr1 = ratio * advantage.detach()
            surr2 = torch.clamp(ratio, 1 - clip_epsilon, 1 + clip_epsilon) * advantage.detach()
            loss = -torch.min(surr1, surr2) + 0.5 * advantage.pow(2) - 0.01 * entropy

            agent.optimizer.zero_grad()
            loss.mean().backward()
            agent.optimizer.step()

            # Plain float keeps the history JSON-serialisable.
            total_reward += float(reward)
            state = next_state

        reward_history.append(total_reward)
        print(f"Episode {len(reward_history)} | Reward: {total_reward:.2f}")

        # Every 100 episodes → evaluate & adjust LR
        if (ep + 1) % 100 == 0:
            eval_reward = evaluate(agent, env, episodes=5)
            print(f"🧪 Evaluation Reward after {len(reward_history)} eps: {eval_reward:.2f}")

            scheduler.step(eval_reward)

            # Save progress
            _save_progress()
            print("💾 Autosaved model + rewards.")

    # final save
    _save_progress()

    print("\n🎉 Training Phase Complete!")
    return reward_history


# 4Ô∏è‚É£ Run training
# Run 500 more episodes; `new_rewards` receives the list returned by
# train_ppo_next_phase.
new_rewards = train_ppo_next_phase(episodes=500)

In [None]:
# Snapshot the current agent weights under a milestone-specific filename.
torch.save(agent.state_dict(), "ppo_traffic_model_4000.pt")


In [None]:
# Status message only; this cell does not write any files itself.
print("Models saved: ppo_traffic_model_latest.pt, reward_log.json, ppo_traffic_model_4000.pt")

In [None]:
import pandas as pd
# Plot a rolling mean of `new_rewards` (the list returned by the latest
# train_ppo_next_phase call) over a 20-episode window.
plt.plot(pd.Series(new_rewards).rolling(20).mean(), color='blue', linewidth=2)
plt.title("Smoothed Rewards (20-episode Moving Average)")
plt.xlabel("Episode")
plt.ylabel("Total Reward (Smoothed)")
plt.show()

In [None]:
# Re-create the agent so its optimizer is built with a smaller learning rate.
agent = PPOAgent(
    state_dim=len(data.columns)-1,
    action_dim=4,
    lr=5e-5      # new, lower learning rate for fine-tuning
)

# Load the most recent milestone (4000 episodes). The cell previously loaded
# the 3500-episode file although it is described as the 4000-episode model
# and its result is saved as the 4500-episode checkpoint after 500 more
# episodes, which discarded the preceding 500 episodes of training.
model_path = "ppo_traffic_model_4000.pt"
agent.load_state_dict(torch.load(model_path))
print("✔ Loaded model with new LR for fine-tuning")


In [None]:
# Show the learning rate of every optimizer parameter group.
print("Current learning rate:")
for g in agent.optimizer.param_groups:
    print(g['lr'])

In [None]:
# Run 500 more episodes; `new_rewards` receives the list returned by
# train_ppo_next_phase.
new_rewards = train_ppo_next_phase(episodes=500)


In [None]:
# Snapshot the current agent weights under a milestone-specific filename.
torch.save(agent.state_dict(), "ppo_traffic_model_4500.pt")


In [None]:
# Status message only; this cell does not write any files itself.
print("Models saved: ppo_traffic_model_latest.pt, reward_log.json, ppo_traffic_model_4500.pt")