In [1]:
from SumoConnection import SumoConnection
from dotenv import load_dotenv
import os
import traci
import gymnasium as gym
from numpy import inf 
import numpy as np
from Sensors import getSumoSensors_full,len_sensors,len_optimized_sensors
import ray
from ray import tune
from ray.rllib.algorithms.ppo import PPOConfig
import SumoEnv
from config import *
from Utils import show_ppo_progress
from Utils_running import *



In [2]:
# Load the SUMO-related settings from keys.env into the process environment,
# then read them back as strings.
# NOTE: str() turns a missing variable (None) into the literal string "None".
load_dotenv("keys.env")
sumo_home, sumo_binary, path_cfg = (
    str(os.getenv(env_key))
    for env_key in ("sumo_home", "sumo_binary", "path_cfg")
)


In [3]:
# NOTE(review): this hard-coded absolute path replaces the sumo_binary value read
# from the environment — confirm the override is intentional.
sumo_binary = "C:/Program Files (x86)/Eclipse/Sumo/bin/sumo.exe"

# Command line used to launch SUMO with the configured scenario and verbose logging.
cmd = [
    sumo_binary,
    "-c", path_cfg,
    "--log", "sumo_log.txt",
    "--verbose", "true",
]
   

In [4]:
# Build a SumoConnection for the SUMO command line and call reset() on it.
# NOTE(review): presumably this (re)starts the SUMO/TraCI session before the
# environments are created — confirm in SumoConnection.
SumoConnection(cmd).reset()



In [5]:
# Select the traffic lights handed to the environments and the policies handed to
# the multi-agent algorithm configs.
# NOTE(review): the meaning of the argument (20) is defined by
# get_traffic_lights_policies_high_group — confirm there.
traffic_lights,policies=get_traffic_lights_policies_high_group(20)
# Alternatives tried: get_traffic_lights_policies_high_group(60), get_traffic_lights_policies_full(30,10)

In [6]:
# Run-length settings shared by the environments and the training loops.
n_epsiode = 5  # number of trainer.train() iterations executed by the training loops
max_steps = 30  # passed to the environments as max_steps
max_sumo_steps = max_steps  # passed to the environments as max_sumo_steps; alternative: n_epsiode * max_steps

In [7]:
# Divisors used to derive the batch sizes from max_steps.
n_batch_divide = 1
n_mini_batch_divide = 1
num_epochs = 4  # how many times each training batch is iterated over

# How much data is used for one training update.
train_batch_size = max_steps // n_batch_divide
# Minibatch size used during training; never smaller than 1.
minibatch_size = max(1, train_batch_size // n_mini_batch_divide)
# Author's note: a mismatch between the configured steps and the steps reported
# per episode is normal with RLlib when these values are small.

## reset=True means that the environment will be reset after each episode

In [8]:
# Development helper: re-import the project modules so that edits made to them
# take effect without restarting the kernel.
import SumoEnv  # Import your module
import importlib  # Import importlib
import rewards

# importlib.reload needs `SumoEnv` and `rewards` to still be bound to the module
# objects; rebinding either name to something else in another cell breaks
# re-running this cell.
# NOTE(review): if SumoEnv imports names from rewards, reloading rewards after
# SumoEnv may leave SumoEnv holding the old objects — confirm the order.
importlib.reload(SumoEnv)  # Reload the module after modification
importlib.reload(rewards)





In [9]:
from ray.tune.registry import register_env


def _env_creator(env_class_name):
    """Return an env-creator callable for the class of that name in the SumoEnv module."""
    def _create(config):
        # Resolve the class when the creator is invoked, not at registration time.
        env_class = getattr(SumoEnv, env_class_name)
        # NOTE(review): the positional True is presumably the reset flag — confirm in SumoEnv.
        return env_class(cmd, traffic_lights, True, max_steps=max_steps, max_sumo_steps=max_sumo_steps)
    return _create


# Each environment is registered under the same name as its class.
for _env_name in ("GroupedSumoEnv", "SumoEnv", "HighGroupedSumoEnv"):
    register_env(_env_name, _env_creator(_env_name))


In [18]:
# Histories filled by the training loop: one loss and one reward value per iteration.
loss, reward_ls = [], []

In [None]:
!pip install optuna


In [10]:
import optuna

# Initialize Ray
# ignore_reinit_error=True lets this cell be re-run while Ray is already running.
#ray.shutdown()
ray.init(ignore_reinit_error=True)

# Name of the registered environment that the Optuna objective trains on.
ENV_NAME = "SumoEnv"  # or "HighGroupedSumoEnv" or "GroupedSumoEnv"

def objective(trial):
    """Optuna objective: train PPO for one iteration and return its cumulative reward.

    Samples lr, gamma, lambda, clip_param and train_batch_size from ``trial``,
    builds a multi-agent PPO algorithm on ``ENV_NAME`` using the notebook-level
    ``policies``, runs a single ``train()`` call and scores the trial with the
    reward reported by ``show_ppo_progress``.

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        The cumulative reward returned by ``show_ppo_progress``; higher is better.
    """
    # No-op when Ray is already running.
    ray.init(ignore_reinit_error=True)

    print("Trial:", trial.number)
    # Define hyperparameter search space
    config = (
        PPOConfig()
        .environment(ENV_NAME)
        .framework("torch")  # or "tf"
        .multi_agent(
            policies=policies,  # Define policies
            # Every agent is mapped to the policy with the same id.
            policy_mapping_fn=lambda agent_id, episode, **kwargs: agent_id,
        )
        .training(
            # suggest_float replaces the deprecated suggest_loguniform / suggest_uniform.
            lr=trial.suggest_float("lr", 1e-5, 1e-2, log=True),
            gamma=trial.suggest_float("gamma", 0.8, 0.9999),
            lambda_=trial.suggest_float("lambda", 0.8, 1.0),
            clip_param=trial.suggest_float("clip_param", 0.1, 0.4),
            train_batch_size=trial.suggest_int("train_batch_size", 1000, 4000, step=500),
        )
        .env_runners(num_env_runners=0)  # Set the number of environment runners to 0
        .api_stack(enable_rl_module_and_learner=False, enable_env_runner_and_connector_v2=False)  # Use old API stack for now
    )

    # Train using RLlib
    algo = config.build_algo()
    try:
        results = algo.train()
        _, cum_reward = show_ppo_progress(0, results, show_each_agent=False)
    finally:
        # Release the algorithm's resources even when training fails.
        algo.stop()

    # Ray is deliberately left running between trials: restarting the runtime for
    # every trial is slow.
    # NOTE(review): environments registered with register_env before the first
    # ray.init may not survive a shutdown/re-init cycle — confirm.
    return cum_reward  # Higher reward is better


# Create an Optuna study
# direction="maximize": the objective returns a reward, so larger values are better.
study = optuna.create_study(direction="maximize")
# Run 10 trials; each trial calls objective() once.
study.optimize(objective, n_trials=10)
# Print best hyperparameters
print("Best Hyperparameters:", study.best_params)




















In [None]:
# Configure PPO
# Multi-agent PPO on the registered "SumoEnv" environment, using the batch
# settings (minibatch_size, train_batch_size, num_epochs) defined earlier in the
# notebook and fixed values for the remaining hyperparameters.
config = (
    PPOConfig()
    .environment("SumoEnv")
    .multi_agent(
        policies=policies,
        # Every agent is mapped to the policy with the same id.
        policy_mapping_fn=lambda agent_id, episode, **kwargs: agent_id,
    )
    .training(
        minibatch_size=minibatch_size, 
        train_batch_size=train_batch_size,
        num_epochs=num_epochs,
        lr=1e-4,
        gamma=0.95,
        lambda_=0.95,
        clip_param=0.2,
        vf_clip_param=10.0,
        entropy_coeff=0.01,
        kl_target=0.01,
        kl_coeff=0.5,
    )
    .env_runners(num_env_runners=0) # Set the number of environment runners to 0
    .api_stack(enable_rl_module_and_learner=False, enable_env_runner_and_connector_v2=False)  # Use old API stack for now
)

    
# Start Ray if it is not running yet (no-op otherwise).
ray.init(ignore_reinit_error=True)

# Build the PPO trainer
trainer = config.build_algo()


# Training loop: one trainer.train() call per iteration, n_epsiode iterations in total.
# The loss/reward of every iteration is appended to the notebook-level lists
# `loss` and `reward_ls`, which must already exist.
for i in range(n_epsiode):  # author's reminder: an episode ends once the step cap is reached (the note mentions 1000) — NOTE(review): confirm against max_steps
    result = trainer.train()
    # Mean reward per policy as reported under result['env_runners']; empty dict if absent.
    policy_rewards = result.get('env_runners', {}).get('policy_reward_mean', {})
    print(f"Episode {i+1}: {policy_rewards}")
    cum_loss, cum_reward=show_ppo_progress(i+1, result, show_each_agent=False)
    loss.append(cum_loss)
    reward_ls.append(cum_reward)
    

















In [22]:
# Shut down the Ray runtime once training is finished.
ray.shutdown()


In [20]:
import matplotlib.pyplot as plt

# Draw the recorded loss and reward histories on one explicit Axes.
fig, ax = plt.subplots()
ax.plot(loss, label="Loss")
ax.plot(reward_ls, label="Reward")
ax.set_xlabel("Epochs")
ax.set_ylabel("Values")
ax.set_title("Training Progress")
ax.legend()
plt.show()




In [15]:
# Close the TraCI connection and always shut Ray down afterwards, even when
# traci.close() raises (e.g. because no connection is active). The exception,
# if any, still propagates after Ray has been stopped.
try:
    traci.close()
finally:
    ray.shutdown()

In [None]:
#D3QN (Dueling DDQN)
from ray.rllib.algorithms.dqn import DQNConfig
from ray.rllib.utils.replay_buffers import MultiAgentPrioritizedReplayBuffer

# Configure DDQN
# Multi-agent DQN on the registered "SumoEnv" environment with double Q-learning
# and a dueling head enabled, reusing the notebook-level batch settings
# (minibatch_size, train_batch_size, num_epochs).
# NOTE: this rebinds the notebook-level name `config`.
config = (
    DQNConfig()
    .environment("SumoEnv")
    .multi_agent(
        policies=policies,
        # Every agent is mapped to the policy with the same id.
        policy_mapping_fn=lambda agent_id, episode, **kwargs: agent_id,
    )
    .training(
        gamma=0.99,
        lr=1e-3,
        minibatch_size=minibatch_size, 
        train_batch_size=train_batch_size,
        num_epochs=num_epochs,
        target_network_update_freq=8000,
        # Prioritized multi-agent replay buffer holding up to 50000 entries.
        replay_buffer_config={
            "type": MultiAgentPrioritizedReplayBuffer,
            "capacity": 50000,
        },
        # Enable double Q-learning
        double_q=True,
        dueling=True,
        hiddens=[256, 256],
        # Prioritized-replay options kept for reference (currently disabled):
        #prioritized_replay=True,
        #prioritized_replay_alpha=0.6,
        #prioritized_replay_beta=0.4,
        #final_prioritized_replay_beta=1.0,
        #prioritized_replay_eps=1e-6,
    )
    .env_runners(num_env_runners=0)  # Set the number of environment runners to 0
    #.rollouts(num_rollout_workers=0)  # No rollout workers when running locally
    # If using the old API stack:
    .api_stack(enable_rl_module_and_learner=False, enable_env_runner_and_connector_v2=False)
)

# Start Ray if it is not running yet (no-op otherwise).
ray.init(ignore_reinit_error=True)

# Build the DQN trainer
trainer = config.build_algo()

# Training loop: one trainer.train() call per iteration, n_epsiode iterations in total.
# Both histories start empty for this run. The reward history uses the name
# `reward_ls` so that the imported `rewards` module is not shadowed.
loss = []  # NOTE(review): stays empty — show_ppo_progress is disabled for DQN results and no DQN loss extraction is implemented yet
reward_ls = []
for i in range(n_epsiode):
    result = trainer.train()
    print(result)
    # Mean reward per policy as reported under result['env_runners']; empty dict if absent.
    policy_rewards = result.get('env_runners', {}).get('policy_reward_mean', {})
    print(f"Episode {i+1}: {policy_rewards}")
    #cum_loss, cum_reward=show_ppo_progress(i+1, result, show_each_agent=False)
    # Record this iteration's reward as the sum of the per-policy mean rewards
    # (0 when no policy rewards were reported).
    reward_ls.append(sum(policy_rewards.values()))