# IMPORTING LIBRARIES

In [1]:
from stable_baselines3 import DQN, PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from sb3_contrib.common.wrappers import ActionMasker
from deterioration_model import bridgedeterioration
import numpy as np
import gymnasium as gym
import torch as th

In [2]:
import logging

# Configure the root logger so that DEBUG-level (and more severe) messages are
# emitted. Use logging.INFO instead for less verbose notebook output.
logging.basicConfig(level=logging.DEBUG)

# Initialize the environment and vectorize it

In [None]:
# Build the single bridge-deterioration environment from the CSV data file.
base_env = bridgedeterioration(df_path='bridge_data.csv')

# Expose that one instance through a DummyVecEnv. The factory closes over
# `base_env`, a name that is never rebound afterwards.
env = DummyVecEnv([lambda: base_env])

# Normalize observations only (intended to improve stability while training):
# rewards are left unscaled and the infinite bound leaves observation clipping
# with no effective limit.
norm_env = VecNormalize(
    env,
    norm_obs=True,
    norm_reward=False,
    clip_obs=float('inf'),
)

# Train the model with a simple PPO algorithm

In [4]:
# Fixed seed so that "Restart Kernel -> Run All" reproduces the same training
# run. PPO forwards it to Python's `random`, NumPy, PyTorch and the vectorized
# environment; full reproducibility additionally assumes the custom environment
# honours the seed it is given -- TODO confirm.
SEED = 42

# Network architecture: two hidden layers of 128 units with ReLU activations.
policy_kwargs = dict(activation_fn=th.nn.ReLU, net_arch=[128, 128])

norm_env.reset()
model = PPO(
    "MlpPolicy",
    norm_env,
    n_epochs=10,
    learning_rate=0.0003,
    batch_size=64,
    ent_coef=0.01,
    gamma=0.99,
    policy_kwargs=policy_kwargs,
    verbose=2,
    seed=SEED,
)

# Train for 100k environment steps, logging metrics every 10 iterations.
model.learn(total_timesteps=100000, log_interval=10)

Using cpu device
------------------------------------------
| time/                   |              |
|    fps                  | 139          |
|    iterations           | 10           |
|    time_elapsed         | 146          |
|    total_timesteps      | 20480        |
| train/                  |              |
|    approx_kl            | 0.0037288873 |
|    clip_fraction        | 0.0131       |
|    clip_range           | 0.2          |
|    entropy_loss         | -26.8        |
|    explained_variance   | 0.715        |
|    learning_rate        | 0.0003       |
|    loss                 | 1.29e+05     |
|    n_updates            | 90           |
|    policy_gradient_loss | -0.0154      |
|    value_loss           | 3.08e+05     |
------------------------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 135          |
|    iterations           | 20           |
|    time_elapsed         | 302      

<stable_baselines3.ppo.ppo.PPO at 0x25b9c582490>

# Run up to 50 steps and inspect the actions taken by the trained model

In [None]:
# Put VecNormalize into evaluation mode before rolling out the trained policy
# (with training=False it should stop updating its running statistics).
norm_env.training = False
obs = norm_env.reset()

# Upper bound on the rollout length; a fixed value, not read from the env.
max_steps = 50
# Indexing envs[0] assumes the VecEnv wraps exactly one environment.
num_bridges = norm_env.envs[0].num_bridges

fp_history = []            # one failure-probability vector per executed step
done = np.array([False])   # VecEnv-style done flags, one entry per environment
steps = 0                  # number of steps executed so far

for step_index in range(max_steps):
    # Stop early once every environment reports the end of its episode.
    if done.all():
        break

    # Deterministic action selection for evaluation.
    action, _ = model.predict(obs, deterministic=True)
    print(f"Step: {steps + 1}, Action: {action[0]}")

    # Advance the environment with the chosen action.
    obs, reward, done, info = norm_env.step(action)

    # The environment reports its raw state through `info`; column 5 is read
    # here as the per-bridge failure probability.
    unnormalized_state = info[0]['state']
    failure_probs = unnormalized_state[:, 5]
    fp_history.append(failure_probs.copy())  # copy so later steps cannot alter the record

    # Compact single-line rendering of the probabilities.
    fp_str = np.array2string(failure_probs, precision=4, suppress_small=True, max_line_width=120)
    print(f"\t Failure Probabilities: {fp_str}")

    steps = step_index + 1

Step: 1, Action: [3 1 1 2 2 5 4 3 2 1 2 2 3 0 1]
	 Failure Probabilities: [0.0926 0.0159 0.0299 0.1754 0.1361 0.0002 0.2864 0.0002 0.0564 0.3945 0.1553 0.1963 0.5605 0.346  0.0017]
Step: 2, Action: [0 1 1 2 2 0 4 2 1 1 2 2 3 0 1]
	 Failure Probabilities: [0.1007 0.0201 0.0355 0.024  0.0098 0.0008 0.0002 0.0002 0.063  0.4067 0.0161 0.0333 0.1092 0.3581 0.0039]
Step: 3, Action: [0 1 2 0 2 0 4 2 1 1 2 2 3 0 1]
	 Failure Probabilities: [0.1092 0.0248 0.0003 0.0285 0.0002 0.0018 0.0002 0.0002 0.0699 0.4188 0.0002 0.0002 0.0002 0.3702 0.0069]
Step: 4, Action: [0 1 2 0 2 0 4 2 1 1 2 2 3 0 2]
	 Failure Probabilities: [0.1179 0.0299 0.0003 0.0333 0.0002 0.0032 0.0002 0.0002 0.0771 0.4309 0.0002 0.0002 0.0002 0.3824 0.0004]
Step: 5, Action: [0 1 2 0 2 0 4 2 1 1 2 2 3 0 2]
	 Failure Probabilities: [0.1269 0.0355 0.0003 0.0386 0.0002 0.005  0.0002 0.0002 0.0847 0.443  0.0002 0.0002 0.0002 0.3945 0.0004]
Step: 6, Action: [0 1 2 0 2 0 4 2 1 1 2 2 3 0 2]
	 Failure Probabilities: [0.1361 0.0415 0.0003

# Feel free to change the reward model to account for different conditions, and save the trained model for later use