In [1]:
pip install stable-baselines3 gym

Collecting stable-baselines3
  Downloading stable_baselines3-2.3.2-py3-none-any.whl.metadata (5.1 kB)
Collecting gymnasium<0.30,>=0.28.1 (from stable-baselines3)
  Downloading gymnasium-0.29.1-py3-none-any.whl.metadata (10 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium<0.30,>=0.28.1->stable-baselines3)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading stable_baselines3-2.3.2-py3-none-any.whl (182 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.3/182.3 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium, stable-baselines3
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1 stable-baselines3-2.3

In [4]:
pip install shimmy

  and should_run_async(code)


Collecting shimmy
  Downloading Shimmy-2.0.0-py3-none-any.whl.metadata (3.5 kB)
Collecting gymnasium>=1.0.0a1 (from shimmy)
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Downloading Shimmy-2.0.0-py3-none-any.whl (30 kB)
Downloading gymnasium-1.0.0-py3-none-any.whl (958 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m958.1/958.1 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gymnasium, shimmy
  Attempting uninstall: gymnasium
    Found existing installation: gymnasium 0.29.1
    Uninstalling gymnasium-0.29.1:
      Successfully uninstalled gymnasium-0.29.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
stable-baselines3 2.3.2 requires gymnasium<0.30,>=0.28.1, but you have gymnasium 1.0.0 which is incompatible.[0m[31m
[0mSuccessfully installed gymnasium-1.0.0 shimmy-2.0.0


In [5]:
import numpy as np
import gym
import torch
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.vec_env import VecNormalize

# Custom environment for Battery Consumption with Energy Thermal Management
class BatteryThermalEnv(gym.Env):
    def __init__(self):
        super(BatteryThermalEnv, self).__init__()
        self.action_space = gym.spaces.Box(low=0, high=1, shape=(1,), dtype=np.float32)  # Action: power usage (scaled)
        self.observation_space = gym.spaces.Box(low=np.array([0, 0, 0]), high=np.array([100, 120, 80]), dtype=np.float32)  # Battery, speed, temp
        self.state = [100, 50, 30]  # Initial state: [battery level, speed, temperature]

    def reset(self):
        self.state = [100, 50, 30]  # Reset to full battery, moderate speed, moderate temp
        return np.array(self.state, dtype=np.float32)

    def step(self, action):
        battery, speed, temp = self.state
        power_consumption = action[0] * 100  # Scale power consumption
        cooling_effect = max(0, 1 - temp / 100)  # Cooling effect based on temp
        thermal_penalty = 0.1 * max(0, temp - 40)  # Extra penalty if temp > 40

        # Update battery level, speed, temperature
        battery -= power_consumption * (1 + speed / 100 + thermal_penalty)
        battery = max(battery, 0)

        speed = np.clip(speed + np.random.uniform(-5, 5), 0, 120)
        temp = np.clip(temp + np.random.uniform(-1, 1) * (power_consumption / 100), 0, 80)

        self.state = [battery, speed, temp]

        # Reward is based on minimizing power consumption and avoiding high temp penalties
        reward = -power_consumption - thermal_penalty * 10
        done = battery <= 0

        return np.array(self.state, dtype=np.float32), reward, done, {}

# Initialize environment
env = BatteryThermalEnv()
env = DummyVecEnv([lambda: env])  # Wrap environment for vectorized execution
env = VecNormalize(env, norm_reward=True)

# Parameters for PPO-Penalty algorithm
lambda_param = 1.5
kl_target = 0.01
kl_max = 0.03
kl_min = 0.005
kappa = 0.2

# Create and train PPO model
print("Training PPO...")

# Define model with PPO-Clip or PPO-Penalty
ppo_model = PPO("MlpPolicy", env, verbose=1, ent_coef=0.01)

# Training loop with PPO-Clip and PPO-Penalty
num_episodes = 50
steps_per_episode = 100
for episode in range(num_episodes):
    obs = env.reset()

    for step in range(steps_per_episode):
        # Take an action using the PPO policy
        action, _states = ppo_model.predict(obs, deterministic=True)
        obs, reward, done, _ = env.step(action)

        # Update networks
        ppo_model.learn(total_timesteps=100, log_interval=10)  # Increase total_timesteps as needed

        # Calculate KL divergence for PPO-Penalty
        if hasattr(ppo_model, 'policy'):
            policy = ppo_model.policy
            old_policy = ppo_model.policy_old
            kl_divergence = torch.distributions.kl.kl_divergence(policy.dist, old_policy.dist).mean().item()

            # Apply PPO-Penalty adjustments
            if kl_divergence > kl_max:
                kappa *= lambda_param
            elif kl_divergence < kl_min:
                kappa /= lambda_param

        # Check if done, then reset
        if done:
            obs = env.reset()

print("Training completed.")

# Test PPO model on the environment
def test_model(model, env, num_episodes=10):
    rewards = []
    for episode in range(num_episodes):
        obs = env.reset()
        total_reward = 0
        done = False
        while not done:
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, _ = env.step(action)
            total_reward += reward
        rewards.append(total_reward)
    avg_reward = np.mean(rewards)
    print(f"Average reward over {num_episodes} episodes: {avg_reward}")
    return avg_reward

# Run test and comparison
ppo_reward = test_model(ppo_model, env)

# Output PPO model's performance
print(f"PPO Model Average Reward: {ppo_reward}")


  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


Training PPO...
Using cpu device


AttributeError: 'PPO' object has no attribute 'policy_old'

In [12]:
import numpy as np
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

# Custom environment for Battery Consumption with Energy Thermal Management
class BatteryThermalEnv(gym.Env):
    def __init__(self):
        super(BatteryThermalEnv, self).__init__()
        self.action_space = gym.spaces.Box(low=0, high=1, shape=(1,), dtype=np.float32)  # Action: power usage (scaled)
        self.observation_space = gym.spaces.Box(
            low=np.array([0, 0, 0]),
            high=np.array([100, 120, 80]),
            dtype=np.float32
        )  # Battery, speed, temp
        self.state = [100, 50, 30]  # Initial state: [battery level, speed, temperature]

    def reset(self):
        self.state = [100, 50, 30]  # Reset to full battery, moderate speed, moderate temp
        return np.array(self.state, dtype=np.float32)

    def step(self, action):
        battery, speed, temp = self.state
        power_consumption = action[0] * 100  # Scale power consumption
        cooling_effect = max(0, 1 - temp / 100)  # Cooling effect based on temp
        thermal_penalty = 0.1 * max(0, temp - 40)  # Extra penalty if temp > 40

        # Update battery level, speed, temperature
        battery -= power_consumption * (1 + speed / 100 + thermal_penalty)
        battery = max(battery, 0)

        speed = np.clip(speed + np.random.uniform(-5, 5), 0, 120)
        temp = np.clip(temp + np.random.uniform(-1, 1) * (power_consumption / 100), 0, 80)

        self.state = [battery, speed, temp]

        # Reward is based on minimizing power consumption and avoiding high temp penalties
        reward = -power_consumption - thermal_penalty * 10
        done = battery <= 0

        return np.array(self.state, dtype=np.float32), reward, done, {}

# Initialize environment
env = BatteryThermalEnv()
env = DummyVecEnv([lambda: env])  # Wrap environment for vectorized execution
env = VecNormalize(env, norm_reward=True)

# Create and train PPO model
print("Training PPO...")

ppo_model = PPO("MlpPolicy", env, verbose=1, ent_coef=0.01)

# Train the model
ppo_model.learn(total_timesteps=10000)  # Increase timesteps as needed
print("Training completed.")

# Test PPO model on the environment
def test_model(model, env, num_episodes=10):
    rewards = []
    for episode in range(num_episodes):
        obs = env.reset()
        total_reward = 0
        done = False
        step=0
        while not done:
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, _ = env.step(action)
            total_reward += reward
            step+=1
            if done or step>10000:
              break
        rewards.append(total_reward)
    avg_reward = np.mean(rewards)
    print(f"Average reward over {num_episodes} episodes: {avg_reward}")
    return avg_reward

# Run test and comparison
ppo_reward = test_model(ppo_model, env)

# Output PPO model's performance
print(f"PPO Model Average Reward: {ppo_reward}")


Training PPO...
Using cpu device
-----------------------------
| time/              |      |
|    fps             | 944  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 618        |
|    iterations           | 2          |
|    time_elapsed         | 6          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.03685519 |
|    clip_fraction        | 0.207      |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.37      |
|    explained_variance   | -0.0589    |
|    learning_rate        | 0.0003     |
|    loss                 | 0.0232     |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.0282    |
|    std                  | 0.912      |
|    value_loss           | 1.17       |
-------------------