# Import dependencies

In [49]:
import gymnasium as gym
from gymnasium import Env
from gymnasium.spaces import Discrete, Box

import numpy as np
import random
import os

from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

# Building an enviroment

The ResourceManagementEnv class implements the resource management environment. 

Resources are represented as a vector of numeric values that can range from 0 to a predefined maximum value. The goal of the agent is to allocate the resources to achieve the desired resource configuration (goal_resources).

In [84]:
class EnergySavingEnv(gym.Env):
    def __init__(self):
        # Define the action space
        self.action_space = Discrete(3)  # Increase(1), Decrease(-1) or Keep(0) actions

        # Define the observation space
        self.observation_space = Box(low=0, high=100, shape=(1,))  # Energy level ranging from 0 to 100

        # Define the initial energy level
        self.energy_level = random.randint(50, 80)

        # Define the resource manager properties
        self.energy_capacity = 100  # Maximum energy capacity
        self.energy_threshold = 60  # Energy threshold for energy-saving mode
        self.energy_saving_factor = 0.5  # Factor by which energy is saved when in energy-saving mode
        self.energy_usage = 5  # Energy used per time step

        # Define the number of time steps for the resource manager 
        self.time_steps = 100

        # Initialize the current time step
        self.current_step = 0

    def step(self, action):
        # Apply the action to change the energy level
        if action == 0:  # Decrease energy
            self.energy_level -= self.energy_usage
        elif action == 1:  # Keep energy level (no change)
            pass
        elif action == 2:  # Increase energy
            self.energy_level += self.energy_usage

        # Clip energy level within the valid range
        self.energy_level = max(0, min(self.energy_capacity, self.energy_level))

        # Calculate reward based on energy-saving state
        if self.energy_level >= self.energy_threshold:
            reward = 1
        else:
            reward = -1

        # Check if the episode is done
        done = self.current_step >= self.time_steps

        # Increment the current time step
        self.current_step += 1

        # Additional info can be an empty dictionary
        info = {}
        truncated = False

        return self.energy_level, reward, done, truncated, info

    def reset(self, seed=None):
        # Reset the environment to the initial state
        self.energy_level = np.array([random.randint(50, 80)]).astype(float)
        self.current_step = 0

        return self.energy_level, {}

    def render(self):
        # Implement visualization (optional)
        pass

# Test enviroment

In [94]:
# Custom Enviroment for energy saving
env_energy_saving = EnergySavingEnv()

In [68]:
env_energy_saving.observation_space.sample()

array([57.10221], dtype=float32)

In [69]:
env_energy_saving.action_space.sample()

2

In [70]:
env_energy_saving.reset()

(array([52.]), {})

In [86]:
episodes = 5
for episode in range(1, episodes+1):
  obs = env_energy_saving.reset()
  done = False
  score = 0

  while not done:
    env_energy_saving.render()
    action = env_energy_saving.action_space.sample()
    obs, reward, done, truncated, info = env_energy_saving.step(action)
    score += reward

  print('Episode: {} Score {}'.format(episode, score))
#env.close()

Episode: 1 Score 97
Episode: 2 Score -11
Episode: 3 Score 95
Episode: 4 Score -75
Episode: 5 Score -57


# Train model

* PPO is a popular choice for environments with continuous actions and continuous observation spaces, like my resource management environment.
* For environments with continuous actions, I gonna use the MlpPolicy policy, which is a neural network policy with fully connected layers (multilayer perceptron).

In [95]:
log_path = os.path.join('Training', 'Logs')

In [96]:
energy_saving_model = DQN('MlpPolicy', env_energy_saving, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [97]:
energy_saving_model.learn(total_timesteps=500000)

Logging to Training\Logs\DQN_3


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 101      |
|    ep_rew_mean      | 64.5     |
|    exploration_rate | 0.992    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 4149     |
|    time_elapsed     | 0        |
|    total_timesteps  | 404      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 101      |
|    ep_rew_mean      | 58.8     |
|    exploration_rate | 0.985    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 5125     |
|    time_elapsed     | 0        |
|    total_timesteps  | 808      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 101      |
|    ep_rew_mean      | 57.2     |
|    exploration_rate | 0.977    |
| time/               |          |
|    episodes       

<stable_baselines3.dqn.dqn.DQN at 0x18556740b20>

# Save model

In [98]:
path = os.path.join('Training', 'Saved Models', f'DQN_500k_energy_saving_model')
energy_saving_model.save(path)

In [99]:
del energy_saving_model

In [100]:
energy_model = DQN.load(path, env_energy_saving)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


# Evaluate model

In [101]:
mean_reward, std_reward = evaluate_policy(energy_model, env_energy_saving, n_eval_episodes=10, render=True)

print(f"Mean reward:{mean_reward:.2f} +/- {std_reward:.2f}")

Mean reward:100.60 +/- 0.80


In [102]:
episodes = 5

for episode in range(1, episodes+1):
    obs, _ = env_energy_saving.reset()
    done = False
    score = 0

    while not done:
        env_energy_saving.render()
        action = energy_model.predict(obs)
        obs, reward, done, truncated, info = env_energy_saving.step(action)
        score += reward

    print('Episode: {} Score {}'.format(episode, score))
    #env.close()

Episode: 1 Score 101
Episode: 2 Score 101
Episode: 3 Score 101
Episode: 4 Score -101
Episode: 5 Score -101


# Viewing logs in Tensorboard

* **Average Reward:** Indicates how well the model perform in the particular enviroment.
* **Average Episode Length:** Indicates how long the agent lost in the particular enviroment.

In [103]:
training_log_path = os.path.join(log_path, 'DQN_1')

In [104]:
!tensorboard --logdir={training_log_path}

^C
