In [None]:
#https://sourceforge.net/projects/swig/files/swigwin/swigwin-4.0.2/swigwin-4.0.2.zip/download?use_mirror=ixpeering


# 1. Import Dependencies

In [70]:
import gymnasium as gym 
from gymnasium import Env
from gymnasium.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete 
import numpy as np
import random
import os
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy


# 2. Types of Spaces

In [71]:
# A discrete space with three possible values.
Discrete(n=3)


Discrete(3)

In [72]:
# Draw one random 3x3 sample from a continuous box bounded by 0 and 1.
Box(low=0, high=1, shape=(3, 3)).sample()


array([[0.9992809 , 0.32839844, 0.7395926 ],
       [0.57921183, 0.6882018 , 0.33960867],
       [0.27412984, 0.9018109 , 0.48158562]], dtype=float32)

In [73]:
# Same idea with an integer dtype: a random 3x3 sample bounded by 0 and 255.
Box(low=0, high=255, shape=(3, 3), dtype=int).sample()


array([[ 27,  11, 175],
       [ 18,  34,  36],
       [181, 234, 188]])

In [74]:
# A Tuple space combines sub-spaces positionally; sample() draws from each.
Tuple(spaces=(Discrete(n=2), Box(low=0, high=100, shape=(1,)))).sample()


(np.int64(0), array([89.51803], dtype=float32))

In [75]:
# A Dict space combines sub-spaces by key; sample() returns one value per key.
Dict(
    spaces={
        'height': Discrete(n=2),
        "speed": Box(low=0, high=100, shape=(1,)),
    }
).sample()


{'height': np.int64(0), 'speed': array([61.501575], dtype=float32)}

In [76]:
# Four independent binary values.
MultiBinary(n=4).sample()


array([1, 1, 1, 1], dtype=int8)

In [77]:
# Three discrete values with 5, 2 and 2 possible choices respectively.
MultiDiscrete(nvec=[5, 2, 2]).sample()


array([4, 0, 0])

# 3. Building an Environment

In [78]:
class ShowerEnv(Env):
    """Toy shower-temperature control environment.

    Each step the agent lowers the temperature by one, leaves it unchanged, or
    raises it by one, and is rewarded for keeping it between 37 and 39
    (inclusive). Every episode lasts exactly 60 steps.

    Observation: ``Box(0, 100, shape=(1,), dtype=float32)`` - the temperature.
    Actions: ``Discrete(3)`` - 0 = down, 1 = stay, 2 = up.
    Reward: +1 while the temperature is within [37, 39], otherwise -1.
    """

    def __init__(self):
        # Actions we can take, down, stay, up
        self.action_space = Discrete(3)
        # Temperature array. The bounds are given as float32 so they match the
        # dtype of the observations returned by reset() and step().
        self.observation_space = Box(
            low=np.array([0], dtype=np.float32),
            high=np.array([100], dtype=np.float32),
            dtype=np.float32,
        )
        # Set start temp
        self.state = self._sample_start_temp()
        # Set shower length
        self.shower_length = 60

    def _sample_start_temp(self):
        """Return a start temperature in 35..41 drawn from the env's own RNG."""
        # Generator.integers excludes the upper bound, so (-3, 4) yields -3..3.
        # Using self.np_random (rather than the global `random` module) makes
        # the start temperature reproducible through reset(seed=...).
        return 38 + int(self.np_random.integers(-3, 4))

    def step(self, action):
        """Advance the shower by one time step.

        Args:
            action: 0, 1 or 2, meaning temperature down, unchanged or up.

        Returns:
            ``(observation, reward, terminated, truncated, info)`` where the
            observation is a float32 array of shape ``(1,)``.
        """
        # Apply action (0/1/2 maps to -1/0/+1) and clamp the result so the
        # observation never leaves the bounds declared in observation_space.
        self.state = min(max(self.state + action - 1, 0), 100)

        # Reduce shower length
        self.shower_length -= 1

        # Reward logic
        reward = 1 if 37 <= self.state <= 39 else -1

        # The environment itself ends the episode once the shower time is used
        # up, and reports that through `terminated`; `truncated` is never set.
        terminated = self.shower_length <= 0
        truncated = False

        info = {}
        return np.array([self.state], dtype=np.float32), reward, terminated, truncated, info

    def render(self):
        """No visualisation is implemented for this environment."""
        pass

    def reset(self, seed=None, options=None):
        """Start a new episode.

        Args:
            seed: Optional seed; when given, the start temperature drawn here
                and in later resets becomes reproducible.
            options: Accepted for API compatibility; not used.

        Returns:
            ``(observation, info)`` with a float32 observation of shape ``(1,)``.
        """
        # Seeds self.np_random when a seed is provided.
        super().reset(seed=seed)
        self.state = self._sample_start_temp()
        # Reset shower time
        self.shower_length = 60
        info = {}
        return np.array([self.state], dtype=np.float32), info


In [79]:
# Instantiate the custom environment used for the rest of the notebook.
env = ShowerEnv()


In [80]:
# A random observation drawn from the declared observation space.
obs_space = env.observation_space
obs_space.sample()


array([21.818592], dtype=float32)

In [81]:
# reset() returns an (observation, info) pair for the start of an episode.
env.reset(seed=None, options=None)


(array([40.], dtype=float32), {})

In [82]:
from stable_baselines3.common.env_checker import check_env


In [83]:
# Run Stable-Baselines3's environment checker against the custom env.
check_env(env=env, warn=True)


# 4. Test Environment

In [88]:
# Roll out a few episodes with uniformly random actions and print the total
# reward of each one as a baseline for the untrained behaviour.
episodes = 5
for episode in range(1, episodes+1):
    state, _ = env.reset()
    done = False
    score = 0 
    
    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, terminated, truncated, info = env.step(action)
        # An episode is over when it is either terminated or truncated;
        # checking only `terminated` would loop forever on a truncated episode.
        done = terminated or truncated
        score += reward
    print(f'Episode:{episode} Score:{score}')


Episode:1 Score:-52
Episode:2 Score:-10
Episode:3 Score:-36
Episode:4 Score:26
Episode:5 Score:-60


In [89]:
# Release any resources held by the environment after the random rollouts.
# ShowerEnv defines no close() of its own, so the base-class version runs.
env.close()


# 5. Train Model

In [90]:
# Directory that is handed to PPO as its TensorBoard log location.
log_path = os.path.join("Training", "Logs")


In [91]:
# PPO agent with an MLP policy; training metrics are logged under log_path.
model = PPO(
    policy="MlpPolicy",
    env=env,
    tensorboard_log=log_path,
    verbose=1,
)


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [92]:
# Train the agent for 400,000 environment steps.
model.learn(total_timesteps=400_000)


Logging to Training\Logs\PPO_11
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 60       |
|    ep_rew_mean     | -27.8    |
| time/              |          |
|    fps             | 3353     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 60          |
|    ep_rew_mean          | -28.1       |
| time/                   |             |
|    fps                  | 1622        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008405151 |
|    clip_fraction        | 0.0794      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | -6.01e-05   

<stable_baselines3.ppo.ppo.PPO at 0x20ec0d2fc50>

# 6. Save and Evaluate Model

In [93]:
# Persist the trained agent under the name 'PPO' in the working directory.
model.save(path='PPO')


In [96]:
# Evaluate the trained agent over 10 episodes; the result is a
# (mean reward, reward standard deviation) pair.
evaluate_policy(model=model, env=env, n_eval_episodes=10, render=True)


(np.float64(47.4), np.float64(35.811171441325406))