## Importing dependencies

In [1]:
import gym
from gym import Env
from gym.spaces import Box, Dict, Discrete

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

import numpy as np
import random
import os

import warnings
# Silence every warning for the rest of the notebook so library deprecation
# notices do not clutter the cell outputs. This goes through the supported
# filter API rather than overwriting `warnings.warn`, so the stdlib function
# stays intact and code that already holds a reference to it is covered too.
warnings.filterwarnings("ignore")

## Types of Spaces

In [2]:
# A Discrete(n) space holds the n integers 0 .. n-1; with n=3 a sample is 0, 1 or 2.
discrete_space = Discrete(n=3)
discrete_space.sample()

1

In [3]:
# A Box is a continuous space: here a 2 x 2 array whose entries each lie between 0 and 1.
box_space = Box(low=0, high=1, shape=(2, 2))
box_space.sample()

array([[0.07707087, 0.341452  ],
       [0.5822266 , 0.03030989]], dtype=float32)

In [4]:
# A Dict space bundles named sub-spaces; sampling it draws one value per key.
params = dict(
    height=Discrete(2),
    speed=Box(0, 100, shape=(1,)),
)
dict_space = Dict(params)
dict_space.sample()

OrderedDict([('height', 1), ('speed', array([0.1801315], dtype=float32))])

## Building an Environment

- Build an agent to give us the best shower possible
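- Temperature starts at a random value (38 ± 3 °C); each step the agent turns the tap down, holds it, or turns it up, changing the temperature by 1 °C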
- Ideal temperature: 37 to 39 °C

In [5]:
class ShowerTempEnv(Env):
    """Toy environment in which the agent regulates a shower's temperature.

    Observation
        One-element float32 array holding the current temperature in °C,
        always kept inside ``observation_space`` (0 to 100).
    Actions
        ``Discrete(3)``: 0 lowers the temperature by 1, 1 holds it, 2 raises
        it by 1.
    Reward
        +1 when the temperature after the action lies in 37..39 (inclusive),
        otherwise -1.
    Episode
        Ends after ``EPISODE_LENGTH`` steps.
    """

    EPISODE_LENGTH = 60          # steps per shower
    IDEAL_RANGE = (37, 39)       # inclusive comfort band, in °C

    def __init__(self):
        self.action_space = Discrete(3)                 # Turn the tap Down, Hold or Up
        self.observation_space = Box(
            low=np.array([0], dtype=np.float32),
            high=np.array([100], dtype=np.float32),
            dtype=np.float32,
        )
        # Use the same initialiser as reset() so the state always has the
        # array shape and dtype that the observation space declares.
        self.state = self._initial_state()
        self.shower_length = self.EPISODE_LENGTH

    def _initial_state(self):
        """Return a fresh start temperature: 38 °C plus or minus up to 3 °C."""
        return np.array([38 + random.randint(-3, 3)], dtype=np.float32)

    def reset(self):
        """Start a new episode and return the initial observation."""
        self.state = self._initial_state()
        self.shower_length = self.EPISODE_LENGTH
        return self.state.copy()

    def step(self, action):
        """Apply ``action`` and advance the episode by one step.

        Args:
            action: 0, 1 or 2 (a Python int, NumPy integer, or a size-1
                array such as the one returned by ``model.predict``).

        Returns:
            Tuple ``(observation, reward, done, info)`` where ``observation``
            is a copy of the current state, ``reward`` is +1 or -1, ``done``
            is True once the episode's steps are used up, and ``info`` is an
            empty dict.
        """
        # Map the action {0, 1, 2} onto a temperature change {-1, 0, +1}.
        delta = int(np.asarray(action).item()) - 1

        # Build a new array instead of mutating in place, so observations
        # handed out earlier are not changed behind the caller's back, and
        # clip so the state never leaves the declared observation bounds.
        self.state = np.clip(
            self.state + delta,
            self.observation_space.low,
            self.observation_space.high,
        ).astype(np.float32)
        self.shower_length -= 1

        lowest_ideal, highest_ideal = self.IDEAL_RANGE
        temperature = float(self.state[0])
        reward = 1 if lowest_ideal <= temperature <= highest_ideal else -1

        done = self.shower_length <= 0
        info = {}

        return self.state.copy(), reward, done, info

## Testing the Environment

In [8]:
# Instantiate the custom environment and show the spaces it exposes.
env = ShowerTempEnv()
print(
    f"Action Space: {env.action_space}",
    f"Observation Space: {env.observation_space}",
    sep="\n",
)

Action Space: Discrete(3)
Observation Space: Box(0.0, 100.0, (1,), float32)


In [12]:
# Baseline: play a few episodes with uniformly random actions and report
# the total reward collected in each one.
episodes = 5

for episode in range(1, episodes + 1):
    obs, score, done = env.reset(), 0, False

    while True:
        action = env.action_space.sample()
        obs, reward, done, info = env.step(action)
        score += reward
        if done:
            break

    print(f"Episode: {episode} | Score: {score}")

Episode: 1 | Score: 22
Episode: 2 | Score: -60
Episode: 3 | Score: 42
Episode: 4 | Score: -50
Episode: 5 | Score: -60


## Train Model

In [13]:
# TensorBoard logs go under training/logs; PPO is built with an MLP policy on the shower env.
log_path = os.path.join('training', 'logs')
model = PPO(
    policy='MlpPolicy',
    env=env,
    verbose=1,
    tensorboard_log=log_path,
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [21]:
# Train for half a million environment steps.
model.learn(total_timesteps=500_000)

Logging to training\logs\PPO_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 60       |
|    ep_rew_mean     | 27       |
| time/              |          |
|    fps             | 1203     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 60          |
|    ep_rew_mean          | 24.6        |
| time/                   |             |
|    fps                  | 880         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.016455702 |
|    clip_fraction        | 0.0662      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.482      |
|    explained_variance   | -0.178      |

<stable_baselines3.ppo.ppo.PPO at 0x236ebafec20>

In [28]:
# Score the trained policy over 10 episodes; the result displays as a pair of numbers.
evaluate_policy(model=model, env=env, n_eval_episodes=10)

(24.0, 54.99090833947008)

In [30]:
# Roll out the trained policy for a few episodes and report each episode's
# total reward.
episodes = 5
for episode in range(1, episodes+1):
    obs = env.reset()
    score = 0
    done = False

    while not done:
        # deterministic=True picks the policy's most likely action instead of
        # sampling one, so the scores reflect what the agent learned rather
        # than sampling noise (evaluate_policy is deterministic by default).
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        score += reward

    print(f"Episode: {episode} | Score: {score}")

Episode: 1 | Score: 60
Episode: 2 | Score: 28
Episode: 3 | Score: -60
Episode: 4 | Score: -60
Episode: 5 | Score: 60


## Save Model

In [31]:
# Persist the trained agent under training/saved_models.
save_path = os.path.join(
    'training',
    'saved_models',
    'PPO_500k_ShowerTemp',
)
model.save(path=save_path)