In [1]:
import os
import gym
from gym import Env
from gym.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete
import random
import numpy as np


from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

# types of spaces
- there are 4 key spaces: discrete, box, multi-binary, miulti-discrete
- 2 wrapper spaces: tuple(not supported by stable-baselines), dict

In [9]:
Discrete(3).sample()

0

In [10]:
Box(0, 1, shape=(3,3)).sample()

array([[0.15012637, 0.86877924, 0.18370081],
       [0.4061565 , 0.62263924, 0.9744529 ],
       [0.89537174, 0.72817314, 0.8255071 ]], dtype=float32)

In [14]:
Tuple((Discrete(3), Box(0, 1, shape=(3,)))).sample()

(1, array([0.84669524, 0.3917245 , 0.8238068 ], dtype=float32))

In [17]:
Dict({'height': Discrete(3), 'speed': Box(0, 1, shape=(1,))}).sample()

OrderedDict([('height', 1), ('speed', array([0.88877785], dtype=float32))])

In [19]:
MultiBinary(4).sample()

array([1, 0, 1, 1], dtype=int8)

In [24]:
MultiDiscrete([5,2,2,5]).sample()

array([2, 1, 0, 3], dtype=int64)

# building an env
- build agent to give best shower
- temp bw 37 and 39 degree

In [49]:
class ShowerEnv(Env):
    def __init__(self):
        self.action_space = Discrete(3)          #here 3 is passed to denote 3 conditions: turn tap on, off, unchanged
        self.observation_space = Box(low= 0, high= 0, shape=(1,))
        self.state = 38 + random.randint(-3,3)    # temp can start from 38 +/- 3
        self.shower_length = 60                   #time is passed as 60 seconds
        
        
    def step(self, action):
        self.state += action-1
        self.shower_length -= 1
        
        #calculate reward
        if self.state >=37 and self.state<=39:
            reward = 1
        else:
            reward = -1
            
            
        if self.shower_length <= 0:
            done = True
        else:
            done = False
            
        
        info={}
        return self.state, reward, done, info
    
    
    def render(self):
        #implement viz
        pass
    
    
    def reset(self):
        self.state = np.array([38 + random.randint(-3,3)]).astype(float)
        self.shower_length = 60
        return self.state

In [50]:
env = ShowerEnv()

In [51]:
env.observation_space.sample()

array([0.], dtype=float32)

In [52]:
env.action_space.sample()

0

In [53]:
env.reset()

array([37.])

In [54]:
# explanaiton of self.state += action-1

# action space is Discrete(3)
# which gives either 0,1,2
# therfore, action-1 gives -1,0,1
# -1 reduces shower temp
# 0 keeps it same
# 1 increases temp

# test environment

In [57]:
episodes = 5
for episode in range(1, episodes+1):
    done = False
    score = 0
    state = env.reset()

    while not done:
        env.render()
        action = env.action_space.sample()
        #next_state, reward, terminated, truncated , info = env.step(action)
        n_state, reward, done, info = env.step(action)
        score += reward
        print('Episode: {} Score: {}'.format(episode, score))
env.close()

Episode: 1 Score: 1
Episode: 1 Score: 2
Episode: 1 Score: 1
Episode: 1 Score: 0
Episode: 1 Score: -1
Episode: 1 Score: -2
Episode: 1 Score: -3
Episode: 1 Score: -4
Episode: 1 Score: -5
Episode: 1 Score: -6
Episode: 1 Score: -7
Episode: 1 Score: -8
Episode: 1 Score: -9
Episode: 1 Score: -10
Episode: 1 Score: -11
Episode: 1 Score: -12
Episode: 1 Score: -13
Episode: 1 Score: -14
Episode: 1 Score: -15
Episode: 1 Score: -16
Episode: 1 Score: -17
Episode: 1 Score: -18
Episode: 1 Score: -19
Episode: 1 Score: -20
Episode: 1 Score: -21
Episode: 1 Score: -22
Episode: 1 Score: -23
Episode: 1 Score: -24
Episode: 1 Score: -25
Episode: 1 Score: -26
Episode: 1 Score: -27
Episode: 1 Score: -28
Episode: 1 Score: -29
Episode: 1 Score: -30
Episode: 1 Score: -31
Episode: 1 Score: -32
Episode: 1 Score: -33
Episode: 1 Score: -34
Episode: 1 Score: -35
Episode: 1 Score: -36
Episode: 1 Score: -37
Episode: 1 Score: -38
Episode: 1 Score: -39
Episode: 1 Score: -40
Episode: 1 Score: -41
Episode: 1 Score: -42
Episo

# train model

In [58]:
log_path = os.path.join('Training', 'Logs')
model = PPO('MlpPolicy', env, verbose =1, tensorboard_log= log_path)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




In [81]:
model.learn(total_timesteps = 200000)

Logging to Training\Logs\PPO_19
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 60       |
|    ep_rew_mean     | -5.41    |
| time/              |          |
|    fps             | 1524     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 60           |
|    ep_rew_mean          | -1.53        |
| time/                   |              |
|    fps                  | 1012         |
|    iterations           | 2            |
|    time_elapsed         | 4            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0011467723 |
|    clip_fraction        | 0.00474      |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.351       |
|    explained_variance   

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 60           |
|    ep_rew_mean          | -3.92        |
| time/                   |              |
|    fps                  | 754          |
|    iterations           | 11           |
|    time_elapsed         | 29           |
|    total_timesteps      | 22528        |
| train/                  |              |
|    approx_kl            | 0.0012039059 |
|    clip_fraction        | 0.0147       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.288       |
|    explained_variance   | -0.00646     |
|    learning_rate        | 0.0003       |
|    loss                 | 55.4         |
|    n_updates            | 480          |
|    policy_gradient_loss | 0.000112     |
|    value_loss           | 122          |
------------------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_l

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 60            |
|    ep_rew_mean          | -0.68         |
| time/                   |               |
|    fps                  | 773           |
|    iterations           | 20            |
|    time_elapsed         | 52            |
|    total_timesteps      | 40960         |
| train/                  |               |
|    approx_kl            | 0.00040709163 |
|    clip_fraction        | 0.00459       |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.226        |
|    explained_variance   | 0.0948        |
|    learning_rate        | 0.0003        |
|    loss                 | 67.7          |
|    n_updates            | 570           |
|    policy_gradient_loss | -0.000221     |
|    value_loss           | 125           |
-------------------------------------------
-------------------------------------------
| rollout/                |     

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 60            |
|    ep_rew_mean          | -5.4          |
| time/                   |               |
|    fps                  | 799           |
|    iterations           | 29            |
|    time_elapsed         | 74            |
|    total_timesteps      | 59392         |
| train/                  |               |
|    approx_kl            | 0.00018223497 |
|    clip_fraction        | 0.00649       |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.147        |
|    explained_variance   | 0.29          |
|    learning_rate        | 0.0003        |
|    loss                 | 61.5          |
|    n_updates            | 660           |
|    policy_gradient_loss | 7.34e-05      |
|    value_loss           | 119           |
-------------------------------------------
-------------------------------------------
| rollout/                |     

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 60            |
|    ep_rew_mean          | -12.7         |
| time/                   |               |
|    fps                  | 804           |
|    iterations           | 38            |
|    time_elapsed         | 96            |
|    total_timesteps      | 77824         |
| train/                  |               |
|    approx_kl            | 1.5987665e-05 |
|    clip_fraction        | 0.000781      |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.0996       |
|    explained_variance   | 0.536         |
|    learning_rate        | 0.0003        |
|    loss                 | 49.5          |
|    n_updates            | 750           |
|    policy_gradient_loss | -1.6e-05      |
|    value_loss           | 118           |
-------------------------------------------
------------------------------------------
| rollout/                |      

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 60            |
|    ep_rew_mean          | -11           |
| time/                   |               |
|    fps                  | 806           |
|    iterations           | 47            |
|    time_elapsed         | 119           |
|    total_timesteps      | 96256         |
| train/                  |               |
|    approx_kl            | 0.00012377143 |
|    clip_fraction        | 0.00269       |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.127        |
|    explained_variance   | 0.629         |
|    learning_rate        | 0.0003        |
|    loss                 | 45.1          |
|    n_updates            | 840           |
|    policy_gradient_loss | -0.000234     |
|    value_loss           | 110           |
-------------------------------------------
------------------------------------------
| rollout/                |      

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 60            |
|    ep_rew_mean          | -11.7         |
| time/                   |               |
|    fps                  | 722           |
|    iterations           | 57            |
|    time_elapsed         | 161           |
|    total_timesteps      | 116736        |
| train/                  |               |
|    approx_kl            | 0.00019362316 |
|    clip_fraction        | 0.00469       |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.0755       |
|    explained_variance   | 0.758         |
|    learning_rate        | 0.0003        |
|    loss                 | 77.5          |
|    n_updates            | 940           |
|    policy_gradient_loss | -0.00029      |
|    value_loss           | 107           |
-------------------------------------------
-------------------------------------------
| rollout/                |     

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 60            |
|    ep_rew_mean          | -13.9         |
| time/                   |               |
|    fps                  | 664           |
|    iterations           | 66            |
|    time_elapsed         | 203           |
|    total_timesteps      | 135168        |
| train/                  |               |
|    approx_kl            | 0.00052964705 |
|    clip_fraction        | 0.00811       |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.102        |
|    explained_variance   | 0.713         |
|    learning_rate        | 0.0003        |
|    loss                 | 47.7          |
|    n_updates            | 1030          |
|    policy_gradient_loss | -2.78e-05     |
|    value_loss           | 125           |
-------------------------------------------
------------------------------------------
| rollout/                |      

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 60            |
|    ep_rew_mean          | -15.4         |
| time/                   |               |
|    fps                  | 625           |
|    iterations           | 75            |
|    time_elapsed         | 245           |
|    total_timesteps      | 153600        |
| train/                  |               |
|    approx_kl            | 0.00029985933 |
|    clip_fraction        | 0.00381       |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.0782       |
|    explained_variance   | 0.811         |
|    learning_rate        | 0.0003        |
|    loss                 | 48.8          |
|    n_updates            | 1120          |
|    policy_gradient_loss | -0.000867     |
|    value_loss           | 103           |
-------------------------------------------
-------------------------------------------
| rollout/                |     

KeyboardInterrupt: 

In [62]:
shower_path = os.path.join('Training', 'Saved Models', 'Shower_model_ppo')

In [63]:
model.save(shower_path)

In [80]:
evaluate_policy(model, env, n_eval_episodes=10, render=True)
# [0] indicates reward
# [1] indicates variance

(12.0, 58.787753826796276)