In [98]:
import gym
from gym import Env # env superclass
from gym.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete

import numpy as np
import random
import os
import warnings

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

## Space Types

In [99]:
warnings.filterwarnings("ignore")

In [100]:
Discrete(3).sample() # exmaple of discrete spaces, here we have 0 1 2, you can map to each action

0

In [101]:
Box(low=0, high=1, shape=(3,)).sample() # 3x1
Box(low=0, high=1, shape=(3,3)).sample() # ex for continuous variables, 3x3

array([[0.25859082, 0.1462454 , 0.43991354],
       [0.89556605, 0.7572077 , 0.5663108 ],
       [0.58212054, 0.19536789, 0.6777912 ]], dtype=float32)

In [102]:
Tuple((Discrete(3), Box(0,1,(3,)))).sample # combined

<bound method Tuple.sample of Tuple(Discrete(3), Box(0.0, 1.0, (3,), float32))>

In [103]:
Dict({
    'height':Discrete(2),
    'speed':Box(0,100,shape=(1,)),
    'color':MultiDiscrete([255,255,255])
    }).sample()

OrderedDict([('color', array([ 80,  11, 233], dtype=int64)),
             ('height', 0),
             ('speed', array([33.887768], dtype=float32))])

In [104]:
MultiBinary(12).sample() # Multiple binary values (0, 1)

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1], dtype=int8)

In [105]:
MultiDiscrete([100000, 1000, 10]).sample() # Multiple discrete values

array([91440,   532,     6], dtype=int64)

## Custom Env
- Agent to give best shower (temp) possible
- Temp is going to fluctuate (people in building)
- Optimal temp is 37-39 degrees (it needs to learn to get to this conclusion)

In [106]:
from gym import Env

class Skeleton(Env):
    
    def __init__(self):
        self.action_space = None
        self.observation_space = None
        
    def step(self, action):
        pass
    
    def render(self):
        pass
    
    def reset(self):
        pass

In [107]:
from gym import Env

class ShowerEnv(Env):
    
    def __init__(self):
        # actions
        self.action_space = Discrete(3) # Tap up, down, or hold
        
        # happening
        self.observation_space = Box(low=np.array([0]), high=np.array([100])) # Observation space is between temps of 0-100
        
        # initial state 38 +- 3, agent has to get range of 37-39
        self.state = 38 + random.randint(-3,3)
        
        # episode lenght, 60 sec shower, every action -1
        self.shower_lenght = 60
        
    def step(self, action):
        # Apply temp object, action passed in changes the space
        self.state += action-1
        
        # Shower time decrease
        self.shower_lenght -= 1
        
        # Culculate Reward
        if self.state >= 37 and self.state <= 39:
            reward = 1
        else:
            reward = -1
            
        if self.shower_lenght <= 0:
            terminated = True
        else:
            terminated = False
            
        info = {}
        
        return self.state, reward, terminated, info # returns: observation, reward, terminated and info        
    
    def render(self):
        pass # we can leave it for now
    
    def reset(self):
        self.state = np.array([38+random.randint(-3,3)]).astype(float)
        self.shower_lenght = 60
        return self.state # return observation

In [108]:
env = ShowerEnv()

In [109]:
env.action_space

Discrete(3)

In [110]:
env.observation_space.sample()

array([58.742447], dtype=float32)

## EnvTest

In [111]:
for episode in range(5):
    obs = env.reset()
    done = False
    score = 0
    
    while not done:
        action = env.action_space.sample() # random random sample
        obs, reward, done, info = env.step(action)
        
        score += reward
        
    print(f'Ep: {episode}, Score" {score}')
    
    env.close()

Ep: 0, Score" -52
Ep: 1, Score" -38
Ep: 2, Score" -60
Ep: 3, Score" -42
Ep: 4, Score" -32


In [112]:
env.close()

In [113]:
env??

[1;31mType:[0m            ShowerEnv
[1;31mString form:[0m     <ShowerEnv instance>
[1;31mDocstring:[0m       <no docstring>
[1;31mClass docstring:[0m
The main OpenAI Gym class.

It encapsulates an environment with arbitrary behind-the-scenes dynamics.
An environment can be partially or fully observed.

The main API methods that users of this class need to know are:

- :meth:`step` - Takes a step in the environment using an action returning the next observation, reward,
  if the environment terminated and observation information.
- :meth:`reset` - Resets the environment to an initial state, returning the initial observation and observation information.
- :meth:`render` - Renders the environment observation with modes depending on the output
- :meth:`close` - Closes the environment, important for rendering where pygame is imported

And set the following attributes:

- :attr:`action_space` - The Space object corresponding to valid actions
- :attr:`observation_space` - The Space ob

## Train Model

In [114]:
log_path = 'logs'

In [116]:
model = PPO('MlpPolicy', env, tensorboard_log=log_path, device='cuda', verbose=1)
model.learn(200_000)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to logs\PPO_6
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 60       |
|    ep_rew_mean     | -26.3    |
| time/              |          |
|    fps             | 738      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 60           |
|    ep_rew_mean          | -26.3        |
| time/                   |              |
|    fps                  | 588          |
|    iterations           | 2            |
|    time_elapsed         | 6            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0083359955 |
|    clip_fraction        | 0.0239       |
|    clip_range           |

<stable_baselines3.ppo.ppo.PPO at 0x21f92dda870>

In [117]:
model.save('models\\showe_model_PPO')
evaluate_policy(model, env, n_eval_episodes=10, render=False)

(-36.0, 48.0)