# 1. Import Dependencies

In [2]:
# Import GYM stuff
import gym
from gym import Env
from gym.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete

# Import helpers
import numpy as np
import random
import os

# Import Stable baselines stuff
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

# 2. Types of Spaces

In [4]:
Discrete(3).sample()

2

In [6]:
Box(0,1, shape=(3,3)).sample()

array([[0.84854555, 0.33043265, 0.21097879],
       [0.01186542, 0.7988581 , 0.09130581],
       [0.07418171, 0.11659063, 0.8978787 ]], dtype=float32)

In [10]:
Tuple((Discrete(3), Box(0,1, shape=(3,3)))).sample()

(2,
 array([[0.10430644, 0.3855239 , 0.7692416 ],
        [0.5384359 , 0.7831622 , 0.4466417 ],
        [0.3415042 , 0.56906396, 0.49564177]], dtype=float32))

In [11]:
Dict({'height': Discrete(2), 'speed': Box(0,100,shape=(1,))}).sample()

OrderedDict([('height', 0), ('speed', array([92.85357], dtype=float32))])

In [14]:
MultiBinary(4).sample()

array([1, 0, 1, 1], dtype=int8)

In [15]:
MultiDiscrete([5,2,2]).sample()

array([2, 0, 1], dtype=int64)

# 3. Building an Environment
- Build an agent to give us the best shower possible
- Random temperature
- Best temp 37 to 39 degrees (but agent doesn't know this)

In [31]:
class ShowerEnv(Env):
    def __init__(self):
        self.action_space = Discrete(3) # turn the tap up, down, or unchanged
        self.observation_space = Box(low=np.array([0]), high=np.array([100]))
        self.state = 38 + random.randint(-3,3)
        self.shower_length = 60
        
    def step(self, action):
        # Apply temp adj
        self.state += action-1
        
        # Decrease shower time
        self.shower_length-1
        
        # Calculate Reward
        if self.state >= 37 and self.state <= 39:
            reward = 1
        else:
            reward = -1
            
        
        if self.shower_length <= 0:
            done = True
        else:
            done = False
            
        info = {}
            
        return self.state, reward, done, info
    
    def render(self):
        # Implement visualization
        pass
    def reset(self):
        self.state = np.array([38+random.randint(-3,3)]).astype(float)
        self.shower_length = 60
        return self.state

In [32]:
env = ShowerEnv()

In [19]:
env.observation_space.sample()

array([23.740873], dtype=float32)

In [27]:
env.action_space.sample()

1

# 4. Test Environment

In [33]:
episodes = 5
for episode in range(1,episodes+1):
    obs = env.reset()
    done = False
    score = 0
    
    while not done:
        env.render()
        action = env.action_space.sample()
        obs, reward, done, info = env.step(action)
        score += reward
    print("Episode:{} Score:{}".format(episode,score))
env.close()

KeyboardInterrupt: 

# 5. Train Model

In [None]:
log_path = os.path.join('Training','Logs')
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)