In [165]:
#https://sourceforge.net/projects/swig/files/swigwin/swigwin-4.0.2/swigwin-4.0.2.zip/download?use_mirror=ixpeering

# 1. Import Dependencies

In [166]:
!pip install gym
!pip install stable_baselines3[extra]



In [167]:
import gym 
from gym import Env
from gym.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete 
import numpy as np
import random
import os
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy

# 2. Types of Spaces

# 3. Building an Environment

In [168]:
class ScenarioEnv(Env):
    """Gym environment in which an agent launches at targets until every
    target has received exactly its expected amount of damage.

    The state is a vector of 13 integers:

    * index 0      - remaining step budget; starts at 14 and drops by one on
                     every call to ``step``.
    * indices 1-6  - expected damage per target
                     (0 - Untouched, 1 - Disabled, 2 - Destroyed).
    * indices 7-12 - number of launches applied so far to each target.

    Actions: 0 does nothing; 1..6 launch once at target ``action - 1``.

    Rewards: 0 for doing nothing, +1 for a launch that keeps the target at or
    below its expected damage, -5 for a launch that overshoots it, and 100
    (replacing the step reward) once the applied damage of every target
    equals its expected damage, which also ends the episode.
    """

    # Number of targets tracked in the state vector.
    NUM_TARGETS = 6
    # Index of the first expected-damage entry (index 0 is the step budget).
    TARGETS_OFFSET = 1
    # Start state: step budget, expected damage per target, applied damage per target.
    INITIAL_STATE = (14, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0)

    def __init__(self):
        """Declare the action/observation spaces and set the start state."""
        # Actions we can take: 0 - Do Nothing, 1..NUM_TARGETS - Launch at that target
        self.action_space = Discrete(self.NUM_TARGETS + 1)
        # [step budget] + [expected damage per target] + [applied launches per target]
        # NOTE(review): a launch counter reaches 14 if the same target is hit on
        # all 14 steps, which is outside the 0..13 range declared for it here;
        # only the final observation of such an episode is affected.
        self.observation_space = MultiDiscrete(
            [100] + [3] * self.NUM_TARGETS + [14] * self.NUM_TARGETS
        )
        # Set start state
        self.state = self._initial_state()

    def _initial_state(self):
        """Return a fresh copy of the start state as an int64 array."""
        return np.array(self.INITIAL_STATE, dtype=np.int64)

    def step(self, action):
        """Apply ``action`` and advance the episode by one step.

        Returns:
            A ``(state, reward, done, info)`` tuple. ``state`` is the
            environment's own state array (not a copy); ``info`` is always an
            empty dict.
        """
        done = False
        reward = 0
        expected_start = self.TARGETS_OFFSET
        current_start = expected_start + self.NUM_TARGETS

        # Apply action: anything other than a launch (1..NUM_TARGETS) is a no-op.
        if 0 < action <= self.NUM_TARGETS:
            target = action - 1
            expected_idx = expected_start + target
            current_idx = current_start + target
            self.state[current_idx] += 1
            # Hitting a target more often than expected is penalised.
            overshoot = self.state[current_idx] > self.state[expected_idx]
            reward = -5 if overshoot else 1

        # The scenario is solved only when ALL targets match their expected
        # damage. The slices cover every target, including the last one.
        expected = self.state[expected_start:current_start]
        current = self.state[current_start:current_start + self.NUM_TARGETS]
        if np.array_equal(current, expected):
            reward = 100
            done = True

        # Consume one step of the budget
        self.state[0] -= 1

        # The episode also ends when the budget is exhausted
        if self.state[0] <= 0:
            done = True

        # Set placeholder for info
        info = {}

        # Return step information
        return self.state, reward, done, info

    def render(self):
        """No visualisation is implemented."""
        pass

    def reset(self):
        """Restore the start state and return it."""
        self.state = self._initial_state()
        return self.state


In [169]:
# Instantiate the custom environment used by the remaining cells.
env=ScenarioEnv()

In [170]:
# The start state must lie inside the declared observation space.
assert env.observation_space.contains(env.reset()), "start state is outside observation_space"
# Action 5 launches at target index 4, whose expected damage is 0, so this overshoots.
env.step(5)

(array([13,  2,  1,  1,  1,  0,  0,  0,  0,  0,  0,  1,  0], dtype=int64),
 -5,
 False,
 {})

In [171]:
# Restore the start state after the manual step in the previous cell.
env.reset()

array([14,  2,  1,  1,  1,  0,  0,  0,  0,  0,  0,  0,  0], dtype=int64)

In [172]:
from stable_baselines3.common.env_checker import check_env

In [173]:
# Run Stable-Baselines3's environment checker against the custom env, with warnings enabled.
check_env(env, warn=True)

# 4. Test Environment

In [174]:
# Smoke-test the environment: play a few episodes with randomly sampled
# actions and print the total reward collected in each one.
episodes = 5
for episode in range(1, episodes + 1):
    env.reset()
    score, done = 0, False

    while not done:
        env.render()
        transition = env.step(env.action_space.sample())
        # transition is (state, reward, done, info)
        score += transition[1]
        done = transition[2]
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:-36
Episode:2 Score:-25
Episode:3 Score:-35
Episode:4 Score:-42
Episode:5 Score:-31


In [175]:
# NOTE(review): the previous cell already ends with env.close(), and the same
# env object is reused for training below.
env.close()

# 5. Train Model

In [176]:
# TensorBoard logs are written under Training/Logs, relative to the working directory.
log_path = os.path.join('Training', 'Logs')

In [177]:
# PPO with an MLP policy on the custom env; verbose=1 prints the progress tables shown below.
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [178]:
# Train for 50,000 environment steps.
model.learn(total_timesteps=50000)

Logging to Training\Logs\PPO_15
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 13.8     |
|    ep_rew_mean     | -31.8    |
| time/              |          |
|    fps             | 894      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 13.8        |
|    ep_rew_mean          | -26         |
| time/                   |             |
|    fps                  | 520         |
|    iterations           | 2           |
|    time_elapsed         | 7           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.012920099 |
|    clip_fraction        | 0.209       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.94       |
|    explained_variance   | 0.00463     

<stable_baselines3.ppo.ppo.PPO at 0x28c6e61c400>

# 6. Save Model

In [179]:
# Persist the trained model under the name 'PPO' in the working directory.
model.save('PPO')

In [180]:
# Evaluate the trained policy over 1000 episodes without rendering; the
# displayed tuple is presumably (mean episode reward, std of episode reward) -
# confirm against the SB3 docs.
evaluate_policy(model, env, n_eval_episodes=1000, render=False)



(104.0, 0.0)

In [208]:
# Replay the trained policy and trace every step: running score, chosen
# action and the resulting state.
episodes = 1
for episode in range(1, episodes + 1):
    obs, score, done = env.reset(), 0, False

    while not done:
        # predict returns (action, recurrent state); only the action is needed
        action = model.predict(obs)[0]
        obs, reward, done, info = env.step(action)
        score += reward
        print('Episode:{} Score:{} Action:{} State:{}'.format(episode, score, action, obs))
env.close()

Episode:1 Score:1 Action:2 State:[13  2  1  1  1  0  0  0  1  0  0  0  0]
Episode:1 Score:2 Action:3 State:[12  2  1  1  1  0  0  0  1  1  0  0  0]
Episode:1 Score:3 Action:1 State:[11  2  1  1  1  0  0  1  1  1  0  0  0]
Episode:1 Score:4 Action:1 State:[10  2  1  1  1  0  0  2  1  1  0  0  0]
Episode:1 Score:104 Action:4 State:[9 2 1 1 1 0 0 2 1 1 1 0 0]
