1. Import Dependencies

In [230]:
!pip install gym
!pip install stable_baselines3












In [231]:
import gym
from gym import Env
from gym.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete 
import numpy as np
import random
import os
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_checker import check_env

# Building an Environment

In [232]:
class ScenarioManager:
    def getActionSpace(self):
        return Discrete(7)
    def getObservationSpace(self):
        return Dict({"missles": Discrete(100), "expectedShipDamage": MultiDiscrete([3,3,3,3,3,3]), "currentShipDamage": MultiDiscrete([3,3,3,3,3,3])})
    def getRandomizedState(self):
        return {"missles": random.randint(1, 99),
         "expectedShipDamage": np.array([random.randint(0, 2), random.randint(0, 2), random.randint(0, 2), random.randint(0, 2), random.randint(0, 2), random.randint(0, 2)]), 
         "currentShipDamage": np.array([0,0,0,0,0,0])}
    def getState(self, numberOfMissles, tD1, tD2, tD3, tD4, tD5, tD6):
        return {"missles": numberOfMissles,
         "expectedShipDamage": np.array([tD1, tD2, tD3, tD4, tD5, tD6]), 
         "currentShipDamage": np.array([0,0,0,0,0,0])}
    def step(self, state, action):
        reward = 0
        done = False
        numberOfTargets = 6
        targetsOffset = 1
        # Apply action
        if action == 0:
            # Should doing nothing have a penalty?
            reward = -10
        elif action > 0 and action <= numberOfTargets:
            # Should we reward here or after hit or even after checking against expected damage?
            shipIndex = action - targetsOffset

            # Reducing reward because overhitting target.
            # Might be a place to improve on once we look into defensive ships and attacking jets
            if state["currentShipDamage"][shipIndex] >= state["expectedShipDamage"][shipIndex]:
                reward = -50
            else:
                reward = 10
            
            roll = random.randint(0, 100)
            if roll <= 60:
                # Should we reward more here?
                state["currentShipDamage"][shipIndex] = max(1, state["expectedShipDamage"][shipIndex])
            
                roll = random.randint(0, 100)
                if roll <= 50:
                    # Should we reward more here?
                    state["currentShipDamage"][shipIndex] = max(2, state["expectedShipDamage"][shipIndex])
            
        else:
            # Should doing nothing have a penalty?
            reward = -10
        
        shouldReward = True
        for shipIndex in range(0, numberOfTargets):
            if state["currentShipDamage"][shipIndex] < state["expectedShipDamage"][shipIndex]:
                shouldReward = False
                break
        
        if shouldReward:
            reward = 100
            done = True

        # Reduce shower length by 1 second
        state["missles"] -= 1

        # Check if shower is done
        if state["missles"] <= 0:
            done = True

        # Set placeholder for info
        info = {}

        # Return step information
        return state, reward, done, info

class ScenarioEnv(Env):
    def __init__(self, numberOfMissles, tD1, tD2, tD3, tD4, tD5, tD6):
        manager = ScenarioManager()
        self.manager = manager
        # Actions we can take: 0 - Do Nothing, 1 - Launch
        self.action_space = manager.getActionSpace()
        # Target Damage state array: 0 - Untouched, 1 - Disabled, 2 - Destroyed
        self.observation_space = manager.getObservationSpace()
        # store initial state
        self.numberOfMissles = numberOfMissles
        self.tD1 = tD1
        self.tD2 = tD2
        self.tD3 = tD3
        self.tD4 = tD4
        self.tD5 = tD5
        self.tD6 = tD6
        # Set start state
        self.state = self.manager.getState(self.numberOfMissles, self.tD1, self.tD2, self.tD3, self.tD4, self.tD5, self.tD6)

    def step(self, action):
        # Return step information
        return self.manager.step(self.state, action)

    def render(self):
        # Implement viz
        pass

    def reset(self):
        # Reset shower temperature
        self.state = self.manager.getState(self.numberOfMissles, self.tD1, self.tD2, self.tD3, self.tD4, self.tD5, self.tD6)
        return self.state

class TrainingScenarioEnv(Env):
    def __init__(self):
        manager = ScenarioManager()
        self.manager = manager
        # Actions we can take: 0 - Do Nothing, 1 - Launch
        self.action_space = manager.getActionSpace()
        # Target Damage state array: 0 - Untouched, 1 - Disabled, 2 - Destroyed
        self.observation_space = manager.getObservationSpace()
        # Set start state
        self.state = manager.getRandomizedState()

    def step(self, action):
        # Return step information
        return self.manager.step(self.state, action)

    def render(self):
        # Implement viz
        pass

    def reset(self):
        # Reset shower temperature
        self.state = self.manager.getRandomizedState()
        return self.state


Check Training Environment

In [233]:
trainingEnv=TrainingScenarioEnv()
check_env(trainingEnv, warn=True)

In [234]:
manager = ScenarioManager()
manager.step(trainingEnv.state, 0)

({'missles': 26,
  'expectedShipDamage': array([1, 0, 1, 0, 2, 2]),
  'currentShipDamage': array([0, 0, 0, 0, 0, 2])},
 -10,
 False,
 {})

Check Scenario Environment

In [235]:
scenarioEnv = ScenarioEnv(14, 2, 1, 1, 1, 0, 0)
check_env(scenarioEnv, warn=True)

# Train Model

In [236]:
log_path = os.path.join('Training', 'Logs')

In [237]:
trainingEnv.reset()
model = PPO("MultiInputPolicy", trainingEnv, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [238]:
model.learn(total_timesteps=200000)

Logging to Training\Logs\PPO_20
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 20.1     |
|    ep_rew_mean     | -470     |
| time/              |          |
|    fps             | 2186     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 18.6         |
|    ep_rew_mean          | -405         |
| time/                   |              |
|    fps                  | 1527         |
|    iterations           | 2            |
|    time_elapsed         | 2            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0022070825 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.95        |
|    explained_variance   

<stable_baselines3.ppo.ppo.PPO at 0x28d865ff6d0>

# Save Model

In [239]:
model.save('PPO')

# Evaluate Model

In [240]:
trainingEnv.reset()
evaluate_policy(model, trainingEnv, n_eval_episodes=1000, render=False)

(149.18, 37.94374256712166)

In [241]:
randomSampleEnv = TrainingScenarioEnv()
randomSampleEnvObs = randomSampleEnv.reset()
done = False
score = 0 

while not done:
    action, _ = model.predict(randomSampleEnvObs)
    randomSampleEnvObs, reward, done, info = randomSampleEnv.step(action)
    score+=reward
    print('Score:{} Action:{} State:{}'.format(score, action, randomSampleEnvObs))
randomSampleEnv.close()

Score:10 Action:1 State:{'missles': 6, 'expectedShipDamage': array([2, 0, 0, 1, 2, 0]), 'currentShipDamage': array([0, 0, 0, 0, 0, 0])}
Score:20 Action:1 State:{'missles': 5, 'expectedShipDamage': array([2, 0, 0, 1, 2, 0]), 'currentShipDamage': array([0, 0, 0, 0, 0, 0])}
Score:30 Action:4 State:{'missles': 4, 'expectedShipDamage': array([2, 0, 0, 1, 2, 0]), 'currentShipDamage': array([0, 0, 0, 0, 0, 0])}
Score:40 Action:4 State:{'missles': 3, 'expectedShipDamage': array([2, 0, 0, 1, 2, 0]), 'currentShipDamage': array([0, 0, 0, 0, 0, 0])}
Score:50 Action:4 State:{'missles': 2, 'expectedShipDamage': array([2, 0, 0, 1, 2, 0]), 'currentShipDamage': array([0, 0, 0, 0, 0, 0])}
Score:60 Action:1 State:{'missles': 1, 'expectedShipDamage': array([2, 0, 0, 1, 2, 0]), 'currentShipDamage': array([0, 0, 0, 0, 0, 0])}
Score:70 Action:4 State:{'missles': 0, 'expectedShipDamage': array([2, 0, 0, 1, 2, 0]), 'currentShipDamage': array([0, 0, 0, 2, 0, 0])}


In [249]:
scenarioEnv.reset()
evaluate_policy(model, scenarioEnv, n_eval_episodes=1000, render=False)

(155.38, 19.643207477395336)

In [246]:
sampleEnvObs = scenarioEnv.reset()
done = False
score = 0 

while not done:
    action, _ = model.predict(sampleEnvObs)
    sampleEnvObs, reward, done, info = scenarioEnv.step(action)
    score+=reward
    print('Score:{} Action:{} State:{}'.format(score, action, sampleEnvObs))
scenarioEnv.close()

Score:10 Action:2 State:{'missles': 13, 'expectedShipDamage': array([2, 1, 1, 1, 0, 0]), 'currentShipDamage': array([0, 0, 0, 0, 0, 0])}
Score:20 Action:1 State:{'missles': 12, 'expectedShipDamage': array([2, 1, 1, 1, 0, 0]), 'currentShipDamage': array([0, 0, 0, 0, 0, 0])}
Score:30 Action:1 State:{'missles': 11, 'expectedShipDamage': array([2, 1, 1, 1, 0, 0]), 'currentShipDamage': array([2, 0, 0, 0, 0, 0])}
Score:40 Action:2 State:{'missles': 10, 'expectedShipDamage': array([2, 1, 1, 1, 0, 0]), 'currentShipDamage': array([2, 2, 0, 0, 0, 0])}
Score:50 Action:4 State:{'missles': 9, 'expectedShipDamage': array([2, 1, 1, 1, 0, 0]), 'currentShipDamage': array([2, 2, 0, 1, 0, 0])}
Score:150 Action:3 State:{'missles': 8, 'expectedShipDamage': array([2, 1, 1, 1, 0, 0]), 'currentShipDamage': array([2, 2, 1, 1, 0, 0])}
