# 1. Import Dependencies

In [None]:
!pip install gym
!pip install stable_baselines3[extra]

In [None]:
import gym
from gym import Env
from gym.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete 
import numpy as np
import random
import os
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy

# 2. Types of Spaces

# 3. Building an Environment

In [None]:
class ScenarioStepper:
    """Transition and reward logic shared by the scenario environments.

    The state is a mutable 13-slot sequence that is updated in place:

    * slot 0      - countdown, decremented once per step (the environments in
                    this file initialise it with the missile count)
    * slots 1-6   - expected damage for targets 1-6
    * slots 7-12  - damage applied so far to targets 1-6
    """

    def step(self, state, action):
        """Apply ``action`` to ``state`` in place and score the outcome.

        Args:
            state: 13-slot mutable sequence laid out as described on the class.
            action: 0 for "do nothing", 1-6 to fire at the matching target.
                Any other value is treated like "do nothing".

        Returns:
            ``(state, reward, done, info)`` where ``state`` is the same object
            that was passed in, ``reward`` is 100 when every target's applied
            damage equals its expected damage, otherwise -5 for a shot that
            overshoots its target, 1 for any other shot and 0 for no shot;
            ``done`` is True when the scenario is solved or the countdown in
            slot 0 has reached zero; ``info`` is an empty dict.
        """
        target_count = 6
        first_target_slot = 1

        # Score the action on its own merits first.
        reward = 0
        if 0 < action <= target_count:
            wanted_slot = first_target_slot + (action - 1)
            applied_slot = wanted_slot + target_count
            state[applied_slot] += 1
            # A hit beyond the expected damage level is penalised.
            reward = -5 if state[applied_slot] > state[wanted_slot] else 1

        # Solving the whole scenario overrides the per-action reward.
        solved = all(
            state[slot + target_count] == state[slot]
            for slot in range(1, target_count + 1)
        )

        done = False
        if solved:
            reward = 100
            done = True

        # Every step, including "do nothing", consumes one unit of slot 0.
        state[0] -= 1
        if state[0] <= 0:
            done = True

        # No diagnostics are reported yet.
        return state, reward, done, {}

class ScenarioEnv(Env):
    """Gym environment that replays one fixed, caller-specified scenario.

    Observation layout (13 integers):

    * slot 0      - missiles (steps) remaining
    * slots 1-6   - expected damage per target
                    (0 - Untouched, 1 - Disabled, 2 - Destroyed)
    * slots 7-12  - hits applied so far per target

    Args:
        numberOfMissles: starting value of slot 0; it is decremented on every
            step and the episode ends when it reaches zero.
        tD1, tD2, tD3, tD4, tD5, tD6: expected damage for targets 1-6.
    """

    def __init__(self, numberOfMissles, tD1, tD2, tD3, tD4, tD5, tD6):
        # Actions we can take: 0 - Do Nothing, 1..6 - Launch at target 1..6
        self.action_space = Discrete(7)
        # Store the scenario so reset() can rebuild the start state
        self.numberOfMissles = numberOfMissles
        self.tD1 = tD1
        self.tD2 = tD2
        self.tD3 = tD3
        self.tD4 = tD4
        self.tD5 = tD5
        self.tD6 = tD6
        # An episode lasts up to numberOfMissles steps and every step may hit
        # the same target, so a hit counter can reach numberOfMissles. The
        # previous fixed size of 14 (values 0..13) was too small for e.g. a
        # 14-missile scenario. 16 is the floor because the training scenarios
        # in this file draw up to 15 missiles.
        hit_levels = max(16, int(numberOfMissles) + 1)
        self.observation_space = MultiDiscrete([100] + [3] * 6 + [hit_levels] * 6)
        # Set start state
        self.state = self._initial_state()
        self.stepper = ScenarioStepper()

    def _initial_state(self):
        """Build the start observation: stored scenario plus zeroed hit counters."""
        return np.array([
            self.numberOfMissles,
            self.tD1, self.tD2, self.tD3, self.tD4, self.tD5, self.tD6,
            0, 0, 0, 0, 0, 0,
        ])

    def step(self, action):
        """Advance one step; returns ``(state, reward, done, info)``."""
        # The stepper mutates self.state in place and returns it.
        return self.stepper.step(self.state, action)

    def render(self):
        """No visualisation is implemented."""
        pass

    def reset(self):
        """Restore the start state of the stored scenario and return it."""
        self.state = self._initial_state()
        return self.state

class TrainingScenarioEnv(Env):
    """Gym environment that draws a fresh random scenario on every reset.

    Observation layout (13 integers):

    * slot 0      - missiles (steps) remaining, drawn from 1..15
    * slots 1-6   - expected damage per target, drawn from 0..2
                    (0 - Untouched, 1 - Disabled, 2 - Destroyed)
    * slots 7-12  - hits applied so far per target, starting at 0
    """

    def __init__(self):
        # Actions we can take: 0 - Do Nothing, 1..6 - Launch at target 1..6
        self.action_space = Discrete(7)
        # An episode lasts up to 15 steps and every step may hit the same
        # target, so a hit counter can reach 15. The counters therefore need
        # 16 levels (0..15); the previous size of 14 let observations leave
        # the declared space.
        self.observation_space = MultiDiscrete([100] + [3] * 6 + [16] * 6)
        # Set start state
        self.state = self._random_state()
        self.stepper = ScenarioStepper()

    def _random_state(self):
        """Draw a random scenario with all hit counters at zero."""
        # Draw order (missiles first, then targets 1-6) matches the original
        # inline construction, so seeded runs produce the same scenarios.
        missiles = random.randint(1, 15)
        expected_damage = [random.randint(0, 2) for _ in range(6)]
        return np.array([missiles] + expected_damage + [0] * 6)

    def step(self, action):
        """Advance one step; returns ``(state, reward, done, info)``."""
        # The stepper mutates self.state in place and returns it.
        return self.stepper.step(self.state, action)

    def render(self):
        """No visualisation is implemented."""
        pass

    def reset(self):
        """Replace the state with a newly drawn random scenario and return it."""
        self.state = self._random_state()
        return self.state


In [None]:
# Create the randomised training environment used by the cells below.
env=TrainingScenarioEnv()

In [None]:
# Quick manual sanity checks on the environment.
env.observation_space
# A freshly reset observation should lie inside the declared observation space.
env.observation_space.contains(env.reset())
# Take one step with action 5 (fire at the fifth target); the returned
# (state, reward, done, info) tuple is this cell's output.
env.step(5)

In [None]:
# Draw a new random scenario; the initial observation is this cell's output.
env.reset()

In [None]:
from stable_baselines3.common.env_checker import check_env

In [None]:
# Run Stable-Baselines3's environment checker on the custom env.
# NOTE(review): warn=True is expected to also emit non-fatal warnings - confirm
# against the SB3 version in use.
check_env(env, warn=True)

# 4. Test Environment

In [None]:
# Smoke-test the environment with a random policy for a few episodes.
episodes = 5
for episode in range(1, episodes + 1):
    state = env.reset()
    done, score = False, 0

    # Sample uniformly from the action space until the episode terminates,
    # accumulating the per-step reward.
    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score = score + reward

    print('Episode:{} Score:{}'.format(episode, score))

env.close()

In [None]:
# Close the environment. TrainingScenarioEnv does not override close(), so
# this calls the method inherited from Env.
env.close()

# 5. Train Model

In [None]:
# Relative directory (Training/Logs, joined with the platform separator) that
# is passed to PPO as its TensorBoard log location.
log_path = os.path.join('Training', 'Logs')

In [None]:
# Build a PPO agent with the "MlpPolicy" policy on the training environment,
# with verbose output and TensorBoard logging under log_path.
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_path)

In [None]:
# Train the agent for 100000 environment timesteps.
model.learn(total_timesteps=100000)

# 6. Save Model

In [None]:
# Persist the trained model under the name 'PPO' in the working directory.
# NOTE(review): SB3 is expected to write this as PPO.zip - confirm.
model.save('PPO')

In [None]:
# Evaluate the trained policy over 1000 episodes on the training environment,
# without rendering; the result is this cell's output.
evaluate_policy(model, env, n_eval_episodes=1000, render=False)

In [None]:
# Roll the trained policy out on one freshly drawn random scenario and print
# the running score, chosen action and resulting state after every step.
randomSampleEnv = TrainingScenarioEnv()
randomSampleEnvObs = randomSampleEnv.reset()
done = False
score = 0
# This cell runs exactly one episode. Bind the label here instead of relying
# on the loop variable leaked from the random-policy test cell, which printed
# a stale value and raised NameError if that cell had not been run.
episode = 1

while not done:
    action, _ = model.predict(randomSampleEnvObs)
    randomSampleEnvObs, reward, done, info = randomSampleEnv.step(action)
    score += reward
    print('Episode:{} Score:{} Action:{} State:{}'.format(episode, score, action, randomSampleEnvObs))
randomSampleEnv.close()

In [None]:
# Roll the trained policy out on a fixed scenario (14 missiles; expected
# damage 2, 1, 1, 1, 0, 0 for targets 1-6) and print the running score,
# chosen action and resulting state after every step.
sampleEnv = ScenarioEnv(14, 2, 1, 1, 1, 0, 0)
sampleEnvObs = sampleEnv.reset()
done = False
score = 0
# This cell runs exactly one episode. Bind the label here instead of relying
# on the loop variable leaked from the random-policy test cell, which printed
# a stale value and raised NameError if that cell had not been run.
episode = 1

while not done:
    action, _ = model.predict(sampleEnvObs)
    sampleEnvObs, reward, done, info = sampleEnv.step(action)
    score += reward
    print('Episode:{} Score:{} Action:{} State:{}'.format(episode, score, action, sampleEnvObs))
sampleEnv.close()