In [None]:
!pip install vizdoom
!pip install gym==0.21
!cd github & git clone https://github.com/Farama-Foundation/ViZDoom
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install stable-baselines3[extra]==1.6.0

In [1]:
# Import vizdoom for game env
from vizdoom import *
# Import random for action sampling
import random
# Import time for sleeping
import time
# Import numpy for identity matrix
import numpy as np
# Import environment base class from OpenAI Gym
from gym import Env
# Import gym spaces
from gym.spaces import Discrete, Box 
# Import opencv
import cv2
# Import Environment checker
from stable_baselines3.common import env_checker
# Import os for file nav
import os
# Import callback class from sb3
from stable_baselines3.common.callbacks import BaseCallback
# Import PPO for training
from stable_baselines3 import PPO
# Import eval policy to test agent
from stable_baselines3.common.evaluation import evaluate_policy

In [2]:
# Create ViZDoom OpenAI Gym environment
class VizDoomGym(Env):
    # Function that is called when whe start the env
    def __init__(self, render=False, config='github/ViZDoom/scenarios/deadly_corridor_s1.cfg' ):
        # Inherit from Env
        super().__init__()
        # Setup the game
        self.game = DoomGame()
        self.game.load_config(config)

        # Render frame Logic
        if render == False:
            self.game.set_window_visible(False)
        else:
            self.game.set_window_visible(True)
        
        # Start the game
        self.game.init()
        
        # Create the action space and observation space
        self.observation_space = Box(low=0, high=255, shape=(100,160,1), dtype=np.uint8)
        self.action_space = Discrete(7)

        # Game variables: HEALTH DAMAGE_TAKEN HITCOUNT SELECTED_WEAPON_AMMO
        self.damage_taken = 0
        self.hitcount = 0
        self.ammo = 52
        
        
    # This is how we take a step in the environment
    def step(self, action):
        # Specify action and take step 
        actions = np.identity(7)
        movement_reward = self.game.make_action(actions[action], 4)

        reward = 0
        # Get all the other stuff we need to retun 
        if self.game.get_state(): 
            state = self.game.get_state().screen_buffer
            state = self.grayscale(state)

            # Reward shaping
            game_variables = self.game.get_state().game_variables
            health, damage_taken, hitcount, ammo = game_variables

            # Calculate reward deltas
            damage_taken_delta = -damage_taken + self.damage_taken
            self.damage_taken = damage_taken
            hitcount_delta = hitcount - self.hitcount
            self.hitcount = hitcount
            ammo_delta = ammo - self.ammo
            self.ammo = ammo

            reward = movement_reward + damage_taken_delta*10 + hitcount_delta*200 + ammo_delta*5
            info = ammo
        else: 
            state = np.zeros(self.observation_space.shape)
            info = 0 
        
        info = {"info":info}
        done = self.game.is_episode_finished()
        
        return state, reward, done, info 
        
    # Define how to render the game or environment
    def render(self):
        pass
        
    # What happens when we start a new game
    def reset(self):
        self.game.new_episode()
        state = self.game.get_state().screen_buffer
        return self.grayscale(state)
        
    # Grayscale the game frame and resize it
    def grayscale(self, observation):
        gray = cv2.cvtColor(np.moveaxis(observation, 0, -1), cv2.COLOR_BGR2GRAY)
        resize = cv2.resize(gray, (160,100), interpolation=cv2.INTER_CUBIC)
        state = np.reshape(resize, (100,160,1))
        return state
        
    # Close doen the game
    def close(self):
        self.game.close()

class TrainAndLoggingCallback(BaseCallback):

    def __init__(self, check_freq, save_path, verbose=1):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)
            
    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)
                                      
        return True
        
CHECKPOINT_DIR = './train/train_deadly_corridor'
LOG_DIR = './logs/log_deadly_corridor'

# Setup model saving callback
callback = TrainAndLoggingCallback(check_freq=10000, save_path=CHECKPOINT_DIR)

In [3]:
# Non render environment
env = VizDoomGym(config='github/ViZDoom/scenarios/deadly_corridor_s1.cfg')

In [4]:
# Create the model
model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1, learning_rate=0.00001, n_steps=8192, clip_range=.1, gamma=.95, gae_lambda=.9)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


In [5]:
# Train the model
model.learn(total_timesteps=1000000, callback=callback)

Logging to ./logs/log_deadly_corridor\PPO_5
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 189      |
|    ep_rew_mean     | 119      |
| time/              |          |
|    fps             | 141      |
|    iterations      | 1        |
|    time_elapsed    | 57       |
|    total_timesteps | 8192     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 190          |
|    ep_rew_mean          | 155          |
| time/                   |              |
|    fps                  | 60           |
|    iterations           | 2            |
|    time_elapsed         | 271          |
|    total_timesteps      | 16384        |
| train/                  |              |
|    approx_kl            | 0.0027663442 |
|    clip_fraction        | 0.145        |
|    clip_range           | 0.1          |
|    entropy_loss         | -1.94        |
|    explained

KeyboardInterrupt: 

In [13]:
model.load('./train/train_deadly_corridor/best_model_190000.zip')

<stable_baselines3.ppo.ppo.PPO at 0x164b1825430>

In [14]:
# Non render environment
env = VizDoomGym(config='github/ViZDoom/scenarios/deadly_corridor_s2.cfg')
model.set_env(env)
model.learn(total_timesteps=100000, callback=callback)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
Logging to ./logs/log_deadly_corridor\PPO_8


KeyboardInterrupt: 

In [None]:
# Non render environment
env = VizDoomGym(config='github/ViZDoom/scenarios/deadly_corridor_s3.cfg')
model.set_env(env)
model.learn(total_timesteps=40000, callback=callback)

In [None]:
# Non render environment
env = VizDoomGym(config='github/ViZDoom/scenarios/deadly_corridor_s4.cfg')
model.set_env(env)
model.learn(total_timesteps=40000, callback=callback)

In [None]:
# Non render environment
env = VizDoomGym(config='github/ViZDoom/scenarios/deadly_corridor_s5.cfg')
model.set_env(env)
model.learn(total_timesteps=40000, callback=callback)

In [10]:
# Reload from disc
model = PPO.load('./train/train_deadly_corridor/best_model_190000')

In [11]:
# Create a rendered environment
env = VizDoomGym(render=True)

In [None]:
# Evaluate mean reward for 100 games
mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=100)

In [12]:
# Evaluate the total reward for 10 episodes
for episode in range(10):
    obs = env.reset()
    done = False
    total_reward = 0
    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        time.sleep(0.20)
        total_reward += reward
    print('Total reward for episode {} is {}'.format(episode, total_reward))
    time.sleep(2)

Total reward for episode 0 is -404.3245849609375
Total reward for episode 1 is 670.8368530273438
Total reward for episode 2 is 197.3585205078125
Total reward for episode 3 is 433.27430725097656
Total reward for episode 4 is 287.4143524169922
Total reward for episode 5 is 460.1216583251953
Total reward for episode 6 is 623.4118957519531
Total reward for episode 7 is 324.99627685546875


KeyboardInterrupt: 