In [1]:
import gymnasium as gym
import vizdoom
import numpy as np
import cv2
import stable_baselines3 as sb3
import matplotlib.pyplot as plt
import os
import time

import sb3_contrib

import optuna
import torch

  from .autonotebook import tqdm as notebook_tqdm


SETTINGS

In [2]:
# Scenario config used wherever no other config path is given.
DEFAULT_SCENARIO_PATH = r"vizdoom\scenarios\deadly_corridor_custom_skill1.cfg"

# Height and width of the resized image (currently disabled)
# IMAGE_SHAPE = (60, 80)

# --- Training parameters ---
TRAINING_TIMESTEPS = 100_000  # total_timesteps passed to agent.learn()
N_STEPS = 128                 # only referenced by the commented-out sb3.PPO setup
N_ENVS = 1                    # number of environments built by make_vec_env
FRAME_SKIP = 4                # game tics each action is repeated for in VizDoomEnv.step

# --- Output locations ---
CHECKPOINT_DIR = "./train/train_corridor2"
LOG_DIR = "./logs/log_corridor2"

In [16]:
class VizDoomEnv(gym.Env):
    """Gymnasium environment wrapping a ViZDoom game with a shaped reward.

    Observations are single-channel ``uint8`` images of shape (100, 160, 1):
    the screen buffer converted to grayscale and cropped to its top 100 rows.
    NOTE(review): the 160-pixel width is not enforced here; it presumably
    comes from the screen resolution in the scenario config -- confirm.

    Actions are ``Discrete(7)``; each index is sent to the game as a one-hot
    button vector and repeated for ``FRAME_SKIP`` tics.

    The reward returned by ``step`` is::

        game_reward_weight * game_reward
        + delta_selected_weapon_ammo_weight * (change in selected-weapon ammo)
        + delta_killcount_weight * (change in kill count)

    NOTE(review): ``delta_hits_taken_weight`` is defined and the hits-taken
    value is tracked, but it is not part of the reward -- confirm whether
    that is intentional.

    ``step`` unpacks ``state.game_variables`` as (health, killcount,
    hits_taken, selected_weapon_ammo), so the scenario config must expose
    exactly four game variables; their order is assumed from that unpacking.
    """

    def __init__(self, config_path, render=False):
        """Load the scenario, start the game and set up spaces and reward state.

        Args:
            config_path: Path of the ViZDoom scenario ``.cfg`` file to load.
            render: When truthy, the game window is shown.
        """
        super().__init__()

        # Setup the Doom game environment
        self.game = vizdoom.DoomGame()
        self.game.load_config(config_path)
        self.num_actions = 7  # Number of actions in the game (depends on scenario)
        self.game.set_window_visible(bool(render))
        self.game.init()

        # Row i is the one-hot button vector for action i; built once instead
        # of on every step.
        self._one_hot_actions = np.eye(self.num_actions, dtype=np.uint8)

        # Create action space and observation space
        self.action_space = gym.spaces.Discrete(self.num_actions)
        self.observation_space = gym.spaces.Box(
            low=0,
            high=255,
            shape=(100, 160, 1),
            dtype=np.uint8
        )

        # Initialize previous values for shaping reward
        self.selected_weapon_ammo_available_per_episode = self.game.get_state().game_variables[3]
        self.previous_selected_weapon_ammo = -1  # -1 means "no ammo reading yet this episode"

        self.previous_killcount = 0
        self.previous_hits_taken = 0

        # Initialize reward shaping weights
        self.game_reward_weight = 0.01
        self.delta_selected_weapon_ammo_weight = 0.25
        self.delta_hits_taken_weight = -0.5  # currently unused by step()
        self.delta_killcount_weight = 5

    def step(self, action):
        """Apply one action for ``FRAME_SKIP`` tics and return the shaped result.

        Args:
            action: Index into the one-hot action table.

        Returns:
            ``(observation, reward, terminated, truncated, info)``.
            ``truncated`` is always ``False``. When the episode has finished
            (the game returns no state), the observation is all zeros.
        """
        game_reward = self.game.make_action(self._one_hot_actions[action], FRAME_SKIP)  # FRAME_SKIP ticks per action
        terminated = self.game.is_episode_finished()
        info = {"info": 0}

        state = self.game.get_state()
        if state is None:
            # No state is available once the episode has ended. Still pass on
            # the reward produced by this final action instead of dropping it.
            observation = np.zeros(self.observation_space.shape, dtype=np.uint8)
            shaped_reward = float(self.game_reward_weight * game_reward)
            return observation, shaped_reward, terminated, False, info

        observation = self.simplify_observation(state.screen_buffer)
        _health, killcount, hits_taken, selected_weapon_ammo = state.game_variables

        # Calculate deltas for shaping reward. On the first step of an episode
        # there is no previous ammo reading, so the ammo delta is forced to 0.
        if self.previous_selected_weapon_ammo == -1:
            self.previous_selected_weapon_ammo = selected_weapon_ammo
        delta_selected_weapon_ammo = selected_weapon_ammo - self.previous_selected_weapon_ammo
        delta_killcount = killcount - self.previous_killcount

        # Update previous values
        self.previous_selected_weapon_ammo = selected_weapon_ammo
        self.previous_killcount = killcount
        self.previous_hits_taken = hits_taken

        # Calculate the shaped reward
        shaped_reward = float(
            self.game_reward_weight * game_reward
            + self.delta_selected_weapon_ammo_weight * delta_selected_weapon_ammo
            + self.delta_killcount_weight * delta_killcount
        )

        return observation, shaped_reward, terminated, False, info

    def reset(self, seed=None, options=None):
        """Start a new episode and return its first observation.

        Args:
            seed: Optional seed. It is forwarded to ``gym.Env.reset`` and, when
                given, also to the game via ``set_seed`` before the new
                episode starts.
            options: Accepted for Gymnasium API compatibility; not used.

        Returns:
            ``(observation, info)``.
        """
        super().reset(seed=seed)
        if seed is not None:
            self.game.set_seed(seed)

        self.game.new_episode()
        state = self.game.get_state()
        observation = self.simplify_observation(state.screen_buffer)
        info = {"info": 0}

        # Forget the previous episode's readings used for reward shaping.
        self.previous_selected_weapon_ammo = -1
        self.previous_killcount = 0
        self.previous_hits_taken = 0

        return observation, info

    def close(self):
        """Close the underlying Doom game."""
        self.game.close()

    def simplify_observation(self, observation):
        """Turn a raw screen buffer into the grayscale observation.

        Args:
            observation: Screen buffer; its first axis is moved to the end
                before conversion, i.e. it is treated as channel-first.
                NOTE(review): ``COLOR_BGR2GRAY`` assumes BGR channel order --
                confirm against the screen format in the scenario config.

        Returns:
            The top 100 rows of the grayscale image with a trailing channel
            axis of length 1.
        """
        # Convert the observation to grayscale and crop it
        gray_observation = cv2.cvtColor(np.moveaxis(observation, 0, -1), cv2.COLOR_BGR2GRAY)
        cropped_observation = gray_observation[:100, :]
        # Scale factors of 1 keep the size unchanged; lower fx/fy to downsample.
        resized_observation = cv2.resize(cropped_observation, None, fx=1, fy=1, interpolation=cv2.INTER_AREA)
        simplified_observation = np.expand_dims(resized_observation, axis=-1)  # Add channel dimension

        # COLOR
        # cropped_observation = np.moveaxis(observation,0,-1)[:100, :, :]
        # simplified_observation = cropped_observation

        return simplified_observation



In [4]:
%matplotlib qt
# %matplotlib inline
env = VizDoomEnv(DEFAULT_SCENARIO_PATH, render=True)
observation = env.reset()[0]
plt.imshow(cv2.cvtColor(observation, cv2.COLOR_BGR2RGB))
env.close()

In [5]:
from stable_baselines3.common import env_checker
# Check validity of the environment
env = VizDoomEnv(DEFAULT_SCENARIO_PATH)
try:
    env_checker.check_env(env)
finally:
    # Close the game even when the check raises, so no Doom instance is left running.
    env.close()

Callback (saving)

In [6]:
# Run name: used as the TensorBoard log name and as the prefix of saved checkpoints.
log_name = "RPPO_SR1_ns1024_bs12_V3"

In [7]:
class TrainAndLoggingCallback(sb3.common.callbacks.BaseCallback):
    """Callback that saves the model every ``check_freq`` callback calls.

    Checkpoints are written to
    ``<save_path>/<log_name>_best_model_<n_calls>``. When ``save_path`` is
    ``None`` no directory is created and nothing is saved.

    Args:
        check_freq: Save whenever ``n_calls`` is a multiple of this value.
        save_path: Directory for checkpoints, or ``None`` to disable saving.
        log_name: Prefix of the checkpoint file name. The default's spelling
            is kept as-is so existing file names do not change.
        verbose: Passed through to ``BaseCallback``.
    """

    def __init__(self, check_freq, save_path, log_name="unkown", verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path
        self.log_name = log_name

    def _init_callback(self):
        """Create the checkpoint directory if saving is enabled."""
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        """Save a checkpoint on every ``check_freq``-th call; never stops training."""
        # Mirror _init_callback: with no save_path there is nowhere to write,
        # so skip saving instead of failing inside os.path.join.
        if self.save_path is not None and self.n_calls % self.check_freq == 0:
            checkpoint_name = '{}_best_model_{}'.format(self.log_name, self.n_calls)
            self.model.save(os.path.join(self.save_path, checkpoint_name))

        return True
    
# Save a checkpoint every 10000 callback calls into CHECKPOINT_DIR, prefixed with the run name.
callback = TrainAndLoggingCallback(
    check_freq=10000,
    save_path=CHECKPOINT_DIR,
    log_name=log_name,
)

Train Model

In [8]:
# Non-rendered training environments (render defaults to False in VizDoomEnv)
envs = sb3.common.env_util.make_vec_env(
    VizDoomEnv,
    n_envs=N_ENVS,
    env_kwargs={'config_path': DEFAULT_SCENARIO_PATH},
)

In [9]:
# New fancy-pants agent?

# Linear alternative, superseded by the exponential schedule defined next:
# lr_fun = lambda f: f * 2.5e-4


def lr_fun(f):
    """Learning-rate schedule: 2.5e-4 at f == 1, shrinking exponentially as f drops.

    NOTE(review): f is whatever SB3 passes to schedule callables
    (presumably the remaining training progress, 1 -> 0) -- confirm.
    """
    return 2.5e-4 * np.exp(3*(f - 1))


def cr_fun(f):
    """Clip-range schedule: 0.1 at f == 1, shrinking slowly as f drops."""
    return 0.1    * np.exp(0.1*(f - 1))


# Batch size seems important
agent = sb3_contrib.RecurrentPPO(
    'CnnLstmPolicy',
    envs,
    verbose=1,
    tensorboard_log=LOG_DIR,
    n_steps=1024,
    batch_size=12,
    gae_lambda=0.95,
    gamma=0.99,
    n_epochs=10,
    ent_coef=0.01,
    learning_rate=lr_fun,
    clip_range=cr_fun,
)

Using cuda device
Wrapping the env in a VecTransposeImage.


In [25]:
# Reload a previously saved RecurrentPPO model from disk and attach the current envs.
# NOTE(review): on a fresh run this file only exists after the save cell further down has been executed.
agent = sb3_contrib.RecurrentPPO.load(
    r'.\train\train_corridor2\RPPO_SR1_ns1024_bs12_V3skill5_V2_2.zip',
    env=envs,
)

Wrapping the env in a VecTransposeImage.


In [None]:
# Model initialization
# model = PPO("CnnPolicy", env, verbose=1, tensorboard_log=LOG_DIR, learning_rate=1e-5, n_steps=2048)
# model = PPO('CnnPolicy', env, verbose=1, tensorboard_log=LOG_DIR, learning_rate=1e-4, n_steps=2048*4)
# agent = sb3.PPO("CnnPolicy", envs, verbose=1, tensorboard_log=LOG_DIR, learning_rate=1e-3, n_steps=N_STEPS)
# model.learning_rate


In [19]:
# Continue training on the skill-5 variant of the scenario.
envs = sb3.common.env_util.make_vec_env(
    VizDoomEnv,
    n_envs=N_ENVS,
    env_kwargs={'config_path': r"vizdoom\scenarios\deadly_corridor_custom_skill5.cfg"},
)
agent.set_env(envs)
agent.learning_rate = lambda f: 2.5e-4 * np.exp(3*(f - 1))
# The rate applied during training is read from agent.lr_schedule, which SB3
# builds from learning_rate at model setup. Rebuild it so the assignment
# above actually takes effect.
agent._setup_lr_schedule()

Wrapping the env in a VecTransposeImage.


In [20]:
# Restrict PyTorch to a single CPU thread for this run.
torch.set_num_threads(1)

# Train, checkpointing through `callback` and logging under the run name.
agent.learn(
    total_timesteps=TRAINING_TIMESTEPS,
    callback=callback,
    tb_log_name=log_name,
)

# Store the final model as <CHECKPOINT_DIR>/<log_name>.
agent.save(os.path.join(CHECKPOINT_DIR, log_name))

Logging to ./logs/log_corridor2\RPPO_SR1_ns1024_bs12_V3_10


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 185      |
|    ep_rew_mean     | 17.2     |
| time/              |          |
|    fps             | 89       |
|    iterations      | 1        |
|    time_elapsed    | 11       |
|    total_timesteps | 1024     |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 198        |
|    ep_rew_mean          | 24.5       |
| time/                   |            |
|    fps                  | 42         |
|    iterations           | 2          |
|    time_elapsed         | 47         |
|    total_timesteps      | 2048       |
| train/                  |            |
|    approx_kl            | 0.07193583 |
|    clip_fraction        | 0.739      |
|    clip_range           | 0.0999     |
|    entropy_loss         | -1.61      |
|    explained_variance   | 0.768      |
|    learning_rate        | 0.000242   |
|   

In [21]:
# Save the current agent under the run name with the skill-5 suffix.
agent.save(os.path.join(CHECKPOINT_DIR, f"{log_name}skill5_V2_2"))

In [18]:
# Close the vectorized training environments.
envs.close()

In [None]:
# Baseline hyperparameters; only the policy name is fixed so far.
DEFAULT_HYPERPARAMS = dict(policy="CnnLstmPolicy")

Test model

In [None]:
# Reload model from disc
# NOTE(review): this loads with sb3.PPO, while the agent trained above is an
# sb3_contrib.RecurrentPPO and the file name starts with "RPPO" -- confirm the
# checkpoint type and load it with the matching class.
# NOTE(review): this rebinds `agent`, which the evaluation and test cells below use.
agent = sb3.PPO.load(r'.\train\train_center\RPPO_blogAtariSettings_BW.zip')

In [None]:
# Rendered environment on the default scenario, used by the evaluation cell.
env = VizDoomEnv(config_path=DEFAULT_SCENARIO_PATH, render=True)

In [None]:
# Run 10 deterministic evaluation episodes, then close the game window.
mean_reward, std_reward = sb3.common.evaluation.evaluate_policy(
    agent,
    env,
    n_eval_episodes=10,
    deterministic=True,
)
env.close()

# Display the mean episode reward as the cell output.
mean_reward

In [26]:
# Watch the agent play two rendered episodes on the skill-5 scenario.
test_env = VizDoomEnv(r"vizdoom\scenarios\deadly_corridor_custom_skill5.cfg", render=True)
try:
    for episode in range(2):
        obs = test_env.reset()[0]
        done = False
        # Recurrent policy bookkeeping: no LSTM state yet, and the first
        # prediction of each episode is flagged as an episode start.
        state = None
        episode_starts = np.ones((1,), dtype=bool)

        total_reward = 0
        while not done:
            action, state = agent.predict(obs, state=state, episode_start=episode_starts)
            obs, reward, terminated, truncated, info = test_env.step(action)
            # Honour both Gymnasium end-of-episode flags.
            done = terminated or truncated
            episode_starts = np.array([done])
            time.sleep(0.1)  # slow playback down so it can be followed by eye
            total_reward += reward
        print('Total Reward for episode {} is {}'.format(episode, total_reward))
        time.sleep(2)
finally:
    # Close the game window even if prediction or stepping fails or is interrupted.
    test_env.close()

Total Reward for episode 0 is 25.6020295715332
Total Reward for episode 1 is 39.68237655639649


In [None]:
# Close the test environment (the test-loop cell above also closes it when it finishes).
test_env.close()