In [1]:
import gymnasium as gym
import vizdoom
import numpy as np
import cv2
import stable_baselines3 as sb3
import matplotlib.pyplot as plt
import os
import time

import sb3_contrib

import optuna
import torch

  from .autonotebook import tqdm as notebook_tqdm


SETTINGS

In [2]:
DEFAULT_SCENARIO_PATH = r"vizdoom\scenarios\defend_the_center_custom.cfg"

# Height and width of the resized image
# IMAGE_SHAPE = (60, 80)

# Training parameters
TRAINING_TIMESTEPS = int(100e3)
N_STEPS = 128
N_ENVS = 1
FRAME_SKIP = 4

CHECKPOINT_DIR = './train/train_center'
LOG_DIR = './logs/log_center'

In [3]:
class VizDoomEnv(gym.Env):
    def __init__(self, config_path, render=False):
        super().__init__()

        # Setup the Doom game environment
        self.game = vizdoom.DoomGame()
        self.game.load_config(config_path)
        self.num_actions = 3  # Number of actions in the game (depends on scenario)

        if render:
            self.game.set_window_visible(True)
        else:
            self.game.set_window_visible(False)
        
        self.game.init()

        # Create action space and observation space
        self.action_space = gym.spaces.Discrete(self.num_actions)
        self.observation_space = gym.spaces.Box(
            low=0,
            high=255,
            shape=(100, 160, 1),
            dtype=np.uint8
        )

        # Initialize previous values for shaping reward 
        self.ammo2_available_per_episode = self.game.get_state().game_variables[0]
        self.previous_ammo2 = self.ammo2_available_per_episode

    def step(self, action):

        actions = np.eye(self.num_actions, dtype=np.uint8)
        game_reward = self.game.make_action(actions[action], FRAME_SKIP)  # FRAME_SKIP ticks per action
        terminated = self.game.is_episode_finished()

        state = self.game.get_state()
        if state is not None:
            observation = self.simplify_observation(state.screen_buffer)
            ammo2, health = state.game_variables

            # Calculate deltas for shaping reward
            delta_ammo2 = ammo2 - self.previous_ammo2

            # Update previous values
            self.previous_ammo2 = ammo2

            # Calculate the shaped reward
            shaped_reward = (
                1 * game_reward
                + 0.5 * delta_ammo2
            )

            # print(shaped_reward)

            # print(f"Shaped_reward: {shaped_reward}, Killcount: {killcount}, Hits Taken: {hits_taken}, Selected Weapon Ammo: {selected_weapon_ammo}")

            info = {"info": 0}
        else:
            observation = np.zeros(self.observation_space.shape, dtype=np.uint8)
            shaped_reward = 0
            info = {"info": 0}

        return observation, shaped_reward, terminated, False, info
    
    def reset(self, seed=None):
        self.game.new_episode()
        state = self.game.get_state()
        observation = self.simplify_observation(state.screen_buffer)
        # health, killcount, hits_taken, selected_weapon_ammo = state.game_variables
        info = {"info": 0}

        self.selected_weapon_ammo = self.ammo2_available_per_episode

        return observation, info
    
    def close(self):
        self.game.close()

    def simplify_observation(self, observation):
        # # Convert the observation to grayscale and resize it
        gray_observation = cv2.cvtColor(np.moveaxis(observation,0,-1), cv2.COLOR_BGR2GRAY)
        cropped_obervation = gray_observation[:100, :]
        resized_observation = cv2.resize(cropped_obervation, None, fx=1, fy=1, interpolation=cv2.INTER_AREA)
        simplified_observation = np.expand_dims(resized_observation, axis=-1)  # Add channel dimension
        
        # COLOR
        # cropped_obervation = np.moveaxis(observation,0,-1)[:100, :, :]
        # simplified_observation = cropped_obervation

        return simplified_observation



In [4]:
%matplotlib qt
# %matplotlib inline
env = VizDoomEnv(DEFAULT_SCENARIO_PATH, render=True)
observation = env.reset()[0]
plt.imshow(cv2.cvtColor(observation, cv2.COLOR_BGR2RGB))
env.close()

In [5]:
from stable_baselines3.common import env_checker
# Check validity of the environment
env = VizDoomEnv(DEFAULT_SCENARIO_PATH)
sb3.common.env_checker.check_env(env)
env.close()

Callback (saving)

In [6]:
log_name = 'RPPO_blogAtariSettings_BW_V5'

In [7]:
class TrainAndLoggingCallback(sb3.common.callbacks.BaseCallback):

    def __init__(self, check_freq, save_path, log_name="unkown", verbose=1):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path
        self.log_name = log_name

    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, '{}_best_model_{}'.format(self.log_name, self.n_calls))
            self.model.save(model_path)

        return True
    
callback = TrainAndLoggingCallback(check_freq=10000, save_path=CHECKPOINT_DIR, log_name=log_name)

Train Model

In [8]:
# Non rendered environment
envs = sb3.common.env_util.make_vec_env(VizDoomEnv, n_envs=N_ENVS, env_kwargs = {'config_path': DEFAULT_SCENARIO_PATH})

In [9]:
# New fance pancy agent?
lr_fun = lambda f: f * 2.5e-4
lr_fun = lambda f: 2.5e-4 * np.exp(4*(f - 1))
cr_fun = lambda f: f * 0.1
# Batch size seems important
agent = sb3_contrib.RecurrentPPO('CnnLstmPolicy', envs, verbose=1, tensorboard_log=LOG_DIR, n_steps=128, batch_size=12, gae_lambda=0.95, gamma=0.99, n_epochs=4, ent_coef=0.01, learning_rate=lr_fun, clip_range=cr_fun)

Using cuda device
Wrapping the env in a VecTransposeImage.


In [11]:
# Reload model from disc
agent = sb3_contrib.RecurrentPPO.load(r'.\train\train_center\RPPO_blogAtariSettings_BW_V5.zip', env=envs)

Wrapping the env in a VecTransposeImage.


In [None]:
# Model initialization
# model = PPO("CnnPolicy", env, verbose=1, tensorboard_log=LOG_DIR, learning_rate=1e-5, n_steps=2048)
# model = PPO('CnnPolicy', env, verbose=1, tensorboard_log=LOG_DIR, learning_rate=1e-4, n_steps=2048*4)
# agent = sb3.PPO("CnnPolicy", envs, verbose=1, tensorboard_log=LOG_DIR, learning_rate=1e-3, n_steps=N_STEPS)
# model.learning_rate


In [10]:
torch.set_num_threads(1)
agent.learn(total_timesteps=TRAINING_TIMESTEPS, callback=callback, tb_log_name=log_name)
agent.save(os.path.join(CHECKPOINT_DIR, log_name))

Logging to ./logs/log_center\RPPO_blogAtariSettings_BW_V5_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 93       |
|    ep_rew_mean     | -8       |
| time/              |          |
|    fps             | 78       |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 128      |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | -3.25        |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 2            |
|    time_elapsed         | 4            |
|    total_timesteps      | 256          |
| train/                  |              |
|    approx_kl            | 0.0035013452 |
|    clip_fraction        | 0.245        |
|    clip_range           | 0.0999       |
|    entropy_loss         | -1.1         

In [None]:
agent.save(os.path.join(CHECKPOINT_DIR, log_name))

In [None]:
envs.close()

In [None]:
DEFAULT_HYPERPARAMS = {
    "policy": "CnnLstmPolicy",
}

In [None]:
from typing import Dict, Any
def sampleRPPO_params(trial: optuna.Trial) -> Dict[str, Any]:
    """Sampler for RPPO hyperparameters."""
    learning_rate = trial.suggest_float("lr", 1e-5, 1e-3, log=True)
    n_steps = 2 ** trial.suggest_int("exponent_n_steps", 4, 12, log=True)
    gamma = 1 - trial.suggest_float("gamma", 0.0001, 0.1, log=True)
    gae_lambda = 1 - trial.suggest_float("gae_lambda", 0.001, 0.2, log=True)
    clip_range = trial.suggest_float("clip_range", 0.1, 0.3, log=True)
    ent_coef = trial.suggest_float("ent_coef", 0.00000001, 0.1, log=True)
    max_grad_norm = trial.suggest_float("max_grad_norm", 0.3, 5.0, log=True)

    trial.set_user_attr("learning_rate", learning_rate)
    trial.set_user_attr("n_steps", n_steps)
    trial.set_user_attr("gamma", gamma)
    trial.set_user_attr("gae_lambda", gae_lambda)
    trial.set_user_attr("clip_range", clip_range)
    trial.set_user_attr("ent_coef", ent_coef)
    trial.set_user_attr("max_grad_norm", max_grad_norm)

    return {
        "learning_rate": learning_rate,
        "n_steps": n_steps,
        "gamma": gamma,
        "gae_lambda": gae_lambda,
        "clip_range": clip_range,
        "ent_coef": ent_coef,
        "max_grad_norm": max_grad_norm,
    }

class TrailEvalCallback(sb3.common.callbacks.EvalCallback):
    """Callback for training and evaluation of the agent."""

    def __init__(self, eval_env, trail, n_eval_episodes = 5, eval_freq=10000, deterministic = True, verbose=0):
        super().__init__(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        self.trail = trail
        self.eval_idx = 0
        self.is_pruned = False
        
    def _on_step(self) -> bool:
        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            super()._on_step()
            self.eval_idx += 1
            self.trail.report(self.last_mean_reward, self.eval_idx)
            if self.trail.should_prune():
                self.is_pruned = True
                return False
        return True
    
def objective(trial:optuna.Trial) -> float:
    kwargs = DEFAULT_HYPERPARAMS.copy()
    kwargs.update(sampleRPPO_params(trial))
    model = sb3_contrib.RecurrentPPO(**kwargs, env=VizDoomEnv(DEFAULT_SCENARIO_PATH, render=False))
    eval_env = VizDoomEnv(DEFAULT_SCENARIO_PATH, render=True)
    eval_callback = TrailEvalCallback(eval_env, trial, n_eval_episodes=5, eval_freq=10000, deterministic=True)
    nan_encountered = False
    try:
        model.learn(total_timesteps=TRAINING_TIMESTEPS, callback=eval_callback, tb_log_name="RPPO_optuna")
    except AssertionError as e:
        print(e)
        nan_encountered = True
    finally:
        model.env.close()
        eval_env.close()

    if nan_encountered:
        return float("nan")
    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()
    
    return eval_callback.last_mean_reward


In [None]:
torch.set_num_threads(1)
sampler = optuna.samplers.TPESampler(n_startup_trials=5)
pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=1)
study = optuna.create_study(
    direction="maximize",
    sampler=sampler,
    pruner=pruner,
    study_name="RPPO_optuna",
)
try:
    study.optimize(objective, n_trials=100, timeout=600)
except KeyboardInterrupt:
    pass

print("Number of finished trials: ", len(study.trials))

print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

print("  User attrs:")
for key, value in trial.user_attrs.items():
    print("    {}: {}".format(key, value))

Test model

In [None]:
# Reload model from disc
agent = sb3.PPO.load(r'.\train\train_center\RPPO_blogAtariSettings_BW.zip')

In [None]:
env = VizDoomEnv(DEFAULT_SCENARIO_PATH, render=True)

In [None]:
mean_reward, std_reward = sb3.common.evaluation.evaluate_policy(agent, env, n_eval_episodes=10, deterministic=True)
env.close()
mean_reward

In [12]:
test_env = VizDoomEnv(DEFAULT_SCENARIO_PATH, render=True)
for episode in range(2): 
    obs = test_env.reset()[0]
    done = False
    state = None
    episode_starts = True

    total_reward = 0
    while not done: 
        action, state = agent.predict(obs, state=state, episode_start=episode_starts)
        obs, reward, done, _, info = test_env.step(action)
        episode_starts = done
        time.sleep(0.1)
        total_reward += reward
    print('Total Reward for episode {} is {}'.format(episode, total_reward))
    time.sleep(2)

test_env.close()

Total Reward for episode 0 is 7.0
Total Reward for episode 1 is 21.0


In [None]:
test_env.close()