# Setup

In [None]:
%%bash
# Install the system packages needed to build ViZDoom on Linux, as listed in
# https://github.com/mwydmuch/ViZDoom/blob/master/doc/Building.md#-linux

apt-get update

# -y (assume yes) is required here: a %%bash cell has no interactive stdin, so
# apt-get cannot receive an answer to its "Do you want to continue?" prompt.
apt-get install -y build-essential zlib1g-dev libsdl2-dev libjpeg-dev \
nasm tar libbz2-dev libgtk2.0-dev cmake git libfluidsynth-dev libgme-dev \
libopenal-dev timidity libwildmidi-dev unzip

# Boost libraries
apt-get install -y libboost-all-dev

In [2]:
!pip install vizdoom
!pip install ray 
!pip install ray['rllib']
!pip install Ipython --upgrade


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting vizdoom
  Downloading vizdoom-1.1.13.tar.gz (15.7 MB)
[K     |████████████████████████████████| 15.7 MB 1.8 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: vizdoom
  Building wheel for vizdoom (PEP 517) ... [?25l[?25hdone
  Created wheel for vizdoom: filename=vizdoom-1.1.13-cp37-cp37m-linux_x86_64.whl size=14101153 sha256=74cba8fbb0e2f22d3b16568391761ab73aa11e2343930e554ae08cc6dbb2a8a6
  Stored in directory: /root/.cache/pip/wheels/ac/37/ae/8e648023f66bb4c473701f94ce126032ff39ad9759ca0645a7
Successfully built vizdoom
Installing collected packages: vizdoom
Successfully installed vizdoom-1.1.13
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ray
  Downl

In [1]:
import sys
from google.colab import drive

# Mount Google Drive so the project checkout stored on it becomes reachable.
drive.mount('/content/drive')

# Put the repository root on the import path; the `src.*` imports in the
# following cell are resolved relative to it.
system_path = '/content/drive/MyDrive/GitHub/INM363-Project'
sys.path.append(system_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from src.vizdoom_gym.envs.VizDoomEnv import VizdoomEnv
from src.vizdoom_gym.envs.VizDoomEnv_def import VizDoomVeryDenseReward

In [3]:
from ray.tune.registry import register_env
import gym
import os
import ray
import ray.rllib.agents.ppo as ppo
import shutil
import torch

In [4]:
# Select the first CUDA device when one is visible to torch, else the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("device: ", device, "\n")

from psutil import virtual_memory

# Note: this reads the `.total` field, i.e. the machine's total memory,
# converted from bytes to (decimal) gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold this notebook uses to call a runtime "high-RAM".
high_ram = not (ram_gb < 20)
print('You are using a high-RAM runtime!' if high_ram else 'Not using a high-RAM runtime')

device:  cuda:0 

Your runtime has 13.6 gigabytes of available RAM

Not using a high-RAM runtime


# Initialize Ray

In [5]:
# Add the `src` directory itself to the import path as well; per the author's
# note this is needed for the vizdoom module to load.
system_path = '/content/drive/MyDrive/GitHub/INM363-Project/src'
sys.path.append(system_path)

# Export the repository root as PYTHONPATH; per the author's note this is
# needed to use the GPU with Ray.
os.environ.update({'PYTHONPATH': '/content/drive/MyDrive/GitHub/INM363-Project'})

# Echo the value back as the cell output.
os.environ['PYTHONPATH']

'/content/drive/MyDrive/GitHub/INM363-Project'

In [6]:

# Checkpoint directory for this experiment. Anything already stored there is
# removed first, so every run of this cell starts from an empty directory.
chkpt_root = "/content/drive/MyDrive/GitHub/INM363-Project/model_checkpoints/icm/easy_no_reward"
shutil.rmtree(chkpt_root, ignore_errors=True)

# Stop any Ray instance left over from an earlier run of this cell.
ray.shutdown()
print("Shutdown ray")

# Start Ray with 2 CPUs and 1 GPU.
# For debugging, additionally pass `local_mode=True`.
ray.init(num_cpus=2, num_gpus=1, ignore_reinit_error=True)
print("Initialized ray")

# Name under which the custom environment is registered.
select_env = "VizDoomVeryDenseReward-v0"


def _make_env(config):
    """Create the training environment; the passed-in env config is not used."""
    return VizDoomVeryDenseReward()


# To train on the base environment instead, return VizdoomEnv() from the
# creator above.
register_env(select_env, _make_env)
print("registered environment")


Shutdown ray


2022-09-19 12:30:31,130	INFO worker.py:1518 -- Started a local Ray instance.


Initialized ray
registered environment


# Training config

In [7]:
# Start from a copy of the PPO defaults and override only the settings this
# experiment changes.
config = ppo.DEFAULT_CONFIG.copy()
config.update({
    "log_level": "WARN",
    "framework": "torch",
    # Model options handed to RLlib.
    # NOTE(review): presumably observations are resized to 42x42 grayscale;
    # confirm against the RLlib model config documentation.
    "model": {
        "dim": 42,
        "grayscale": True,
    },
    "num_gpus": 1,
    "preprocessor_pref": "rllib",
    "explore": True,
})
# Not overridden in this cell (alternatives that were tried earlier):
#   "num_workers": 1
#   "batch_mode": "complete_episodes"


In [11]:

# Use the Curiosity module as the exploration class, see
# https://docs.ray.io/en/latest/rllib/rllib-algorithms.html

# Kept at 0 because of the limitation discussed in
# https://discuss.ray.io/t/scaling-curiosity-like-exploration-modules-on-multiple-workers/2267
# TODO: check why this is set to 0!
config["num_workers"] = 0

# Feature net: encodes observations into feature (latent) vectors.
feature_net_config = {
    "fcnet_hiddens": [],
    "fcnet_activation": "relu",
}

# Exploration sub-type to use underneath Curiosity (usually the algorithm's
# "default" exploration, e.g. EpsilonGreedy for DQN, StochasticSampling for
# PG/SAC).
sub_exploration = {"type": "StochasticSampling"}

config["exploration_config"] = dict(
    type="Curiosity",                   # use the Curiosity module for exploring
    eta=0.01,                           # weight of intrinsic rewards before they are added to extrinsic ones (0.001 was also tried)
    lr=0.001,                           # learning rate of the curiosity (ICM) module
    feature_dim=288,                    # dimensionality of the generated feature vectors
    feature_net_config=feature_net_config,
    inverse_net_hiddens=[256],          # hidden layers of the "inverse" model
    inverse_net_activation="relu",      # activation of the "inverse" model
    forward_net_hiddens=[256],          # hidden layers of the "forward" model
    forward_net_activation="relu",      # activation of the "forward" model
    beta=0.2,                           # weight of the "forward" loss (beta) over the "inverse" loss (1.0 - beta)
    sub_exploration=sub_exploration,
)

# Notes on vf_clip_param (the clip parameter of the value function). It is
# sensitive to the scale of the rewards: increase it if the expected V is large.
#   - 400 was used for the easy and new_dense settings, 600 for sparse settings.
#   - It is currently not overridden; earlier experiments set it to 600 and to
#     10000 (the latter was reverted to try the scaled reward setting).
#   - RLlib warnings that prompted the changes:
#       2022-08-30 17:15:25,928 WARNING ppo.py:465 -- The mean reward returned
#         from the environment is 5066.82568359375 but the vf_clip_param is set
#         to 100. Consider increasing it for policy: default_policy to improve
#         value function convergence.
#       2022-09-01 12:20:27,151 WARNING ppo.py:465 -- The mean reward returned
#         from the environment is 16827.6640625 but the vf_clip_param is set to
#         10.0. Consider increasing it for policy: default_policy to improve
#         value function convergence.

# Build the PPO trainer for the registered environment.
agent = ppo.PPOTrainer(config, env=select_env)
print("created agent")

config file: /content/drive/MyDrive/GitHub/INM363-Project/scenarios/custom/very_dense_reward.cfg
scenario file: /content/drive/MyDrive/GitHub/INM363-Project/scenarios/custom/train/easy_no_reward_rs.wad
episode timeout: 400
screen resolution: 320X240




created agent


# Training loop

In [12]:
import pandas as pd
import time 

# Column names of the per-iteration training log.
cols = [
    "checkpoint",
    "eps_reward_min",
    "eps_reward_mean",
    "eps_reward_max",
    "eps_len_mean",
    "episodes_this_iter",
]

# Empty frame with those columns; the training loop adds one row per iteration.
results_df = pd.DataFrame(columns=cols)

In [13]:
#chkpt_root = "/content/drive/MyDrive/GitHub/INM363-Project/model_checkpoints/icm/sparse"
#chkpt_file  = "/content/drive/MyDrive/GitHub/INM363-Project/model_checkpoints/icm/sparse/checkpoint_000140"
#agent.restore(chkpt_file)

In [14]:
# Train the policy with RLlib's PPO for iterations start_n .. n_iter - 1,
# saving periodic checkpoints and logging per-iteration metrics to results_df.
status = "{:2d} reward {:6.2f}/{:6.2f}/{:6.2f} len {:4.2f}"
start_n = 0
n_iter = 200
# A checkpoint is saved after the first iteration and then every
# `checkpoint_every` iterations (change this to 10 or 20).
checkpoint_every = 10

print("started training loop")
time_start = time.time()

# Metrics are buffered in a plain list and merged into results_df once.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
metric_rows = []

try:
    for n in range(start_n, n_iter):
        result = agent.train()

        if (n + 1) % checkpoint_every == 0 or n == 0:
            chkpt_file = agent.save(chkpt_root)
            print(f"Saved checkpoint {n+1} at {chkpt_file}")

        print(status.format(
            n + 1,
            result["episode_reward_min"],
            result["episode_reward_mean"],
            result["episode_reward_max"],
            result["episode_len_mean"]
        ))

        # Save the metrics of this iteration.
        metric_rows.append({
            "checkpoint": n + 1,
            "eps_reward_min": result["episode_reward_min"],
            "eps_reward_mean": result["episode_reward_mean"],
            "eps_reward_max": result["episode_reward_max"],
            "eps_len_mean": result["episode_len_mean"],
            "episodes_this_iter": result["episodes_this_iter"],
        })
finally:
    # Runs on normal completion and when the loop is interrupted (for example
    # by a KeyboardInterrupt), so the metrics gathered so far are kept.
    if metric_rows:
        new_rows = pd.DataFrame(metric_rows, columns=results_df.columns)
        if results_df.empty:
            results_df = new_rows
        else:
            results_df = pd.concat([results_df, new_rows], ignore_index=True)

# Elapsed wall-clock time, converted from seconds to minutes.
print(f"Total time elapsed: {(time.time()-time_start)/60}")

print("ending training loop")

ray.shutdown()
print("shutdown ray")

started training loop




Saved checkpoint 1 at /content/drive/MyDrive/GitHub/INM363-Project/model_checkpoints/icm/easy_no_reward/checkpoint_000001
 1 reward   0.00/  0.00/  0.00 len 95.32
 2 reward   0.00/  0.00/  0.00 len 95.35
 3 reward   0.00/  0.00/  0.00 len 90.47
 4 reward   0.00/  0.00/  0.00 len 85.78
 5 reward   0.00/  0.00/  0.00 len 87.61
 6 reward   0.00/  0.00/  0.00 len 87.47
 7 reward   0.00/  0.00/  0.00 len 84.62
 8 reward   0.00/  0.00/  0.00 len 89.53
 9 reward   0.00/  0.00/  0.00 len 92.41
Saved checkpoint 10 at /content/drive/MyDrive/GitHub/INM363-Project/model_checkpoints/icm/easy_no_reward/checkpoint_000010
10 reward   0.00/  0.00/  0.00 len 90.53
11 reward   0.00/  0.00/  0.00 len 90.61
12 reward   0.00/  0.00/  0.00 len 92.49
13 reward   0.00/  0.00/  0.00 len 93.40
14 reward   0.00/  0.00/  0.00 len 90.56
15 reward   0.00/  0.00/  0.00 len 89.61
16 reward   0.00/  0.00/  0.00 len 89.66
17 reward   0.00/  0.00/  0.00 len 92.47
18 reward   0.00/  0.00/  0.00 len 91.43
19 reward   0.00/

KeyboardInterrupt: ignored

# Save results file

In [15]:
from pathlib import Path

# The metrics file is written into the checkpoint directory.
fname = f"{chkpt_root}/result.csv"
fpath = Path(fname)

# Create the directory (and any missing parents) if it does not exist yet.
fpath.parent.mkdir(exist_ok=True, parents=True)

results_df.to_csv(fpath)
print(f"Saved results file to {fname}")


Saved results file to /content/drive/MyDrive/GitHub/INM363-Project/model_checkpoints/icm/easy_no_reward/result.csv


In [34]:
# Display the metrics dict returned by the most recent agent.train() call.
# NOTE(review): the execution count of this cell is out of sequence, so the
# output shown below may come from a different run than the training loop
# above; confirm before relying on it.
result

{'custom_metrics': {},
 'episode_media': {},
 'num_recreated_workers': 0,
 'info': {'learner': {'default_policy': {'custom_metrics': {},
    'learner_stats': {'cur_kl_coeff': 2.8832519531249994,
     'cur_lr': 5.0000000000000016e-05,
     'total_loss': 599.9217031171245,
     'policy_loss': -0.10544964313827535,
     'vf_loss': 600.0,
     'vf_explained_var': -2.484552321895476e-05,
     'kl': 0.009417383247753439,
     'entropy': 0.7577878890498992,
     'entropy_coeff': 0.0},
    'model': {}}},
  'num_env_steps_sampled': 800000,
  'num_env_steps_trained': 800000,
  'num_agent_steps_sampled': 800000,
  'num_agent_steps_trained': 800000},
 'sampler_results': {'episode_reward_max': 10.0,
  'episode_reward_min': 0.0,
  'episode_reward_mean': 1.09,
  'episode_len_mean': 185.88,
  'episode_media': {},
  'episodes_this_iter': 21,
  'policy_reward_min': {},
  'policy_reward_max': {},
  'policy_reward_mean': {},
  'custom_metrics': {},
  'hist_stats': {'episode_reward': [0.0,
    0.0,
    1.0