# Setup

In [1]:
import sys
from google.colab import drive

# Mount Google Drive so the project checkout stored there is reachable.
drive.mount('/content/drive')

# Put the project root on the import path; later cells import the `src` package from it.
system_path = '/content/drive/MyDrive/GitHub/INM363-Project'
sys.path.append(system_path)

Mounted at /content/drive


In [2]:
%%bash
# Install the system packages listed in the ViZDoom Linux build instructions:
# https://github.com/mwydmuch/ViZDoom/blob/master/doc/Building.md#-linux

# Stop at the first failing install instead of carrying on silently.
set -euo pipefail

# apt must never stop to ask for confirmation inside a notebook cell.
export DEBIAN_FRONTEND=noninteractive

# Refreshing the package lists is best-effort: a flaky mirror should not block
# the install when the cached lists are still usable.
apt-get update -qq > /dev/null \
  || echo "warning: apt-get update failed; continuing with cached package lists" >&2

# Only stdout is discarded so progress chatter stays out of the notebook;
# stderr is left connected so a failed install is visible.
apt-get install -y -qq build-essential zlib1g-dev libsdl2-dev libjpeg-dev \
  nasm tar libbz2-dev libgtk2.0-dev cmake git libfluidsynth-dev libgme-dev \
  libopenal-dev timidity libwildmidi-dev unzip > /dev/null

# Boost libraries
apt-get install -y -qq libboost-all-dev > /dev/null

In [3]:
!pip install vizdoom --quiet
!pip install ray --quiet
!pip install ray['rllib'] --quiet
!pip install Ipython --upgrade --quiet


[K     |████████████████████████████████| 15.7 MB 16.1 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
  Building wheel for vizdoom (PEP 517) ... [?25l[?25hdone
[K     |████████████████████████████████| 59.4 MB 1.2 MB/s 
[K     |████████████████████████████████| 8.8 MB 39.8 MB/s 
[K     |████████████████████████████████| 4.1 MB 56.5 MB/s 
[K     |████████████████████████████████| 468 kB 66.4 MB/s 
[K     |████████████████████████████████| 1.2 MB 14.2 MB/s 
[K     |████████████████████████████████| 626 kB 65.4 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[K     |████████████████████████████████| 125 kB 66.7 MB/s 
[?25h  Building wheel for gym (PEP 517) ... [?25l[?25hdone
[K     |████████████████████████████████| 793 kB 12.

In [4]:
from src.vizdoom_gym.envs.VizDoomEnv import VizdoomEnv
from src.vizdoom_gym.envs.VizDoomEnv_def import VizDoomVeryDenseReward

In [5]:
from ray.tune.registry import register_env
import gym
import os
import ray
import ray.rllib.agents.ppo as ppo
from ray.rllib.algorithms.callbacks import RE3UpdateCallbacks
import shutil
import torch

In [6]:
# Use the first CUDA device when one is visible to torch, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("device: ", device, "\n")

from psutil import virtual_memory

# Total memory reported by psutil, converted to decimal gigabytes (1e9 bytes).
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold this notebook uses to call a runtime "high-RAM".
print('You are using a high-RAM runtime!' if ram_gb >= 20
      else 'Not using a high-RAM runtime')

device:  cuda:0 

Your runtime has 13.6 gigabytes of available RAM

Not using a high-RAM runtime


# Initialize Ray

In [7]:
# Add the project's `src` directory to the import path
# (author's note: needed to load the vizdoom module).
system_path = "/content/drive/MyDrive/GitHub/INM363-Project/src"
sys.path.append(system_path)

# Export the project root through PYTHONPATH
# (author's note: needed to use the GPU on Ray),
# then echo the value as the cell output.
os.environ.update(PYTHONPATH="/content/drive/MyDrive/GitHub/INM363-Project")
os.environ["PYTHONPATH"]

'/content/drive/MyDrive/GitHub/INM363-Project'

In [13]:

# Remove any existing checkpoint directory for this run; a missing directory is not an error.
chkpt_root = "/content/drive/MyDrive/GitHub/INM363-Project/model_checkpoints/re3/dense_new_pattern"
shutil.rmtree(chkpt_root, ignore_errors=True)

# Shut down any Ray instance that is still running before starting a new one.
ray.shutdown()
print("Shutdown ray")

# start Ray -- add `local_mode=True` here for debugging
ray.init(num_cpus=2, num_gpus=1, ignore_reinit_error=True)
print("Initialized ray")

# register the custom environment
select_env = "VizDoomVeryDenseReward-v0"


def _make_env(env_config):
    """Create a new VizDoomVeryDenseReward instance; the passed config is ignored."""
    # Alternative environment: return VizdoomEnv()
    return VizDoomVeryDenseReward()


register_env(select_env, _make_env)
print("registered environment")


Shutdown ray


2022-09-27 17:19:12,892	INFO worker.py:1518 -- Started a local Ray instance.


Initialized ray
registered environment


# Training config

In [9]:
# Start from a copy of the PPO default config and override the entries
# this experiment changes.
config = ppo.DEFAULT_CONFIG.copy()
config.update({
    "log_level": "WARN",
    "model": {
        "dim": 42,
        "grayscale": True,
    },
    "num_gpus": 1,
    "preprocessor_pref": "rllib",
    "explore": True,
})
#config['batch_mode'] = 'complete_episodes'


In [10]:

class RE3Callbacks(RE3UpdateCallbacks, config["callbacks"]):
  """Callbacks class that combines RE3UpdateCallbacks with the callbacks class currently set in `config`."""


In [14]:
# Reference for these settings:
# https://github.com/ray-project/ray/blob/c9c3f0745a9291a4de0872bdfa69e4ffdfac3657/rllib/utils/exploration/tests/test_random_encoder.py#L35
config.update({
    "framework": "tf",
    "seed": 12345,
    "callbacks": RE3Callbacks,
    "exploration_config": {
        "type": "RE3",
        # The dimensionality of the observation embedding vectors in latent space.
        "embeds_dim": 128,
        # Beta decay factor, used for on-policy algorithms.
        "rho": 0.1,
        # Number of neighbours to use for K-NN entropy estimation.
        "k_nn": 7,
        # Configuration for the encoder network, producing embedding vectors from
        # observations. This can be used to configure fcnet- or conv_net setups to
        # properly process any observation space. By default uses the Policy model
        # configuration.
        "encoder_net_config": {
            "fcnet_hiddens": [],
            "fcnet_activation": "relu",
        },
        # Trade-off between exploration and exploitation. A higher beta gives more
        # weight to the intrinsic reward, as per
        # `reward = r + beta * intrinsic_reward`.
        "beta": 0.2,
        # Schedule to use for beta decay, one of "constant" or "linear_decay".
        "beta_schedule": 'constant',
        # Which exploration sub-type to use (usually the algo's "default"
        # exploration, e.g. EpsilonGreedy for DQN, StochasticSampling for PG/SAC).
        "sub_exploration": {
            "type": "StochasticSampling",
        },
    },
})

agent = ppo.PPOTrainer(config, env=select_env)

print("created agent")

[2m[36m(RolloutWorker pid=9116)[0m config file: /content/drive/MyDrive/GitHub/INM363-Project/scenarios/custom/very_dense_reward.cfg
[2m[36m(RolloutWorker pid=9116)[0m scenario file: /content/drive/MyDrive/GitHub/INM363-Project/scenarios/custom/train/dense_new_pattern_rs.wad
[2m[36m(RolloutWorker pid=9116)[0m episode timeout: 400
[2m[36m(RolloutWorker pid=9116)[0m screen resolution: 320X240
[2m[36m(RolloutWorker pid=9115)[0m config file: /content/drive/MyDrive/GitHub/INM363-Project/scenarios/custom/very_dense_reward.cfg
[2m[36m(RolloutWorker pid=9115)[0m scenario file: /content/drive/MyDrive/GitHub/INM363-Project/scenarios/custom/train/dense_new_pattern_rs.wad
[2m[36m(RolloutWorker pid=9115)[0m episode timeout: 400
[2m[36m(RolloutWorker pid=9115)[0m screen resolution: 320X240


[2m[36m(RolloutWorker pid=9116)[0m   "Function `env.seed(seed)` is marked as deprecated and will be removed in the future. "
[2m[36m(RolloutWorker pid=9115)[0m   "Function `env.seed(seed)` is marked as deprecated and will be removed in the future. "
2022-09-27 17:19:38,472	INFO trainable.py:164 -- Trainable.setup took 22.667 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


created agent


# Training loop

In [15]:
import time

import pandas as pd

# Column layout of the training-metrics table.
cols = [
    "checkpoint",
    "eps_reward_min",
    "eps_reward_mean",
    "eps_reward_max",
    "eps_len_mean",
    "episodes_this_iter",
]

# Empty table with those columns; it is filled in by the training loop.
results_df = pd.DataFrame(columns=cols)

In [16]:
# Load the weights saved at iteration 800 of the `no_reward` run into the agent.
chkpt_root = "/content/drive/MyDrive/GitHub/INM363-Project/model_checkpoints/re3/no_reward"
chkpt_file = f"{chkpt_root}/checkpoint_000800"
agent.restore(chkpt_file)

2022-09-27 17:19:50,408	INFO trainable.py:669 -- Restored on 172.28.0.2 from checkpoint: /content/drive/MyDrive/GitHub/INM363-Project/model_checkpoints/re3/no_reward/checkpoint_000800
2022-09-27 17:19:50,414	INFO trainable.py:677 -- Current state after restoring: {'_iteration': 800, '_timesteps_total': None, '_time_total': 41944.859322309494, '_episodes_total': 9327}


In [18]:
# Per-iteration log line: iteration number, min/mean/max episode reward, mean episode length.
status = "{:2d} reward {:6.2f}/{:6.2f}/{:6.2f} len {:4.2f}"
start_n = 0
n_iter = 300
# A checkpoint is written on the first iteration and then every `checkpoint_every` iterations.
checkpoint_every = 20

print("started training loop")
time_start = time.time()

chkpt_root = "/content/drive/MyDrive/GitHub/INM363-Project/model_checkpoints/re3/dense_new_pattern"

# Metrics are gathered in a plain list and turned into a DataFrame once.
# Appending to the DataFrame on every iteration copies the whole frame each
# time, and `DataFrame.append` is no longer available in current pandas.
metric_rows = []

# train a policy with RLlib using PPO
try:
    for n in range(start_n, n_iter):
        result = agent.train()

        if (n + 1) % checkpoint_every == 0 or n == 0:
            chkpt_file = agent.save(chkpt_root)
            print(f"Saved checkpoint {n+1} at {chkpt_file}")

        print(status.format(
            n + 1,
            result["episode_reward_min"],
            result["episode_reward_mean"],
            result["episode_reward_max"],
            result["episode_len_mean"]
        ))

        # save metrics
        metric_rows.append({
            "checkpoint": n + 1,
            "eps_reward_min": result["episode_reward_min"],
            "eps_reward_mean": result["episode_reward_mean"],
            "eps_reward_max": result["episode_reward_max"],
            "eps_len_mean": result["episode_len_mean"],
            "episodes_this_iter": result["episodes_this_iter"],
        })
finally:
    # Runs on interruption or error too, so the metrics gathered so far are kept.
    if metric_rows:
        new_rows = pd.DataFrame(metric_rows, columns=results_df.columns)
        # Skip the concat when the table is still empty; rows from an earlier
        # run of this cell are kept otherwise.
        if results_df.empty:
            results_df = new_rows
        else:
            results_df = pd.concat([results_df, new_rows], ignore_index=True)

# Elapsed wall-clock time, in minutes.
print(f"Total time elapsed: {(time.time()-time_start)/60}")

print("ending training loop")

ray.shutdown()
print("shutdown ray")

started training loop
Saved checkpoint 1 at /content/drive/MyDrive/GitHub/INM363-Project/model_checkpoints/re3/dense_new_pattern/checkpoint_000801
 1 reward   0.00/  2.37/ 13.00 len 89.72
 2 reward   0.00/  2.35/ 13.00 len 91.92
 3 reward   0.00/  2.41/ 13.00 len 89.50
 4 reward   0.00/  2.08/ 13.00 len 90.66
 5 reward   0.00/  1.70/ 12.00 len 91.30
 6 reward   0.00/  1.86/ 13.00 len 91.49
 7 reward   0.00/  2.32/ 13.00 len 89.01
 8 reward   0.00/  2.24/ 12.00 len 89.45
 9 reward   0.00/  2.81/ 12.00 len 84.92
10 reward   0.00/  2.80/ 13.00 len 85.20
11 reward   0.00/  2.58/ 14.00 len 86.94
12 reward   0.00/  2.33/ 14.00 len 89.38
13 reward   0.00/  2.18/ 12.00 len 88.71
14 reward   0.00/  2.30/ 12.00 len 86.76
15 reward   0.00/  2.32/ 13.00 len 89.70
16 reward   0.00/  2.62/ 13.00 len 89.46
17 reward   0.00/  2.41/ 13.00 len 92.28
18 reward   0.00/  2.22/ 13.00 len 92.73
19 reward   0.00/  2.41/ 12.00 len 88.87
Saved checkpoint 20 at /content/drive/MyDrive/GitHub/INM363-Project/model_

  return np.nanmean(tower_data)


116 reward   0.00/  4.03/ 15.00 len 83.88
117 reward   0.00/  4.44/ 15.00 len 78.61
118 reward   0.00/  4.59/ 14.00 len 77.62
119 reward   0.00/  4.22/ 14.00 len 79.83
Saved checkpoint 120 at /content/drive/MyDrive/GitHub/INM363-Project/model_checkpoints/re3/dense_new_pattern/checkpoint_000920
120 reward   0.00/  4.19/ 13.00 len 82.55
121 reward   0.00/  4.35/ 14.00 len 82.14
122 reward   0.00/  4.20/ 15.00 len 79.74
123 reward   0.00/  4.16/ 15.00 len 78.90
124 reward   0.00/  3.95/ 14.00 len 80.20
125 reward   0.00/  3.55/ 15.00 len 86.06
126 reward   0.00/  3.52/ 15.00 len 86.07
127 reward   0.00/  3.97/ 15.00 len 83.13
128 reward   0.00/  3.97/ 15.00 len 81.48
129 reward   0.00/  3.73/ 15.00 len 83.42
130 reward   0.00/  3.57/ 15.00 len 85.11
131 reward   0.00/  3.81/ 14.00 len 82.22
132 reward   0.00/  4.04/ 14.00 len 83.56
133 reward   0.00/  3.46/ 14.00 len 91.36
134 reward   0.00/  3.26/ 14.00 len 93.37
135 reward   0.00/  3.83/ 14.00 len 84.61
136 reward   0.00/  4.63/ 15.00 l

# Save results file

In [19]:
from pathlib import Path

# The metrics file is stored inside the current checkpoint directory.
fname = f"{chkpt_root}/result.csv"
fpath = Path(fname)

# Create the directory (and any missing parents) before writing.
fpath.parent.mkdir(exist_ok=True, parents=True)
results_df.to_csv(fpath)
print(f"Saved results file to {fname}")


Saved results file to /content/drive/MyDrive/GitHub/INM363-Project/model_checkpoints/re3/dense_new_pattern/result.csv
