# Setup

In [None]:
import sys
from google.colab import drive

# Attach Google Drive so the project checkout under MyDrive becomes reachable.
drive.mount('/content/drive')

# Put the project root on the import path so its packages can be imported.
system_path = '/content/drive/MyDrive/GitHub/INM363-Project'
sys.path += [system_path]

In [None]:
%%bash
# Install the system packages listed in the ViZDoom Linux build instructions:
# https://github.com/mwydmuch/ViZDoom/blob/master/doc/Building.md#-linux

# Nobody can answer a prompt from inside a notebook cell, so apt-get must never
# ask: DEBIAN_FRONTEND silences debconf dialogs and -y confirms the install.
export DEBIAN_FRONTEND=noninteractive

# Only stdout is discarded; stderr stays visible so a failed install is not silent.
apt-get update -qq > /dev/null

apt-get install -y -qq build-essential zlib1g-dev libsdl2-dev libjpeg-dev \
nasm tar libbz2-dev libgtk2.0-dev cmake git libfluidsynth-dev libgme-dev \
libopenal-dev timidity libwildmidi-dev unzip > /dev/null

# Boost libraries
apt-get install -y -qq libboost-all-dev > /dev/null

In [None]:
!pip install vizdoom --quiet
!pip install ray --quiet
!pip install ray['rllib'] --quiet
!pip install Ipython --upgrade --quiet


In [None]:
from src.vizdoom_gym.envs.VizDoomEnv import VizdoomEnv
from src.vizdoom_gym.envs.VizDoomEnv_def import VizDoomVeryDenseReward

In [None]:
from ray.tune.registry import register_env
import gym
import os
import ray
import ray.rllib.agents.ppo as ppo
from ray.rllib.algorithms.callbacks import RE3UpdateCallbacks
import shutil
import torch

In [None]:
# Use the first CUDA device when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print("device: ", device, "\n")

from psutil import virtual_memory

# Total memory reported by psutil, converted from bytes to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print(f'Your runtime has {ram_gb:.1f} gigabytes of available RAM\n')

# 20 GB is the cut-off this notebook uses to call a runtime "high-RAM".
print('You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime')

# Initialize Ray

In [None]:
# Put the project's `src` directory on the import path
# (per the original note, this is needed to load the vizdoom module).
system_path = '/content/drive/MyDrive/GitHub/INM363-Project/src'
sys.path += [system_path]

# Expose the project root through PYTHONPATH
# (per the original note, this is needed to use the GPU with Ray).
os.environ.update({'PYTHONPATH': '/content/drive/MyDrive/GitHub/INM363-Project'})
os.environ.get('PYTHONPATH')

In [None]:

# Remove whatever is already stored under the checkpoint directory for this run;
# a missing directory is not an error.
chkpt_root = "/content/drive/MyDrive/GitHub/INM363-Project/model_checkpoints/re3/dense_new_pattern"
shutil.rmtree(chkpt_root, ignore_errors=True)

# Shut Ray down first so the init below starts from a clean state.
ray.shutdown()
print("Shutdown ray")

# Start Ray with 2 CPUs and 1 GPU; add `local_mode=True` here when debugging.
ray.init(num_cpus=2, num_gpus=1, ignore_reinit_error=True)
print("Initialized ray")

# Register the custom environment under the name the trainer is given later.
# (To train on the base environment instead, construct `VizdoomEnv()` here.)
select_env = "VizDoomVeryDenseReward-v0"
register_env(select_env, lambda _env_config: VizDoomVeryDenseReward())
print("registered environment")


# Training config

In [None]:
# Start from a copy of the PPO default config and override the entries this
# experiment sets explicitly.
config = ppo.DEFAULT_CONFIG.copy()
config.update(
    {
        "log_level": "WARN",
        "model": {"dim": 42, "grayscale": True},
        "num_gpus": 1,
        "preprocessor_pref": "rllib",
        "explore": True,
    }
)
# Optional, currently disabled: config['batch_mode'] = 'complete_episodes'


In [None]:

class RE3Callbacks(RE3UpdateCallbacks, config["callbacks"]):
  """Callbacks class combining `RE3UpdateCallbacks` with the config's callbacks class.

  The second base is read from `config["callbacks"]` at the moment this cell
  runs, so the cell should execute before that entry is pointed at this class.
  The class adds no behaviour of its own.
  """


In [None]:
# RE3 exploration settings, following RLlib's own RE3 test:
# https://github.com/ray-project/ray/blob/c9c3f0745a9291a4de0872bdfa69e4ffdfac3657/rllib/utils/exploration/tests/test_random_encoder.py#L35
re3_exploration = {
    "type": "RE3",
    # Dimensionality of the observation embedding vectors in latent space.
    "embeds_dim": 128,
    # Beta decay factor, used for on-policy algorithms.
    "rho": 0.1,
    # Number of neighbours used for the K-NN entropy estimation.
    "k_nn": 7,
    "encoder_net_config": {
        "fcnet_hiddens": [],
        "fcnet_activation": "relu",
    },
    # `reward = r + beta * intrinsic_reward`
    "beta": 0.2,
    # Schedule to use for beta decay, one of "constant" or "linear_decay".
    "beta_schedule": 'constant',
    # Exploration sub-type to use (usually the algorithm's "default"
    # exploration, e.g. EpsilonGreedy for DQN, StochasticSampling for PG/SAC).
    "sub_exploration": {
        "type": "StochasticSampling",
    },
}

# Apply the experiment-specific overrides in one step.
config.update(
    {
        "framework": "tf",
        "seed": 12345,
        "callbacks": RE3Callbacks,
        "exploration_config": re3_exploration,
    }
)

# Build the PPO trainer on the environment registered under `select_env`.
agent = ppo.PPOTrainer(config, env=select_env)
print("created agent")

# Training loop

In [None]:
import pandas as pd
import time 

# Column layout of the per-iteration training metrics table.
cols = [
    "checkpoint",
    "eps_reward_min",
    "eps_reward_mean",
    "eps_reward_max",
    "eps_len_mean",
    "episodes_this_iter",
]

# Empty frame with those columns; the training loop adds one row per iteration.
results_df = pd.DataFrame(columns=cols)

In [None]:
# Restore the agent from the `checkpoint_000800` entry stored under the
# `no_reward` checkpoint directory.
chkpt_root = "/content/drive/MyDrive/GitHub/INM363-Project/model_checkpoints/re3/no_reward"
chkpt_file = chkpt_root + "/checkpoint_000800"
agent.restore(chkpt_file)

In [None]:
# One status line per iteration: iteration number, min/mean/max episode reward,
# and mean episode length.
status = "{:2d} reward {:6.2f}/{:6.2f}/{:6.2f} len {:4.2f}"
start_n = 0
n_iter = 300

print("started training loop")
time_start = time.time()

# Checkpoints written by this run go here.
chkpt_root = "/content/drive/MyDrive/GitHub/INM363-Project/model_checkpoints/re3/dense_new_pattern"

# Metrics are buffered in a plain list and turned into a DataFrame once at the
# end: `DataFrame.append` no longer exists in pandas 2.x, and growing a frame
# row by row copies it on every iteration.
metric_rows = []

try:
    # Train a policy with RLlib using PPO.
    for n in range(start_n, n_iter):
        result = agent.train()

        # Checkpoint after the first iteration and then on every 20th one
        # (change the 20 to adjust the checkpoint interval).
        if (n + 1) % 20 == 0 or n == 0:
            chkpt_file = agent.save(chkpt_root)
            print(f"Saved checkpoint {n+1} at {chkpt_file}")

        print(status.format(
            n + 1,
            result["episode_reward_min"],
            result["episode_reward_mean"],
            result["episode_reward_max"],
            result["episode_len_mean"],
        ))

        # Record this iteration's metrics.
        metric_rows.append({
            "checkpoint": n + 1,
            "eps_reward_min": result["episode_reward_min"],
            "eps_reward_mean": result["episode_reward_mean"],
            "eps_reward_max": result["episode_reward_max"],
            "eps_len_mean": result["episode_len_mean"],
            "episodes_this_iter": result["episodes_this_iter"],
        })
finally:
    # Runs even when training is interrupted or fails, so the iterations
    # completed so far still end up in `results_df`.
    if metric_rows:
        new_results = pd.DataFrame(metric_rows)
        if results_df.empty:
            results_df = new_results
        else:
            results_df = pd.concat([results_df, new_results], ignore_index=True)

# Elapsed wall-clock time, converted from seconds to minutes.
print(f"Total time elapsed: {(time.time()-time_start)/60}")

print("ending training loop")

ray.shutdown()
print("shutdown ray")

# Save results file

In [None]:
from pathlib import Path

# The metrics CSV is stored inside the checkpoint directory of this run.
fname = f"{chkpt_root}/result.csv"
fpath = Path(fname)

# Create the directory (and any missing parents) before writing.
fpath.parent.mkdir(exist_ok=True, parents=True)
results_df.to_csv(fpath)
print(f"Saved results file to {fname}")
