## Set Environment:

There are some gym dependencies that seem to outdated for running n the GPU. The code below solves it but I am not entirely sure why all those packages are needed. Anyway, it does not load up the memory by too much so it is fine for now. 

In [1]:
import sys
assert sys.version_info >= (3, 5)

# Is this notebook running on Colab or Kaggle?
IS_COLAB = "google.colab" in sys.modules

if IS_COLAB:
    !apt update && apt install -y libpq-dev libsdl2-dev swig xorg-dev xvfb
    %pip install -U gym>=0.21.0
    %pip install -U gym[box2d,atari,accept-rom-license]
else:
    raise Warning("Unknown system!")

[33m0% [Working][0m            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
[33m0% [Connecting to archive.ubuntu.com (185.125.190.36)] [Waiting for headers] [C[0m                                                                               Hit:2 http://security.ubuntu.com/ubuntu bionic-security InRelease
Ign:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:7 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Hit:8 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:9 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:10 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic

Test the gym envioronment:

In [2]:
import gym
gym_env = gym.make("LunarLanderContinuous-v2")


## Imports

In [5]:
from google.colab import drive
drive.mount('/content/gdrive')

import sys
import os
import numpy as np, time, random, scipy
import unittest 
%pip install -U fastrand 
import fastrand 

import torch
import argparse
import pickle

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Collecting fastrand
  Downloading fastrand-1.3.0.tar.gz (3.3 kB)
Building wheels for collected packages: fastrand
  Building wheel for fastrand (setup.py) ... [?25l[?25hdone
  Created wheel for fastrand: filename=fastrand-1.3.0-cp37-cp37m-linux_x86_64.whl size=16567 sha256=17dd8f97ad845220cf437592ac25fb40d3ac595dedc94a152c889b07503c4bca
  Stored in directory: /root/.cache/pip/wheels/7d/73/56/57c2f1af2a5e2ecdb6234b732e898a4e26707cc2a2008dc308
Successfully built fastrand
Installing collected packages: fastrand
Successfully installed fastrand-1.3.0


In [10]:
sys.path.clear()
sys.path.insert(0,'/content/gdrive/MyDrive/Thesis_PDERL')
sys.path.insert(1,'/usr/local/lib/python3.7/dist-packages pip/fastrand')
import core
from parameters import Parameters
from core import mod_utils as utils, agent
from core.operator_runner import OperatorRunner

print(sys.path)

['/content/gdrive/MyDrive/Thesis_PDERL', '/usr/local/lib/python3.7/dist-packages pip/fastrand']


## Parse arguments:

In [13]:
parser = argparse.ArgumentParser()
parser.add_argument('-env', help='Environment Choices: (Swimmer-v2) (LunarLanderContinuous-v2)', type=str, default = 'LunarLanderContinuous-v2')
parser.add_argument('-num_games', help = 'Number of complete games to play', default = 5000)
#  QD equivalent of num_games: 50 000 games = 400 iters x 5 emitters x 25 batch_size
parser.add_argument('-seed', help='Random seed to be used', type=int, default=7)
parser.add_argument('-disable_cuda', help='Disables CUDA', action='store_true')
parser.add_argument('-render', help='Render gym episodes', action='store_true')
parser.add_argument('-sync_period', help="How often to sync to population", type=int)
parser.add_argument('-novelty', help='Use novelty exploration', action='store_true')
parser.add_argument('-proximal_mut', help='Use safe mutation', action='store_true')
parser.add_argument('-distil', help='Use distilation crossover', action='store_true')
parser.add_argument('-distil_type', help='Use distilation crossover. Choices: (fitness) (distance)',
                    type=str, default='fitness')
parser.add_argument('-per', help='Use Prioritised Experience Replay', action='store_true')
parser.add_argument('-mut_mag', help='The magnitude of the mutation', type=float, default=0.05)
parser.add_argument('-mut_noise', help='Use a random mutation magnitude', action='store_true')
parser.add_argument('-verbose_mut', help='Make mutations verbose', action='store_true')
parser.add_argument('-verbose_crossover', help='Make crossovers verbose', action='store_true')
parser.add_argument('-logdir', help='Folder where to save results', type=str, default = 'pderl/logs_ddpg')
parser.add_argument('-opstat', help='Store statistics for the variation operators', action='store_true')
parser.add_argument('-opstat_freq', help='Frequency (in generations) to store operator statistics', type=int, default=1)
parser.add_argument('-save_periodic', help='Save actor, critic and memory periodically', action='store_true')
parser.add_argument('-next_save', help='Generation save frequency for save_periodic', type=int, default=500)
parser.add_argument('-test_operators', help='Runs the operator runner to test the operators', action='store_true')

args,_ = parser.parse_known_args()
args = Parameters(args)  # Inject the cla arguments in the parameters object


32


In [14]:
tracker = utils.Tracker(args, ['erl'], '_score.csv')  # Initiate tracker
frame_tracker = utils.Tracker(args, ['frame_erl'], '_score.csv')  # Initiate tracker
time_tracker = utils.Tracker(args, ['time_erl'], '_score.csv')
ddpg_tracker = utils.Tracker(args, ['ddpg'], '_score.csv')
selection_tracker = utils.Tracker(args, ['elite', 'selected', 'discarded'], '_selection.csv')

# Create Env
env = utils.NormalizedActions(gym_env)
  
args.action_dim = env.action_space.shape[0]
args.state_dim = env.observation_space.shape[0]

# Write the args to a the info file and print them
args.write_params(stdout=True)

# Seed
env.reset(seed= args.seed)
torch.manual_seed(args.seed)
np.random.seed(args.seed)
random.seed(args.seed)

# Tests the variation operators after that is saved first with -save_periodic
if args.test_operators:
    operator_runner = OperatorRunner(args, env)
    operator_runner.run()
    exit()

# Create Agent
my_agent = agent.Agent(args, env)
print('Running', args.env_name, ' State_dim:', args.state_dim, ' Action_dim:', args.action_dim)


{   'action_dim': 2,
    'alpha': 0.7,
    'batch_size': 128,
    'beta_zero': 0.5,
    'buffer_size': 100000,
    'crossover_prob': 0.0,
    'device': device(type='cuda'),
    'distil': False,
    'distil_type': 'fitness',
    'elite_fraction': 0.1,
    'env_name': 'LunarLanderContinuous-v2',
    'frac_frames_train': 1.0,
    'gamma': 0.99,
    'individual_bs': 8000,
    'learn_start': 1564.5,
    'ls': 32,
    'mutation_batch_size': 256,
    'mutation_mag': 0.05,
    'mutation_noise': False,
    'mutation_prob': 0.9,
    'next_save': 200,
    'ns': False,
    'ns_epochs': 10,
    'num_evals': 1,
    'num_games': 5000,
    'opstat': False,
    'opstat_freq': 1,
    'per': False,
    'pop_size': 30,
    'proximal_mut': False,
    'render': False,
    'replace_old': True,
    'rl_to_ea_synch_period': 5,
    'save_foldername': 'pderl/logs',
    'save_periodic': False,
    'seed': 7,
    'state_dim': 8,
    'tau': 0.001,
    'test_operators': False,
    'use_done_mask': True,
    'use_ln'

## Main loop:
The PD-ERL training loop. It contains both the evolutionary part and the DDPG updates.

In [15]:
next_save = args.next_save; time_start = time.time()
while my_agent.num_games <= args.num_games:

    # evaluate over all games 
    stats = my_agent.train()

    #retrieve statistics
    best_train_fitness = stats['best_train_fitness']
    erl_score = stats['test_score']
    elite_index = stats['elite_index']
    ddpg_reward = stats['ddpg_reward']
    policy_gradient_loss = stats['pg_loss']
    behaviour_cloning_loss = stats['bc_loss']
    population_novelty = stats['pop_novelty']

    print('#Games:', my_agent.num_games, '#Frames:', my_agent.num_frames,
          ' Train_Max:', '%.2f'%best_train_fitness if best_train_fitness is not None else None,
          ' Test_Score:','%.2f'%erl_score if erl_score is not None else None,
          ' Avg:','%.2f'%tracker.all_tracker[0][1],
          ' ENV:  '+ args.env_name,
          ' DDPG Reward:', '%.2f'%ddpg_reward,
          ' PG Loss:', '%.4f' % policy_gradient_loss)

    elite = my_agent.evolver.selection_stats['elite']/my_agent.evolver.selection_stats['total']
    selected = my_agent.evolver.selection_stats['selected'] / my_agent.evolver.selection_stats['total']
    discarded = my_agent.evolver.selection_stats['discarded'] / my_agent.evolver.selection_stats['total']

    print()
    tracker.update([erl_score], my_agent.num_games)
    frame_tracker.update([erl_score], my_agent.num_frames)
    time_tracker.update([erl_score], time.time()-time_start)
    ddpg_tracker.update([ddpg_reward], my_agent.num_frames)
    selection_tracker.update([elite, selected, discarded], my_agent.num_frames)

    # Save Policy
    if my_agent.num_games > next_save:
        next_save += args.next_save
        if elite_index is not None:
            torch.save(my_agent.pop[elite_index].actor.state_dict(), os.path.join(args.save_foldername,
                                                                                'evo_net.pkl'))

            if args.save_periodic:
                save_folder = os.path.join(args.save_foldername, 'models')
                if not os.path.exists(save_folder):
                    os.makedirs(save_folder)

                actor_save_name = os.path.join(save_folder, 'evo_net_actor_{}.pkl'.format(next_save))
                critic_save_name = os.path.join(save_folder, 'evo_net_critic_{}.pkl'.format(next_save))
                buffer_save_name = os.path.join(save_folder, 'champion_buffer_{}.pkl'.format(next_save))

                torch.save(my_agent.pop[elite_index].actor.state_dict(), actor_save_name)
                torch.save(my_agent.rl_my_agent.critic.state_dict(), critic_save_name)
                with open(buffer_save_name, 'wb+') as buffer_file:
                    pickle.dump(my_agent.rl_my_agent.buffer, buffer_file)

        print("Progress Saved")


KeyboardInterrupt: ignored