
Commit

Initial commit
PatrykChrabaszcz committed Feb 22, 2018
0 parents · commit fd26d42
Showing 12 changed files with 1,169 additions and 0 deletions.
108 changes: 108 additions & 0 deletions .gitignore
@@ -0,0 +1,108 @@
# Pycharm and project related files and folders
.idea/


# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other info into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
.static_storage/
.media/
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
5 changes: 5 additions & 0 deletions README.md
@@ -0,0 +1,5 @@
Canonical ES (Evolution Strategies) for benchmarking on Atari games.

Code based on:
https://github.com/openai/evolution-strategies-starter
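
A plausible invocation (a sketch; the process count, game, and run name below are arbitrary examples — one MPI rank is reserved for evaluation, the remaining ranks generate offspring):

```bash
mpirun -np 401 python main.py -g Qbert -c configurations/sample_configuration.json -e 1 -r test_run
```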

11 changes: 11 additions & 0 deletions configurations/sample_configuration.json
@@ -0,0 +1,11 @@
{
    "optimizer": "CanonicalESOptimizer",
    "settings": {
        "learning_rate": 1,
        "sigma": 0.01,
        "c_sigma_factor": 1,
        "mu": 1
    },
    "network": "Nature",
    "nonlin_name": "elu"
}
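
A note on the fields (inferred from main.py and standard ES notation, not documented in this commit): `optimizer` names a class from `optimizer_dict` in main.py, `settings` is passed verbatim to that optimizer's constructor, and `network`/`nonlin_name` select the policy architecture and nonlinearity. In canonical ES, `sigma` is the mutation step size and `mu` the number of parents; `c_sigma_factor` presumably scales sigma.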
206 changes: 206 additions & 0 deletions main.py
@@ -0,0 +1,206 @@
from src.optimizers import OpenAIOptimizer, CanonicalESOptimizer, CanonicalESMeanOptimizer
from src.policy import Policy
from src.logger import Logger

from argparse import ArgumentParser
from mpi4py import MPI
import numpy as np
import time
import json
import gym


# This will allow us to create an optimizer based on the string value
# from the configuration file. Add your optimizers to this dictionary.
optimizer_dict = {
    'OpenAIOptimizer': OpenAIOptimizer,
    'CanonicalESOptimizer': CanonicalESOptimizer,
    'CanonicalESMeanOptimizer': CanonicalESMeanOptimizer
}


# Main function that executes the training loop.
# The population size is derived from the number of CPUs
# and the number of episodes per CPU.
# One CPU (rank 0) is used to evaluate the currently proposed
# solution in each iteration.
# run_name is useful when the same hyperparameters
# are evaluated multiple times.
def main(ep_per_cpu, game, configuration_file, run_name):
    start_time = time.time()

    with open(configuration_file, 'r') as f:
        configuration = json.loads(f.read())

    env_name = '%sNoFrameskip-v4' % game

    # MPI stuff
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    cpus = comm.Get_size()

    # One CPU (rank 0) will evaluate results
    train_cpus = cpus - 1

    # Deduce the population size
    lam = train_cpus * ep_per_cpu
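    # Illustration with hypothetical numbers: launched with 401 MPI processes
    # and ep_per_cpu=1, this gives train_cpus = 400 and lam = 400 * 1 = 400
    # offspring evaluated per iteration.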

    # Create environment
    env = gym.make(env_name)

    # Create policy (deep neural network)
    # Internally it applies preprocessing to the environment state
    policy = Policy(env, network=configuration['network'], nonlin_name=configuration['nonlin_name'])

    # Create the reference batch used for normalization
    # It will be overwritten with vb from the worker with rank 0
    vb = policy.get_vb()

    # Extract a vector with the current parameters
    parameters = policy.get_parameters()

    # Send parameters from worker 0 to all workers (MPI stuff)
    # to ensure that every worker starts from the same position
    comm.Bcast([parameters, MPI.FLOAT], root=0)
    comm.Bcast([vb, MPI.FLOAT], root=0)

    # Set the same virtual batch for each worker
    if rank != 0:
        policy.set_vb(vb)

    # Create the optimizer with user-defined settings (hyperparameters)
    OptimizerClass = optimizer_dict[configuration['optimizer']]
    optimizer = OptimizerClass(parameters, lam, rank, configuration["settings"])

    # Only the rank 0 worker will log information from the training
    logger = None
    if rank == 0:
        # Initialize the logger, save the virtual batch and log some basic info
        logger = Logger(optimizer.log_path(game, configuration['network'], run_name))
        logger.save_vb(vb)

        # Log basic stuff
        logger.log('Game'.ljust(25) + '%s' % game)
        logger.log('Network'.ljust(25) + '%s' % configuration['network'])
        logger.log('Optimizer'.ljust(25) + '%s' % configuration['optimizer'])
        logger.log('Number of CPUs'.ljust(25) + '%d' % cpus)
        logger.log('Population'.ljust(25) + '%d' % lam)
        logger.log('Dimensionality'.ljust(25) + '%d' % len(parameters))

        # Log basic info from the optimizer
        optimizer.log_basic(logger)

    # We will count the number of steps
    # frames = 4 * steps (3 * steps for SpaceInvaders)
    steps_passed = 0
    while True:
        # Iteration start time
        iter_start_time = time.time()
        # Workers that run the training episodes
        if rank != 0:
            # Empty arrays for each episode. We save: length, reward, noise index
            lens = [0] * ep_per_cpu
            rews = [0] * ep_per_cpu
            inds = [0] * ep_per_cpu

            # For each episode on this CPU we get new parameters,
            # update the policy network and perform a policy rollout
            for i in range(ep_per_cpu):
                ind, p = optimizer.get_parameters()
                policy.set_parameters(p)
                e_rew, e_len = policy.rollout()
                lens[i] = e_len
                rews[i] = e_rew
                inds[i] = ind

            # Aggregate the information; it will later be sent to each worker using MPI
            msg = np.array(rews + lens + inds, dtype=np.int32)
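            # msg packs three blocks of ep_per_cpu int32 values, [rews | lens | inds],
            # so each worker contributes exactly 3 * ep_per_cpu integers
            # to the Allgather below.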

        # The worker with rank 0 runs the evaluation episodes
        else:
            rews = [0] * ep_per_cpu
            lens = [0] * ep_per_cpu
            for i in range(ep_per_cpu):
                ind, p = optimizer.get_parameters()
                policy.set_parameters(p)
                e_rew, e_len = policy.rollout()
                rews[i] = e_rew
                lens[i] = e_len

            eval_mean_rew = np.mean(rews)
            eval_max_rew = np.max(rews)

            # Empty array; the evaluation results are not used for the update
            msg = np.zeros(3 * ep_per_cpu, dtype=np.int32)

        # MPI stuff
        # Initialize an array which will be updated with information from all workers using MPI
        results = np.empty((cpus, 3 * ep_per_cpu), dtype=np.int32)
        comm.Allgather([msg, MPI.INT], [results, MPI.INT])
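        # results has shape (cpus, 3 * ep_per_cpu); row i is the msg gathered
        # from the worker with rank i.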

        # Skip the empty evaluation results from the worker with rank 0
        results = results[1:, :]

        # Extract rewards, episode lengths and noise indices
        rews = results[:, :ep_per_cpu].flatten()
        lens = results[:, ep_per_cpu:(2*ep_per_cpu)].flatten()
        ids = results[:, (2*ep_per_cpu):].flatten()

        # Update the parameters
        optimizer.update(ids=ids, rewards=rews)

        # Steps passed = sum of episode steps from all offspring
        steps = np.sum(lens)
        steps_passed += steps

        # Write some logs for this iteration
        # Using the logs we are able to recover a solution saved
        # after 1 hour of training or after 1 billion frames
        if rank == 0:
            iteration_time = (time.time() - iter_start_time)
            time_elapsed = (time.time() - start_time)/60
            train_mean_rew = np.mean(rews)
            train_max_rew = np.max(rews)
            logger.log('------------------------------------')
            logger.log('Iteration'.ljust(25) + '%d' % optimizer.iteration)
            logger.log('EvalMeanReward'.ljust(25) + '%f' % eval_mean_rew)
            logger.log('EvalMaxReward'.ljust(25) + '%f' % eval_max_rew)
            logger.log('TrainMeanReward'.ljust(25) + '%f' % train_mean_rew)
            logger.log('TrainMaxReward'.ljust(25) + '%f' % train_max_rew)
            logger.log('StepsSinceStart'.ljust(25) + '%d' % steps_passed)
            logger.log('StepsThisIter'.ljust(25) + '%d' % steps)
            logger.log('IterationTime'.ljust(25) + '%f' % iteration_time)
            logger.log('TimeSinceStart'.ljust(25) + '%f' % time_elapsed)

            # Give the optimizer a chance to log its own stuff
            optimizer.log(logger)
            logger.log('------------------------------------')

            # Write stuff for the training curve plot
            stat_string = "{},\t{},\t{},\t{},\t{},\t{}\n".\
                format(steps_passed, (time.time()-start_time),
                       eval_mean_rew, eval_max_rew, train_mean_rew, train_max_rew)
            logger.write_general_stat(stat_string)
            logger.write_optimizer_stat(optimizer.stat_string())

            # Save the currently proposed solution every 20 iterations
            if optimizer.iteration % 20 == 1:
                logger.save_parameters(optimizer.parameters, optimizer.iteration)


def parse_arguments():
    parser = ArgumentParser()
    parser.add_argument('-e', '--episodes_per_cpu',
                        help="Number of episode evaluations for each CPU; "
                             "population_size = episodes_per_cpu * number of CPUs",
                        default=1, type=int)
    parser.add_argument('-g', '--game', help="Atari game used to train an agent")
    parser.add_argument('-c', '--configuration_file', help='Path to the configuration file')
    parser.add_argument('-r', '--run_name', help='Name of the run, used to create the log folder name', type=str)
    args = parser.parse_args()
    return args.episodes_per_cpu, args.game, args.configuration_file, args.run_name


if __name__ == '__main__':
    ep_per_cpu, game, configuration_file, run_name = parse_arguments()
    main(ep_per_cpu, game, configuration_file, run_name)
