In [1]:
#Helpful information
# https://stable-baselines.readthedocs.io/en/master/guide/examples.html#using-callback-monitoring-training
# actual rom files must be named rom.nes and placed in their specific folders... my path is:
# {path on colossus}
# Scenario files also go here, these are what determines how we reward the agent

# To Do:
# Train a 5m timestep model for each of the 9 games.
# Collect data, put it into a report. Prepare a presentation.
# try to auto-optimize their parameters and compare again, then
# Try to make our own, but time is pretty limited, so....

# Hard: SMB, Life Force, Megaman
# Medium: Breakout-Atari2600, Space Invaders, Asteroid
# Easy: CartPole-v0, Pendulum-v0 MountainCar-v0


In [2]:
#import retro

import random
import gym
import numpy as np
import os 
import time
import datetime
import tensorflow as tf

from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy, CnnPolicy, CnnLstmPolicy, CnnLnLstmPolicy
from stable_baselines.deepq.policies import DQNPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import A2C, PPO2, TRPO


from smac.configspace import ConfigurationSpace
from ConfigSpace.hyperparameters import CategoricalHyperparameter, \
    UniformFloatHyperparameter, UniformIntegerHyperparameter

from smac.tae.execute_func import ExecuteTAFuncDict
from smac.scenario.scenario import Scenario
from smac.facade.smac_facade import SMAC
import logging

tf.logging.set_verbosity(tf.logging.ERROR)


#mlp means Multilayer perceptron, and is probably the fastest but worst.
#CnnPolicies are for images only. MlpPolicies are made for other type of features (e.g. robot joints) 
#Dunno what this means exactly, but I copied it from the documentation.

# Documentation is here: https://stable-baselines.readthedocs.io/en/master/guide/examples.html#using-callback-monitoring-training

In [3]:
#env = retro.make(game="SuperMarioBros-Nes")
#env = retro.make(game="LifeForce-Nes")
#env = retro.make(game="MegaMan-Nes")
#env = retro.make(game="Asteroids-Atari2600")
#env = retro.make(game="Breakout-Atari2600") 
#env = retro.make(game="SpaceInvaders-Atari2600") 
#env = gym.make('Pendulum-v0')
env = gym.make('CartPole-v0')
#env = gym.make('MountainCar-v0')
env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run
TIMESTEPS  = 100000
EPOCH = 100


In [4]:
def runA2C(config):
    start = time.time()
    model = A2C( MlpPolicy, env,verbose= False,gamma = config['gamma'], learning_rate = config['lr'], lr_schedule = config['schedule'])
    #model = PPO2(MlpPolicy, env, verbose=1)
    #model = TRPO(MlpPolicy, env, verbose=1)
    #model.learn(total_timesteps=max_timesteps, callback=callback)
    #model.save("savedModel")
    obs = env.reset()
    model.learn(total_timesteps=TIMESTEPS)
    episode_reward = 0
    for i in range(10000000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        #env.render()
        episode_reward += rewards
        if dones:
            print('Reward: %s' % episode_reward)
            break
    return float(-episode_reward[0])

In [5]:

logger = logging.getLogger("A2C-example")
logging.basicConfig(level=logging.INFO)
#logging.basicConfig(level=logging.DEBUG)  # Enable to show debug-output
logger.info("Running A2C.")

# Build Configuration Space which defines all parameters and their ranges.
# To illustrate different parameter types,
# we use continuous, integer and categorical parameters.
cs = ConfigurationSpace()

# We can add single hyperparameters:
do_schedule = CategoricalHyperparameter(
    "schedule", ['linear', 'constant', 'double_linear_con', 'middle_drop' ,'double_middle_drop'], default_value="constant")
do_lr = UniformFloatHyperparameter("lr", 1e-6, 1, default_value=0.01)
do_gamma = UniformFloatHyperparameter("gamma", 1e-6, 1, default_value=0.99)
cs.add_hyperparameters([do_schedule, do_lr, do_gamma])


INFO:A2C-example:Running A2C.


[schedule, Type: Categorical, Choices: {linear, constant, double_linear_con, middle_drop, double_middle_drop}, Default: constant,
 lr, Type: UniformFloat, Range: [1e-06, 1.0], Default: 0.01,
 gamma, Type: UniformFloat, Range: [1e-06, 1.0], Default: 0.99]

In [6]:
scenario = Scenario({"run_obj": "quality",   # we optimize quality (alternative runtime)
                     "runcount-limit": EPOCH,  # maximum number of function evaluations
                     "cs": cs,               # configuration space
                     "deterministic": "true",
           
                     })

# To optimize, we pass the function to the SMAC-object
smac = SMAC(scenario=scenario, rng=np.random.RandomState(42),
            tae_runner=runA2C)

# Example call of the function with default values
# It returns: Status, Cost, Runtime, Additional Infos
def_value = smac.get_tae_runner().run(cs.get_default_configuration(), 1)[1]
print("Value for default configuration: %.2f" % (def_value))

# Start optimization

incumbent = smac.optimize()


#inc_value = smac.get_tae_runner().run(incumbent, 1)[1]
inc_value = runA2C(incumbent)
print("Optimized Value: %.2f" % (inc_value))

INFO:smac.utils.io.cmd_reader.CMDReader:Output to smac3-output_2019-04-23_22:20:19_615680
INFO:smac.facade.smac_facade.SMAC:Optimizing a deterministic scenario for quality without a tuner timeout - will make SMAC deterministic!


Reward: [4.]
Value for default configuration: -4.00
Reward: [163.]
Reward: [10.]
Reward: [1.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -163.0000


Reward: [6.]
Reward: [11.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -163.0000


Reward: [1.]
Reward: [7.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -163.0000


Reward: [8.]
Reward: [11.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -163.0000


Reward: [4.]
Reward: [3.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -163.0000


Reward: [2.]
Reward: [8.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -163.0000


Reward: [94.]
Reward: [6.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -163.0000


Reward: [1.]
Reward: [4.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -163.0000


Reward: [2.]
Reward: [5.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -163.0000


Reward: [110.]
Reward: [5.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -163.0000


Reward: [7.]
Reward: [5.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -163.0000


Reward: [6.]
Reward: [2.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -163.0000


Reward: [1.]
Reward: [117.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -163.0000


Reward: [40.]
Reward: [2.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -163.0000


Reward: [148.]
Reward: [4.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -163.0000


Reward: [9.]
Reward: [8.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -163.0000


Reward: [39.]
Reward: [134.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -163.0000


Reward: [7.]
Reward: [2.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -163.0000


Reward: [81.]
Reward: [6.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -163.0000


Reward: [2.]
Reward: [1.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -163.0000


Reward: [4.]
Reward: [149.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -163.0000


Reward: [9.]
Reward: [50.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -163.0000


Reward: [4.]
Reward: [169.]


INFO:smac.intensification.intensification.Intensifier:Challenger (-169.0000) is better than incumbent (-163.0000) on 1 runs.
INFO:smac.intensification.intensification.Intensifier:Changes in incumbent:
INFO:smac.intensification.intensification.Intensifier:  gamma : 0.99 -> 0.9621221340317133
INFO:smac.intensification.intensification.Intensifier:  lr : 0.01 -> 0.007579964788539638
INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -169.0000


Reward: [161.]
Reward: [5.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -169.0000


Reward: [34.]
Reward: [93.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -169.0000


Reward: [88.]
Reward: [130.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -169.0000


Reward: [116.]
Reward: [56.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -169.0000


Reward: [4.]
Reward: [2.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -169.0000


Reward: [3.]
Reward: [1.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -169.0000


Reward: [7.]
Reward: [8.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -169.0000


Reward: [77.]
Reward: [9.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -169.0000


Reward: [3.]
Reward: [93.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -169.0000


Reward: [10.]
Reward: [54.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -169.0000


Reward: [5.]
Reward: [191.]


INFO:smac.intensification.intensification.Intensifier:Challenger (-191.0000) is better than incumbent (-169.0000) on 1 runs.
INFO:smac.intensification.intensification.Intensifier:Changes in incumbent:
INFO:smac.intensification.intensification.Intensifier:  gamma : 0.9621221340317133 -> 0.9780442733113491
INFO:smac.intensification.intensification.Intensifier:  lr : 0.007579964788539638 -> 0.010350797890925353
INFO:smac.intensification.intensification.Intensifier:  schedule : 'constant' -> 'linear'
INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -191.0000


Reward: [11.]
Reward: [9.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -191.0000


Reward: [10.]
Reward: [131.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -191.0000


Reward: [63.]
Reward: [95.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -191.0000


Reward: [1.]
Reward: [4.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -191.0000


Reward: [5.]
Reward: [87.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -191.0000


Reward: [181.]
Reward: [3.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -191.0000


Reward: [8.]
Reward: [2.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -191.0000


Reward: [6.]
Reward: [164.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -191.0000


Reward: [125.]
Reward: [103.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -191.0000


Reward: [2.]
Reward: [4.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -191.0000


Reward: [135.]
Reward: [197.]


INFO:smac.intensification.intensification.Intensifier:Challenger (-197.0000) is better than incumbent (-191.0000) on 1 runs.
INFO:smac.intensification.intensification.Intensifier:Changes in incumbent:
INFO:smac.intensification.intensification.Intensifier:  gamma : 0.9780442733113491 -> 0.9917253287169163
INFO:smac.intensification.intensification.Intensifier:  lr : 0.010350797890925353 -> 0.009767740248820099
INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -197.0000


Reward: [166.]
Reward: [6.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -197.0000


Reward: [7.]
Reward: [1.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -197.0000


Reward: [6.]
Reward: [4.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -197.0000


Reward: [7.]
Reward: [181.]


INFO:smac.intensification.intensification.Intensifier:Updated estimated cost of incumbent on 1 runs: -197.0000


Reward: [3.]


INFO:smac.stats.stats.Stats:##########################################################
INFO:smac.stats.stats.Stats:Statistics:
INFO:smac.stats.stats.Stats:#Incumbent changed: 3
INFO:smac.stats.stats.Stats:#Target algorithm runs: 100 / 100.0
INFO:smac.stats.stats.Stats:#Configurations: 100
INFO:smac.stats.stats.Stats:Used wallclock time: 11665.73 / inf sec 
INFO:smac.stats.stats.Stats:Used target algorithm runtime: 11641.45 / inf sec
INFO:smac.stats.stats.Stats:##########################################################
INFO:smac.facade.smac_facade.SMAC:Final Incumbent: Configuration:
  gamma, Value: 0.9917253287169163
  lr, Value: 0.009767740248820099
  schedule, Value: 'linear'

INFO:smac.facade.smac_facade.SMAC:Estimated cost of incumbent: -197.000000


Reward: [80.]
Optimized Value: -80.00


In [7]:

incumbent


Configuration:
  gamma, Value: 0.9917253287169163
  lr, Value: 0.009767740248820099
  schedule, Value: 'linear'