In [None]:
import numpy as np
from utils import *
from tqdm import tqdm

import os 
import shutil

import wandb 
import warnings

# Suppress every Python warning for the rest of the session so that the
# training progress bars stay readable.
warnings.simplefilter("ignore")
# Tell wandb the notebook's file name explicitly via its environment variable.
os.environ.update(WANDB_NOTEBOOK_NAME="sweep.ipynb")

In [None]:
# Experiment-level settings consumed by run_experiment().
experiment_parameters = dict(
    # Number of independent runs; the run index is also used as the seed.
    num_runs=1,
    # Number of episodes executed in each run.
    num_episodes=1000,
    # OpenAI Gym environments allow for a timestep limit timeout, causing
    # episodes to end after some number of timesteps.
    timeout=500,
)

# Extra settings for the environment (none are needed here).
environment_parameters = dict()

# Environment and agent classes handed to RLGlue by run_experiment();
# both names come from the `utils` star import.
current_env, current_agent = LunarLanderEnvironment, Agent

In [None]:
def run_experiment(environment, agent, environment_parameters, agent_parameters, experiment_parameters):
    """Train ``agent`` on ``environment`` and return the last episode's mean return.

    Args:
        environment: Environment class passed to ``RLGlue``.
        agent: Agent class passed to ``RLGlue``.
        environment_parameters: Mapping of settings forwarded to the
            environment at ``rl_init`` time (may be empty or ``None``). A
            ``"seed"`` entry is added per run.
        agent_parameters: Mapping of agent settings. It must contain a
            ``"network_config"`` mapping. The caller's object is NOT modified;
            seeds are written into a private deep copy.
        experiment_parameters: Mapping with the integer keys ``"num_runs"``,
            ``"num_episodes"`` and ``"timeout"`` (the value handed to
            ``rl_episode`` for every episode).

    Returns:
        The summed reward of the final episode, averaged over all runs.

    Side effects:
        Saves the full (num_runs, num_episodes) reward matrix to
        ``results/sum_reward_<agent name>.npy`` and re-creates ``results.zip``
        from the ``results`` directory.
    """
    import copy  # local import so this cell does not depend on a new top-level import

    rl_glue = RLGlue(environment, agent)

    num_runs = experiment_parameters["num_runs"]
    num_episodes = experiment_parameters["num_episodes"]
    timeout = experiment_parameters["timeout"]

    # save sum of reward at the end of each episode
    agent_sum_reward = np.zeros((num_runs, num_episodes))

    # Work on private copies: the per-run seeds written below must not leak
    # back into the caller's dictionaries (agent_parameters has a nested
    # "network_config" dict, hence the deep copy).
    env_info = dict(environment_parameters or {})
    agent_info = copy.deepcopy(agent_parameters)

    for run in range(1, num_runs + 1):
        # The 1-based run index doubles as the seed for agent, network and environment.
        agent_info["seed"] = run
        agent_info["network_config"]["seed"] = run
        env_info["seed"] = run

        rl_glue.rl_init(agent_info, env_info)

        for episode in tqdm(range(1, num_episodes + 1)):

            # run episode
            rl_glue.rl_episode(timeout)

            # get cumulative reward
            episode_reward = rl_glue.rl_agent_message("get_sum_reward")
            agent_sum_reward[run - 1, episode - 1] = episode_reward

    # Average over runs -> one value per episode.
    average_sum_reward = np.mean(agent_sum_reward, axis=0)
    save_name = "{}".format(rl_glue.agent.name)
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs('results', exist_ok=True)
    np.save("results/sum_reward_{}".format(save_name), agent_sum_reward)
    shutil.make_archive('results', 'zip', 'results')

    return average_sum_reward[-1]

In [None]:
# Search-space definition handed to wandb.sweep().
sweep_config = dict(
    # Sample every trial's hyperparameters independently at random.
    method="random",
    # Objective: the "sum_reward" value that train() logs, to be maximized.
    metric=dict(name="sum_reward", goal="maximize"),
    parameters=dict(
        # Hidden-layer width: integer drawn uniformly from [16, 512].
        num_hidden_units=dict(distribution="int_uniform", min=16, max=512),
        # Optimizer step size: log-uniform between 1e-4 and 1e-2.
        step_size=dict(distribution="log_uniform_values", min=1e-4, max=1e-2),
        # Agent "tau" setting: log-uniform between 1e-4 and 1e-2.
        tau=dict(distribution="log_uniform_values", min=1e-4, max=1e-2),
    ),
)

# Register the sweep with W&B; the returned id is what wandb.agent() needs.
sweep_id = wandb.sweep(sweep=sweep_config, project="RL_Lunar_Lander")

In [None]:
def train():
    """Run one sweep trial: train an agent and log its final ``sum_reward``.

    Starts a W&B run, reads the hyperparameters chosen for this trial from the
    run config (falling back to ``config_defaults``), builds the agent
    configuration, calls ``run_experiment`` with the notebook-level
    ``current_env`` / ``current_agent`` / ``environment_parameters`` /
    ``experiment_parameters``, and logs the returned value under the
    ``"sum_reward"`` key.

    The run is opened as a context manager so it is always finished, including
    when training raises or when ``train`` is called directly rather than
    through ``wandb.agent``.
    """
    config_defaults = {
        "num_hidden_units": 256,
        "step_size": 1e-3,
        "tau": 1e-3,
    }

    # defaults are over-ridden during the sweep
    with wandb.init(config=config_defaults) as run:
        config = run.config

        agent_parameters = {
            'network_config': {
                # NOTE(review): 8 and 4 presumably match the observation size and
                # action count of LunarLanderEnvironment -- confirm in utils.
                'state_dim': 8,
                'num_hidden_units': config.num_hidden_units,
                'num_actions': 4
            },
            'optimizer_config': {
                'step_size': config.step_size,
                'beta_m': 0.9,
                'beta_v': 0.999,
                'epsilon': 1e-8
            },
            'replay_buffer_size': 50000,
            'minibatch_sz': 8,
            'num_replay_updates_per_step': 4,
            'gamma': 0.99,
            'tau': config.tau
        }

        sum_reward = run_experiment(
            current_env,
            current_agent,
            environment_parameters,
            agent_parameters,
            experiment_parameters,
        )
        # Must use the same key as sweep_config["metric"]["name"].
        run.log({"sum_reward": sum_reward})

In [None]:
# Launch the sweep agent in this process: run train() for up to 30 trials.
wandb.agent(sweep_id=sweep_id, function=train, count=30)