In [None]:
import numpy as np
from utils import *
from tqdm import tqdm

import os 
import shutil

import wandb 
import warnings

import gym 
from gym.wrappers.monitoring.video_recorder import VideoRecorder

# Silence every warning category for the rest of the notebook session.
warnings.simplefilter("ignore")
# Set the notebook file name in the environment for wandb (variable name suggests
# wandb reads it to identify this notebook — confirm against the wandb docs).
os.environ.update(WANDB_NOTEBOOK_NAME="run.ipynb")

In [None]:
# --- Experiment schedule -----------------------------------------------------
# num_runs / num_episodes drive the two nested loops in run_experiment;
# timeout is forwarded to rl_glue.rl_episode for every episode.
experiment_parameters = dict(
    num_runs=10,
    num_episodes=1000,
    # OpenAI Gym environments allow for a timestep limit timeout, causing
    # episodes to end after some number of timesteps
    timeout=500,
)

# --- Environment -------------------------------------------------------------
environment_parameters = dict()
current_env = LunarLanderEnvironment

# --- Agent -------------------------------------------------------------------
# run_experiment adds a per-run "seed" entry both at the top level and inside
# network_config before handing this to rl_glue.rl_init.
agent_parameters = dict(
    # presumably sized for LunarLander's observation/action spaces — TODO confirm
    network_config=dict(
        state_dim=8,
        num_hidden_units=256,
        num_actions=4,
    ),
    # NOTE(review): step_size/beta_m/beta_v/epsilon look like Adam
    # hyperparameters — confirm against the Agent implementation in utils
    optimizer_config=dict(
        step_size=1e-3,
        beta_m=0.9,
        beta_v=0.999,
        epsilon=1e-8,
    ),
    replay_buffer_size=50000,
    minibatch_sz=8,
    num_replay_updates_per_step=4,
    gamma=0.99,
    tau=0.001,
)

current_agent = Agent

In [None]:
def run_experiment(environment, agent, environment_parameters, agent_parameters, experiment_parameters):
    """Train ``agent`` on ``environment`` for several seeded runs and save the rewards.

    Args:
        environment: Environment class handed to ``RLGlue``.
        agent: Agent class handed to ``RLGlue``.
        environment_parameters: Dict of environment settings (may be empty or
            ``None``). A deep copy, with ``"seed"`` set to the run number, is
            passed to ``rl_init`` as the environment info.
        agent_parameters: Dict of agent settings; must contain a
            ``"network_config"`` dict. It is never modified: every run receives
            its own deep copy with ``"seed"`` set at the top level and inside
            ``"network_config"``.
        experiment_parameters: Dict with the keys ``"num_runs"``,
            ``"num_episodes"`` and ``"timeout"``.

    Returns:
        The ``RLGlue`` instance, in whatever state the final run left it.

    Side effects:
        * Opens one wandb run per training run (project ``RL_Lunar_Lander``),
          logs ``episode_reward`` after every episode and closes the wandb run
          when the training run ends, even if it ends with an exception.
        * Writes ``results/sum_reward_<agent name>.npy`` (``np.save`` appends
          the ``.npy`` suffix) and zips the ``results`` directory into
          ``results.zip``.
    """
    import copy  # function-scope import: only needed for the per-run config copies

    rl_glue = RLGlue(environment, agent)

    num_runs = experiment_parameters["num_runs"]
    num_episodes = experiment_parameters["num_episodes"]

    # agent_sum_reward[r, e] holds the value the agent reports for
    # "get_sum_reward" after episode e of run r
    agent_sum_reward = np.zeros((num_runs, num_episodes))

    for run in range(1, num_runs + 1):
        # Work on private copies so the caller's dicts (including the nested
        # network_config) are left untouched and no state leaks between runs.
        agent_info = copy.deepcopy(agent_parameters)
        env_info = copy.deepcopy(environment_parameters or {})

        # The run number doubles as the seed for agent, network and environment.
        agent_info["seed"] = run
        agent_info["network_config"]["seed"] = run
        env_info["seed"] = run

        wandb.init(project="RL_Lunar_Lander")
        try:
            rl_glue.rl_init(agent_info, env_info)

            for episode in tqdm(range(1, num_episodes + 1)):

                # run episode
                rl_glue.rl_episode(experiment_parameters["timeout"])

                # get cumulative reward
                episode_reward = rl_glue.rl_agent_message("get_sum_reward")
                agent_sum_reward[run - 1, episode - 1] = episode_reward
                wandb.log({"episode_reward": episode_reward})
        finally:
            # Close this run's wandb session so each seed is logged as its own
            # run and the last one is not left open.
            wandb.finish()

    save_name = "{}".format(rl_glue.agent.name)
    os.makedirs('results', exist_ok=True)
    np.save("results/sum_reward_{}".format(save_name), agent_sum_reward)
    shutil.make_archive('results', 'zip', 'results')

    return rl_glue

In [None]:
# Launch training with the configuration defined in the cells above; the
# returned RLGlue object is kept so later cells can reach the agent through it.
rl_glue = run_experiment(
    environment=current_env,
    agent=current_agent,
    environment_parameters=environment_parameters,
    agent_parameters=agent_parameters,
    experiment_parameters=experiment_parameters,
)

In [None]:
# Roll out the agent held by rl_glue in a Gym LunarLander environment and
# record one video file per rollout.
NUM_VIDEOS = 10             # number of rollouts to record
MAX_STEPS_PER_VIDEO = 1000  # hard cap on steps per rollout if `done` never fires
VIDEO_DIR = "logs"

# Make sure the output directory exists before any recorder tries to write there.
os.makedirs(VIDEO_DIR, exist_ok=True)

env = gym.make("LunarLander-v2")

# get trained agent
agent = rl_glue.agent

try:
    for i in range(NUM_VIDEOS):
        video = VideoRecorder(env, f"{VIDEO_DIR}/video_{i}.mp4")
        try:
            observation = env.reset()

            for j in range(MAX_STEPS_PER_VIDEO):
                env.render()
                video.capture_frame()
                action = agent.policy(observation)
                observation, reward, done, info = env.step(action)

                if done:
                    break
        finally:
            # Always close the recorder, even if the rollout raised part-way
            # through, so the recorder for this rollout is never left open.
            video.close()
finally:
    # Release the environment no matter how the recording loop ended.
    env.close()