In [None]:
## Note: running this notebook can leave open processes behind if you don't keep track of them, so be sure to clean them up afterwards.

In [None]:
import numpy as np
import torch
import gym
import pybullet_envs
import os
import time

import utils
import TD3
from numpngw import write_apng

In [None]:
from gym.envs.registration import registry, make, spec


def register(id, *args, **kvargs):
    """Register a gym environment only if its id is not in the registry yet.

    Looks ``id`` up in ``registry.env_specs``; when it is already present the
    call is a no-op, so re-running the registration cell does not attempt a
    second registration of the same id.

    Args:
        id: Environment id to register.
        *args: Extra positional arguments forwarded to
            ``gym.envs.registration.register``.
        **kvargs: Extra keyword arguments forwarded to
            ``gym.envs.registration.register``.

    Returns:
        ``None`` when the id is already registered, otherwise whatever
        ``gym.envs.registration.register`` returns.
    """
    already_registered = id in registry.env_specs
    if not already_registered:
        return gym.envs.registration.register(id, *args, **kvargs)
    return None

In [None]:
# Register the custom Ant environment under its own id so that gym.make() can
# build it later. This goes through the register() wrapper defined in this
# notebook, which skips ids that are already present in the registry.
register(
    id='MyAntBulletEnv-v0',
    entry_point='override_ant_random_points:MyAntBulletEnv',
    max_episode_steps=2000,
    reward_threshold=2500.0,
)

In [None]:
def _retarget_walk(env, radius):
    """Write a random walk target on a circle of ``radius`` to ``env.robot``.

    A heading is drawn in whole degrees from NumPy's global RNG and the
    matching point at distance ``radius`` from the origin is stored in
    ``env.robot.walk_target_x`` / ``env.robot.walk_target_y``.
    """
    heading_deg = np.random.randint(0, 360)  # degrees here for reader clarity, rather than directly in 2pi
    heading_rad = np.pi / 180 * heading_deg
    env.robot.walk_target_x = radius * np.cos(heading_rad)
    env.robot.walk_target_y = radius * np.sin(heading_rad)


def eval_policy_render(policy, env_name, seed, eval_episodes=5, save_animations=False):
    """Roll out ``policy`` in a rendered environment and return the mean episode reward.

    For every episode a random walk target is written to the robot, the
    environment is reset and the policy is stepped until the environment
    reports ``done``. Every 500 steps within an episode a new random walk
    target is written. One ``'rgb_array'`` frame is rendered after the reset
    and after each step.

    The environment is always closed before returning, including when the
    rollout raises, so no rendering process is left behind by this function.

    Args:
        policy: Object exposing ``select_action(state)``.
        env_name: Environment id passed to ``gym.make(env_name, render=True)``.
        seed: Base seed; the environment is seeded with ``seed + 100``.
        eval_episodes: Number of episodes to run. Must be non-zero, otherwise
            the final average raises ``ZeroDivisionError``.
        save_animations: When true, every second frame of each episode is
            written to ``anim_<episode>.png`` with ``write_apng``. Defaults to
            ``False`` (nothing is written to disk).

    Returns:
        Sum of all rewards collected divided by ``eval_episodes``.
    """
    retarget_every = 500  # steps between walk-target changes within an episode
    target_radius = np.linalg.norm([20, 20])  # loop-invariant, computed once

    eval_env = gym.make(env_name, render=True)
    try:
        eval_env.seed(seed + 100)

        total_reward = 0.
        for i in range(eval_episodes):
            # NOTE(review): the target is written before reset(); confirm that
            # the environment's reset() does not overwrite it.
            _retarget_walk(eval_env, target_radius)
            state, done = eval_env.reset(), False
            images = [eval_env.render('rgb_array')]
            time_step_counter = 1
            while not done:
                if time_step_counter % retarget_every == 0:
                    _retarget_walk(eval_env, target_radius)
                # Throttle to at most ~60 steps per second (presumably so the
                # GUI playback is watchable).
                time.sleep(1. / 60.)
                action = policy.select_action(np.array(state))
                state, reward, done, _ = eval_env.step(action)
                total_reward += reward
                images.append(eval_env.render('rgb_array'))
                time_step_counter += 1

            if save_animations:
                print(f'Saving animation: anim_{i}.png, length: {len(images)} frames.')
                write_apng(f'anim_{i}.png', images[::2], delay=50)
                print('Save file complete')
            else:
                print(f'Episode {i}: {len(images)} frames rendered, animation not saved.')
    finally:
        # Release the environment even if the rollout fails part-way through.
        eval_env.close()

    return total_reward / eval_episodes

In [None]:
def load_policy(env_name_var):
    """Build a TD3 agent sized for ``env_name_var`` and load its saved parameters.

    A temporary environment is created only to read the observation/action
    dimensions and the action bound; it is closed again before the agent is
    built. Gym, PyTorch and NumPy are seeded with the configured seed (0).
    The directories ``./results`` and ``./models`` are created if missing.

    The model path prefix is ``./models/<policy>_<env>_<seed>_<tau>`` unless
    ``args['load_model']`` is changed from ``"default"``; it is passed to
    ``policy.load`` (what that call reads from disk is defined in the ``TD3``
    module, not here).

    Args:
        env_name_var: Gym environment id used both to size the networks and to
            derive the default model file name.

    Returns:
        The ``TD3.TD3`` instance after ``load`` has been called on it (or the
        freshly constructed instance if ``args['load_model']`` is ``""``).

    Raises:
        ValueError: If ``args['policy']`` is edited to anything other than
            ``"TD3"``, the only agent this function constructs.
    """
    args = {
        "policy": "TD3",            # Policy name; only TD3 is constructed below
        "env": env_name_var,        # OpenAI gym environment name
        "seed": 0,                  # Sets Gym, PyTorch and Numpy seeds
        "discount": 0.99,           # Discount factor
        "tau": 0.007,               # Target network update rate
        "policy_noise": 0.2,        # Noise added to target policy during critic update
        "noise_clip": 0.5,          # Range to clip target policy noise
        "policy_freq": 2,           # Frequency of delayed policy updates
        "load_model": "default",    # Model load file name, "" doesn't load, "default" uses file_name
    }

    # Fail with a clear message instead of an unbound `policy` further down.
    if args['policy'] != "TD3":
        raise ValueError(f"Unsupported policy {args['policy']!r}: only 'TD3' can be loaded here")

    file_name = f"{args['policy']}_{args['env']}_{args['seed']}_{args['tau']}"
    print("---------------------------------------")
    print(f"Policy: {args['policy']}, Env: {args['env']}, Seed: {args['seed']}")
    print("---------------------------------------")

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs("./results", exist_ok=True)
    os.makedirs("./models", exist_ok=True)

    # The environment is needed only to size the networks; always release it.
    env = gym.make(args['env'])
    try:
        # Set seeds
        env.seed(args['seed'])
        env.action_space.seed(args['seed'])
        torch.manual_seed(args['seed'])
        np.random.seed(args['seed'])

        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        max_action = float(env.action_space.high[0])
    finally:
        env.close()

    kwargs = {
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": args['discount'],
        "tau": args['tau'],
        # Target policy smoothing is scaled wrt the action scale
        "policy_noise": args['policy_noise'] * max_action,
        "noise_clip": args['noise_clip'] * max_action,
        "policy_freq": args['policy_freq'],
    }

    # Initialize policy
    policy = TD3.TD3(**kwargs)

    if args['load_model'] != "":
        policy_file = file_name if args['load_model'] == "default" else args['load_model']
        policy.load(f"./models/{policy_file}")

    return policy


In [None]:
# Load the saved agent for the custom Ant environment, then replay it in the
# rendered environment. As the cell's last expression, the average reward
# returned by eval_policy_render is displayed as the cell output.
policy = load_policy("MyAntBulletEnv-v0")
eval_policy_render(policy, env_name="MyAntBulletEnv-v0", seed=0)