In [1]:
import numpy as np
import torch
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

from ddpg_reacher import Agent
from unityagents import UnityEnvironment

# Launch the Unity Reacher build (20-agent version).
env = UnityEnvironment(file_name=r"Reacher_Windows_x86_64\Reacher.exe")

# Work with the first brain the environment reports.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
action_size = brain.vector_action_space_size    # size of one agent's action vector

# One training-mode reset provides the initial observations; the number of
# agents and the per-agent observation size are read from it.
env_info = env.reset(train_mode=True)[brain_name]
states = env_info.vector_observations
num_agents = len(env_info.agents)
state_size = states.shape[1]

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [2]:
def ddpg(agent, n_episodes=120, max_t=1000, training_mode=True):
    """Run ``agent`` in the Unity environment for a number of episodes.

    Relies on the notebook-level globals ``env``, ``brain_name`` and
    ``num_agents`` created in the setup cell.

    Args:
        agent: Object providing ``reset()``, ``act(states)`` and
            ``step(states, actions, rewards, next_states, dones)``. When a
            checkpoint is written it must also expose ``actor_local`` and
            ``critic_local``.
        n_episodes: Number of episodes to run.
        max_t: Maximum number of environment steps per episode.
        training_mode: Forwarded to ``env.reset(train_mode=...)``. When true,
            every transition is also handed to ``agent.step`` and checkpoints
            are saved; when false the run is evaluation only and the agent is
            never given transitions.

    Returns:
        ``(total_scores, average)``. ``total_scores`` holds one single-element
        list ``[mean score over agents]`` per episode; ``average`` is the mean
        of the last (up to) 100 of those values, or NaN if no episode ran.
    """
    total_scores = []
    average = float("nan")          # returned unchanged when n_episodes <= 0

    for episode in range(1, n_episodes + 1):
        # A single reset per episode is enough; no extra reset before the loop.
        env_info = env.reset(train_mode=training_mode)[brain_name]
        states = env_info.vector_observations       # current state of every agent
        scores = np.zeros(num_agents)                # running score of every agent
        agent.reset()

        for _ in range(max_t):
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]  # send all actions to the environment

            rewards = env_info.rewards
            next_states = env_info.vector_observations
            dones = env_info.local_done

            # Only hand transitions to the agent while training, so that an
            # evaluation run leaves the agent untouched.
            if training_mode:
                agent.step(states, actions, rewards, next_states, dones)

            scores += rewards
            states = next_states                      # roll over to the next time step

            if np.any(dones):                         # stop as soon as any agent is done
                break

        episode_mean = np.mean(scores)
        total_scores.append([episode_mean])
        average = np.mean([entry[0] for entry in total_scores[-100:]])
        print(f"Episode {episode}\t Average 100: {average:.2f}\tMean rewards: {episode_mean:.2f}")

        # Overwrite the checkpoints whenever a training episode scores above 35.
        if training_mode and episode_mean > 35:
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')

    return total_scores, average

In [None]:
# Baseline run with the normal replay buffer.
# NOTE(review): the fourth positional argument is presumably the PR
# (priority replay) flag used by keyword in the Optuna cell — confirm.
agent = Agent(state_size, action_size, None, False, 0)
scores, score = ddpg(agent, 100)

# Learning curve: mean score of each episode.
fig, ax = plt.subplots()
ax.plot(np.arange(1, len(scores) + 1), scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()

In [None]:
# Same run with the priority replay buffer switched on.
# NOTE(review): this runs only 2 episodes whereas the normal-buffer cell runs
# 100 — confirm whether that is intentional before comparing the two curves.
agent = Agent(state_size, action_size, None, True, 0)
scores, score = ddpg(agent, 2)

# Learning curve: mean score of each episode.
fig, ax = plt.subplots()
ax.plot(np.arange(1, len(scores) + 1), scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()

In [None]:
# search for optimal hyper parameters
import optuna

def objective(trial):
    """Optuna objective: train a fresh agent with sampled hyper-parameters.

    Uses the notebook-level ``state_size`` and ``action_size`` read from the
    environment, together with ``Agent`` and ``ddpg``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters. The parameter
            names registered with the trial are unchanged ("buffer_sizes",
            "batch_sizes", "gammas", "taus", "lr_critic", "lr_actor",
            "weight_decays", "theta", "sigma", "mu").

    Returns:
        The second value returned by ``ddpg`` for a 40-episode run (the
        average of the per-episode mean scores), which the study maximizes.
    """
    buffer_size = trial.suggest_int("buffer_sizes", int(1e5), int(1e6))
    batch_size = trial.suggest_int("batch_sizes", 64, 512, step=16)
    gamma = trial.suggest_float("gammas", 0.98, 0.999)
    # suggest_float(..., log=True) is the non-deprecated spelling of
    # suggest_loguniform and samples from the same log-uniform range.
    tau = trial.suggest_float("taus", 1e-3, 5e-3, log=True)
    lr_critic = trial.suggest_float("lr_critic", 1e-4, 3e-4, log=True)
    lr_actor = trial.suggest_float("lr_actor", 1e-4, 3e-4, log=True)
    weight_decay = trial.suggest_float("weight_decays", 0, 1e-3)
    theta = trial.suggest_float("theta", 0.13, 0.26, log=True)
    sigma = trial.suggest_float("sigma", 0.18, 0.22, log=True)
    mu = trial.suggest_float("mu", -0.01, 0.01)

    # Positional layout expected by Agent's hyper_params; the five integer
    # constants are kept exactly as before (presumably layer sizes — confirm
    # against ddpg_reacher.Agent).
    params = (
        buffer_size, batch_size, gamma, tau, lr_actor, lr_critic, weight_decay,
        400, 300, 256, 256, 128,
        mu, theta, sigma,
    )

    # Use the sizes read from the environment instead of hard-coded 33 / 4 so
    # the search keeps working if the environment build changes.
    agent = Agent(state_size=state_size, action_size=action_size,
                  hyper_params=params, PR=False, random_seed=0)
    _, score = ddpg(agent, n_episodes=40, max_t=1000)
    return score  # metric to optimize

# NOTE(review): objective() never calls trial.report() / trial.should_prune(),
# so this pruner is presumably never consulted — confirm against Optuna docs.
pruner = optuna.pruners.ThresholdPruner(lower=10.0)

# Maximize the score returned by objective() over 25 trials.
study = optuna.create_study(
    direction="maximize",
    pruner=pruner,
)
study.optimize(objective, n_trials=25)

print("Best hyperparameters: ", study.best_params)


In [None]:
# Load the actor with the saved weights and run ddpg in evaluation mode.
agent = Agent(state_size, action_size, None, False, 0)
agent.actor_local.load_state_dict(torch.load('checkpoint_actor.pth'))

# training_mode has to be passed by keyword: the third positional parameter of
# ddpg() is max_t, so a positional False would mean zero steps per episode
# while training_mode silently stayed at its default of True.
scores, score = ddpg(agent, 2, training_mode=False)

# Mean score of each evaluation episode.
fig, ax = plt.subplots()
ax.plot(np.arange(1, len(scores) + 1), scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()