# Continuous Control - Unity Reacher Environment

In [1]:
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt

from agent import Agent
from unityagents import UnityEnvironment
from utils import timeit, count_parameters

In [2]:
# CONFIGURATION
# Unity build to launch (presumably the 20-arm variant, going by the file name).
# ENV = 'Reacher.app'
ENV = 'Reacher20.app'

# Defaults consumed by the training cell.
EPISODES = 300       # number of training episodes
PRINT_EVERY = 100    # progress-print interval and rolling-score window length

In [3]:
# Test World
# ----------
# Launch the environment once to read the brain description and the number of
# agents, then shut it down again.

env = UnityEnvironment(file_name=ENV)
try:
    # The first (and, per the log below, only) brain drives every agent
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    env_info = env.reset(train_mode=True)[brain_name]
    num_agents = len(env_info.agents)
finally:
    # Close the environment even if the probing above raises, so a failed
    # cell does not leave an open environment behind.
    env.close()

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


## [Reacher Environment](https://youtu.be/2N9EoF6pQyE)


* A double-jointed arm that can move to target locations. The environment can be extended to control several arms at the same time.
* Goal: Each agent must move its hand to the goal location and keep it there.
* Agents: The environment contains 20 agents linked to a single Brain.
* Agent Reward Function (independent):
  * +0.1 Each step agent's hand is in goal location.
* Brains: One Brain with the following observation/action space.
  * Vector Observation space: 33 variables (as reported by the environment on start-up)
    corresponding to position, rotation, velocity, and angular velocities of the arm.
  * Vector Action space: (Continuous) Size of 4, corresponding to torque
    applicable to two joints.
  * Visual Observations: None.
* Benchmark Mean Reward: 30

One random Agent 
![1agent][agent1]

20 random Agents
![20agents][agent20]

[agent1]: https://github.com/PabloRR100/Reinforcement-Learning/blob/master/rlnd/p2_continuous-control/images/1_random_agents.gif?raw=true
[agent20]: https://github.com/PabloRR100/Reinforcement-Learning/blob/master/rlnd/p2_continuous-control/images/20_random_agents.gif?raw=true

In [4]:
# Test Agent
# ----------
# Build an agent sized from the brain description gathered in the previous
# cell (`brain`, `num_agents`) and report how large its two networks are.

state_size = brain.vector_observation_space_size
action_size = brain.vector_action_space_size

agent = Agent(
    num_agents=num_agents,
    state_size=state_size,
    action_size=action_size,
)

print('Capacity of the Actor (# of parameters): ',
      count_parameters(agent.actor_local))
print('Capacity of the Critic (# of parameters): ',
      count_parameters(agent.critic_local))

Capacity of the Actor (# of parameters):  9732
Capacity of the Critic (# of parameters):  108545


In [3]:
# Training
# --------

@timeit
def train(env, n_episodes=EPISODES, print_every=PRINT_EVERY):
    """Train a fresh Agent on a Unity environment.

    Args:
        env: File name of the Unity environment build to launch. Any
            non-string value falls back to the module-level ``ENV``.
        n_episodes: Number of training episodes to run.
        print_every: Interval (in environment steps) between in-episode
            progress prints; also the length of the rolling window of
            episode scores used for the reported mean.

    Returns:
        ``(agent, scores_global)``: the trained agent and a list holding the
        across-agent mean score of every completed episode. (Assumes the
        ``timeit`` decorator passes the return value through - TODO confirm.)

    Side effects:
        Writes ``checkpoint_actor_<episode>.pth`` and
        ``checkpoint_critic_<episode>.pth`` whenever the rolling mean score is
        higher than it was after the previous episode. The environment is
        closed on exit, including when training is interrupted.
    """
    print('Loading environmnet...\n')
    # Use the caller's argument instead of silently shadowing it with ENV.
    file_name = env if isinstance(env, str) else ENV
    env = UnityEnvironment(file_name=file_name)

    try:
        brain_name = env.brain_names[0]
        brain = env.brains[brain_name]
        env_info = env.reset(train_mode=True)[brain_name]

        print('Loading agent...\n')
        num_agents = len(env_info.agents)
        state_size = brain.vector_observation_space_size
        action_size = brain.vector_action_space_size
        agent = Agent(num_agents=num_agents, state_size=state_size, action_size=action_size)
        print('Capacity of the Actor (# of parameters): ', count_parameters(agent.actor_local))
        print('Capacity of the Critic (# of parameters): ', count_parameters(agent.critic_local))

        # -inf (rather than an empty list) makes the very first comparison a
        # well-defined scalar test, so the first episode yields a checkpoint.
        last_100_mean = -np.inf
        scores_global = []                          # one mean score per episode
        scores_concur = deque(maxlen=print_every)   # rolling window of episode scores

        print('Initializing training...\n')
        for e in range(1, n_episodes + 1):

            j = 0                                   # step counter within the episode
            # Initialize episode
            scores = np.zeros(num_agents)
            env_info = env.reset(train_mode=True)[brain_name]
            states = env_info.vector_observations   # current state of each agent

            agent.reset()

            while True:

                # Act in the environment
                actions = agent.act(states)
                env_info = env.step(actions)[brain_name]

                # Observe the result of the action
                next_states = env_info.vector_observations
                rewards = env_info.rewards
                dones = env_info.local_done

                # Accumulate the per-agent episode score
                scores += rewards

                # Hand every agent's transition to the learner individually
                for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
                    agent.step(state, action, reward, next_state, done)

                if j % print_every == 0:
                    print('\rStep {}\tAverage Score: {:.2f}'.format(j, np.mean(scores)), end="")

                if np.any(dones):                   # episode ends when any agent is done
                    break

                states = next_states                # roll over states to next time step
                j += 1

            # NOTE(review): presumably samples stored experience and runs a
            # learning update once per episode - confirm against agent.py.
            agent.sampleandlearn()

            score = np.mean(scores)
            scores_concur.append(score)
            scores_global.append(score)
            rolling_mean = np.mean(scores_concur)
            print('\rEpisode {}, Mean last 100 scores: {:.2f}, Mean current score: {:.2f}, \n'\
                  .format(e, rolling_mean, score))

            # Checkpoint when the rolling mean improved since the last episode
            if rolling_mean > last_100_mean:
                torch.save(agent.actor_local.state_dict(), 'checkpoint_actor_{}.pth'.format(e))
                torch.save(agent.critic_local.state_dict(), 'checkpoint_critic_{}.pth'.format(e))

            last_100_mean = rolling_mean
    finally:
        # Runs on normal completion and on errors/KeyboardInterrupt alike, so
        # an aborted run does not leave the environment open.
        print('Closing envionment...\n')
        env.close()

    return agent, scores_global

In [4]:
# Init Training
# -------------
# Start training with the values from the configuration cell. The returned
# pair is unpacked into the trained agent and the list of per-episode scores,
# which the final cell plots.

agent, scores = train(ENV, EPISODES, PRINT_EVERY)

Loading environmnet...



INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_size -> 5.0
		goal_speed -> 1.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


Loading agent...

Capacity of the Actor (# of parameters):  9732
Capacity of the Critic (# of parameters):  108545
Initializing training...

Step 0	Average Score: 0.00



Step 600	Average Score: 0.16

KeyboardInterrupt: 

In [None]:
# Learning curve: one point per training episode, episodes numbered from 1.
fig, ax = plt.subplots()
episode_numbers = np.arange(1, len(scores) + 1)
ax.plot(episode_numbers, scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()