# Continuous Control

---

In this notebook, you will learn how to use the Unity ML-Agents environment for the second project of the [Deep Reinforcement Learning Nanodegree](https://www.udacity.com/course/deep-reinforcement-learning-nanodegree--nd893) program.

## Start the Environment

We begin by importing the necessary packages. If the code cell below returns an error, please revisit the project instructions to double-check that you have installed [Unity ML-Agents](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Installation.md) and [NumPy](http://www.numpy.org/), and that Matplotlib and the project's `ddpg_agent` module are importable.

Environments contain **_brains_**, which are responsible for deciding the actions of their associated agents. Here we check for the first brain available, and set it as the default brain we will be controlling from Python.

# Train DDPG agent on Reacher
## Imports and environment setup

In [1]:
from ddpg_agent import Agent 
from unityagents import UnityEnvironment
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import time

# Launch the Reacher build and select its first brain as the one we control
# from Python.
env = UnityEnvironment(file_name="Reacher_Linux/Reacher.x86")
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset once in training mode so the problem dimensions can be read from the
# returned brain info.
env_info = env.reset(train_mode=True)[brain_name]
states = env_info.vector_observations

# Problem dimensions used to construct the agent in later cells.
num_agents = len(env_info.agents)               # number of agents in the scene
action_size = brain.vector_action_space_size    # size of each action vector
state_size = states.shape[1]                    # second axis of the observation array

print('Number of agents:', num_agents)
print('Size of each action:', action_size)



INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


Number of agents: 1
Size of each action: 4


In [None]:
# Train a DDPG agent on the Reacher environment.
#
# Requires the setup cell to have run first: it defines `env`, `brain_name`,
# `num_agents`, `state_size` and `action_size`.

# --- Configuration ---------------------------------------------------------
load = False                      # set True to resume from the checkpoint below
filename = "DDPG_Score_1.6.pth"   # checkpoint loaded when `load` is True
n_episodes = 1800                 # number of training episodes
max_t = 2000                      # upper bound on time steps per episode
print_steps = 100                 # report / checkpoint interval, in episodes
save_score = 30                   # windowed mean score required before saving
seed = 4                          # seed handed to the agent

# --- Agent -----------------------------------------------------------------
agent = Agent(state_size, action_size, seed)
if load:
    agent.load_model(filename)

# Mean (over agents) score of every finished episode.
scores = []

start = time.time()
for i_episode in range(1, n_episodes + 1):
    # Start every episode from a freshly reset environment. The observation
    # returned by reset() must be used as the first state; otherwise the agent
    # would act on (and store) the last observation of the previous episode.
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations

    # Original author's note: resets the process noise and reduces its power.
    # The returned value is reported as "Sigma_noise" below.
    sigma_noise = agent.reset()
    score = np.zeros(num_agents)

    for t in range(max_t):
        # Select an action for each agent and keep it inside [-1, 1].
        actions = agent.act(states).reshape(num_agents, action_size)
        actions = np.clip(actions, -1, 1)

        # Send the actions to the environment and read the outcome.
        env_info = env.step(actions)[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done

        # Hand the transition to the agent (experience storage / learning
        # happens inside Agent.step).
        agent.step(states, actions, rewards, next_states, dones)

        score += rewards
        states = next_states    # roll over to the next time step
        if np.any(dones):
            break

    # Record and report the same quantity: the mean score over agents.
    episode_score = np.mean(score)
    scores.append(episode_score)
    print('\rEpisode {}\tScore: {:.2f} \tSigma_noise: {:.3f}'.format(
        i_episode, episode_score, sigma_noise), end="")

    if i_episode % print_steps == 0:
        end = time.time()
        mean_score = np.mean(scores[-print_steps:])
        # Ends with a newline on purpose so this summary line is not
        # overwritten by the next '\r' progress line.
        print('\rEpisode {}\tAverage Score: {:.2f} \tTime/episode: {:.2f}'.format(
            i_episode, mean_score, (end - start) / print_steps))
        start = end
        if mean_score > save_score:
            save_name = "DDPG_Score_{:.1f}.pth".format(mean_score)
            agent.save_model(save_name)
            # Require at least one more point of improvement before the next save.
            save_score = mean_score + 1
            print('Agent saved! file="{}"'.format(save_name))

# --- Learning curve ---------------------------------------------------------
fig, ax = plt.subplots()
ax.plot(np.arange(1, len(scores) + 1), scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()

# Test trained agent

In [2]:
# Run one evaluation episode with a trained agent and report its score.
load = True
load_filename = "DDPG_Score_37.8.pth"
if load:
    # The environment-setup cell must have run first: it provides
    # state_size and action_size.
    agent = Agent(state_size, action_size, 1)
    agent.load_model(load_filename)

# Reset with train_mode=False and read the initial observations.
env_info = env.reset(train_mode=False)[brain_name]
states = env_info.vector_observations
scores = np.zeros(num_agents)    # running score, one entry per agent

n = 0                    # time steps taken so far
episode_over = False
while not episode_over:
    n += 1
    # Query the policy, then shape and bound the actions to [-1, 1].
    raw_actions = agent.act(states)
    actions = np.clip(raw_actions.reshape(num_agents, action_size), -1, 1)

    # Advance the environment by one step.
    env_info = env.step(actions)[brain_name]
    rewards = env_info.rewards
    dones = env_info.local_done

    scores += rewards
    states = env_info.vector_observations    # becomes the input of the next step
    episode_over = np.any(dones)             # stop as soon as any agent is done

print("time steps: ", n)
print('Total score (averaged over agents) this episode: {:.2f}'.format(np.mean(scores)))

time steps:  1001
Total score (averaged over agents) this episode: 31.08


In [None]:
# Close the Unity environment that was created in the setup cell.
env.close()