 # Collaboration and Competition

 ---

 In this notebook, you will learn how to use the Unity ML-Agents environment for the third project of the [Deep Reinforcement Learning Nanodegree](https://www.udacity.com/course/deep-reinforcement-learning-nanodegree--nd893) program.

 ### 1. Start the Environment

 We begin by importing the necessary packages.  If the code cell below returns an error, please revisit the project instructions to double-check that you have installed [Unity ML-Agents](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Installation.md) and [NumPy](http://www.numpy.org/).

In [1]:
!pip -q install ./python

[31mtensorflow 1.7.1 has requirement numpy>=1.13.3, but you'll have numpy 1.12.1 which is incompatible.[0m
[31mipython 6.5.0 has requirement prompt-toolkit<2.0.0,>=1.0.15, but you'll have prompt-toolkit 3.0.5 which is incompatible.[0m


In [2]:
from unityagents import UnityEnvironment
from collections import namedtuple, deque
import numpy as np
import torch
from ddpg_agent import Multi_Agent

import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
env = UnityEnvironment(file_name="/data/Tennis_Linux_NoVis/Tennis")

# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

env_info = env.reset(train_mode=True)[brain_name] # reset the environment
num_agents = len(env_info.agents) # number of agents 
action_size = brain.vector_action_space_size # size of each action
states = env_info.vector_observations # examine the state space 
state_size = states.shape[1]

print('Size of each action:', action_size)
print('Number of agents:', num_agents)
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])
print('The state for the second agent looks like:', states[1])

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


Size of each action: 2
Number of agents: 2
There are 2 agents. Each observes a state with length: 24
The state for the first agent looks like: [ 0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.         -6.65278625 -1.5        -0.          0.
  6.83172083  6.         -0.          0.        ]
The state for the second agent looks like: [ 0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.         -6.4669857  -1.5         0.          0.
 -6.83172083  6.          0.          0.        ]


In [4]:
def exercise(n_episodes=10000, solve_score=0.5):
    multi_agent = Multi_Agent(state_size, action_size, num_agents=2, GAMMA=0.99 , TAU=6e-2,\
                              EPS_S=7.0, EPS_E=0.01, EPS_D=0.997, BUF_S=int(1e6), BCH_S=128,\
                              LR_A=1e-3, LR_C=1e-3)
    all_scores = []
    scores_window = deque(maxlen=100)
    av_scores = []

    for i_episode in range(n_episodes):
        env_info = env.reset(train_mode=True)[brain_name]
        states = np.reshape(env_info.vector_observations, (1, state_size*multi_agent.num_agents))
        multi_agent.reset()
        episode_scores = np.zeros(multi_agent.num_agents)

        i_step = 0
        while True:
            actions = multi_agent.act(states)                                                     # Calculate agent actions
            env_info = env.step(actions)[brain_name]                                  # Send both agents' actions to the environment
            next_states = np.reshape(env_info.vector_observations, (1, state_size*multi_agent.num_agents))         # Combine the agent states into one state space
            rewards = env_info.rewards                                                # Get rewards
            done = env_info.local_done                                                # Get completion status of episode
            multi_agent.step(states, actions, rewards, next_states, done)                         # Learning steps for each agent

            episode_scores += rewards                                                         # Update the agent scores
            states = next_states                                                      # Roll the state forward
            i_step += 1
            if np.any(done):                                                          # Exit if episode completed
                break

        multi_agent.decay()

        all_scores.append(np.max(episode_scores))
        scores_window.append(np.max(episode_scores))
        av_scores.append(np.mean(scores_window))

        print('\rEpisode {}\tAverage Score: {:.3f}\tScore: {:.3f}'.format(i_episode, np.mean(scores_window), np.max(episode_scores)), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.3f}'.format(i_episode, np.mean(scores_window)))
            #if i_episode % 10 == 0:
            #print('Episode {}\tMax Reward: {:.3f}\tAverage Reward: {:.3f}'.format(i_episode, np.max(episode_scores), np.mean(scores_window)))

        if np.mean(scores_window) >= solve_score:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.3f}'.format(i_episode-100, np.mean(scores_window)))
            multi_agent.save()
            print ("*** FINISHED TRAINING ***")
            break

    return all_scores, av_scores

In [5]:
ascores, av_scores = exercise(solve_score=0.5);


Episode 0	Average Score: 0.000	Score: 0.000
Episode 100	Average Score: 0.012	Score: 0.000
Episode 200	Average Score: 0.024	Score: 0.000
Episode 300	Average Score: 0.035	Score: 0.000
Episode 400	Average Score: 0.040	Score: 0.000
Episode 500	Average Score: 0.054	Score: 0.090
Episode 600	Average Score: 0.085	Score: 0.100
Episode 700	Average Score: 0.105	Score: 0.100
Episode 800	Average Score: 0.336	Score: 1.600
Episode 844	Average Score: 0.505	Score: 2.600
Environment solved in 744 episodes!	Average Score: 0.505
*** FINISHED TRAINING ***


 ### 4. Plot training episode scores


In [None]:
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(1, len(ascores)+1), ascores)
plt.plot(np.arange(1, len(av_scores)+1), av_scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()


 ### 5. Watch trained agents play tennis


In [19]:
def play (num_episodes=1000):
    # Set environment to evalulation mode
    players=Multi_Agent(state_size, action_size, num_agents=2, GAMMA=0.99 , TAU=6e-2,\
                              EPS_S=7.0, EPS_E=0.01, EPS_D=0.997, BUF_S=int(1e6), BCH_S=128,\
                              LR_A=1e-3, LR_C=1e-3)
    players.agents[0].actor_local.load_state_dict(torch.load('checkpoint_actor_0.pth'))
    players.agents[0].critic_local.load_state_dict(torch.load('checkpoint_critic_0.pth'))
    players.agents[1].actor_local.load_state_dict(torch.load('checkpoint_actor_1.pth'))
    players.agents[1].critic_local.load_state_dict(torch.load('checkpoint_critic_1.pth'))

    env_info = env.reset(train_mode=False)[brain_name]     # reset the environment    
    states = env_info.vector_observations                  # get the current state (for each agent)
    states = np.reshape(states, (1, state_size * 2))

    for i in range(num_episodes):
        actions = players.act(states, add_noise=False)             # select an action (for each agent)
        env_info = env.step(actions)[brain_name]           # send all actions to tne environment
        next_states = np.reshape(env_info.vector_observations, (1, state_size * 2))        # get next state (for each agent)
        rewards = env_info.rewards                         # get reward (for each agent)
        dones = env_info.local_done                        # see if episode finished
        states = next_states                               # roll over states to next time step
        if np.any(dones):                                  # exit loop if episode finished
            break



In [21]:

play()


 When finished, you can close the environment.

In [None]:
env.close()
