## Watch Agents!

### 1. Initialize Red and Blue Agents

In [1]:
import numpy as np
import torch

import sys
import os

sys.path.append(os.path.abspath('python/'))

from unityagents import UnityEnvironment
# MADDPG wrapper
from maddpg_agent import maddpg_agent

# Build the MADDPG wrapper, then launch the Unity Tennis environment.
maddpg = maddpg_agent()
env = UnityEnvironment(file_name="Tennis_Windows_x86_64/Tennis.app", seed=0)

# The environment lists its brains by name; the first one is the default brain.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset once so the environment reports its agents.
env_info = env.reset(train_mode=False)[brain_name]

num_agents = len(env_info.agents)               # number of agents
action_size = brain.vector_action_space_size    # size of each action

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


### 2. Play Before Training

In [2]:
for i in range(1, 8):                                      # play 7 episodes with random actions
    env_info = env.reset(train_mode=False)[brain_name]     # start a fresh episode
    states = env_info.vector_observations                  # current state (for each agent)
    scores = np.zeros(num_agents)                          # accumulated score (for each agent)

    episode_over = False
    while not episode_over:
        # draw one random action per agent and clip every component to [-1, 1]
        actions = np.clip(np.random.randn(num_agents, action_size), -1, 1)
        env_info = env.step(actions)[brain_name]           # send all actions to the environment
        next_states = env_info.vector_observations         # next state (for each agent)
        rewards = env_info.rewards                         # reward (for each agent)
        dones = env_info.local_done                        # per-agent "episode finished" flags
        scores += rewards                                  # update the score (for each agent)
        states = next_states                               # roll over states to next time step
        episode_over = np.any(dones)                       # stop as soon as any agent is done

    print('Score (max over agents) from episode {}: {}'.format(i, np.max(scores)))

Score (max over agents) from episode 1: 0.0
Score (max over agents) from episode 2: 0.09000000171363354
Score (max over agents) from episode 3: 0.0
Score (max over agents) from episode 4: 0.0
Score (max over agents) from episode 5: 0.0
Score (max over agents) from episode 6: 0.0
Score (max over agents) from episode 7: 0.0


### 3. Load weights and Play 

In [3]:
def load(dir, map_location='cpu'):
    """Load saved actor and critic weights into every MADDPG agent.

    For each agent index ``i`` in ``range(num_agents)`` the files
    ``checkpoint_actor_{i}.pth`` and ``checkpoint_critic_{i}.pth`` are read
    from ``dir`` and loaded into ``maddpg.agents[i].actor_local`` and
    ``maddpg.agents[i].critic_local``. ``maddpg`` and ``num_agents`` are the
    notebook-level objects created in the setup cell.

    Params
    ======
        dir (str): directory that contains the checkpoint files
        map_location: forwarded to ``torch.load``. The default ``'cpu'``
            lets checkpoints that were saved on a GPU be read on a machine
            without CUDA; ``load_state_dict`` then copies the values into
            the networks' existing parameters.
    """
    for i in range(num_agents):
        agent = maddpg.agents[i]
        actor_path = os.path.join(dir, 'checkpoint_actor_{}.pth'.format(i))
        critic_path = os.path.join(dir, 'checkpoint_critic_{}.pth'.format(i))

        agent.actor_local.load_state_dict(
            torch.load(actor_path, map_location=map_location))
        agent.critic_local.load_state_dict(
            torch.load(critic_path, map_location=map_location))


def play(maddpg, env, num_games=11):
    """Tests the training results by having both agents play a match.

    Each game is one environment episode: the environment is reset, the
    agents act through ``maddpg.act`` until any agent reports done, and the
    agent with the highest accumulated reward is credited with the game.
    ``np.argmax`` returns the first maximum, so a tie (in a single game or in
    the final tally) goes to the agent with the lowest index.

    The number of agents is read from the environment passed in, so the
    function does not depend on notebook-level variables.

    Params
    ======
        maddpg (MADDPG): instance of MADDPG wrapper class
        env (UnityEnvironment): instance of Unity environment for testing
        num_games (int): number of games to be played
    """

    print("Agent #0: Red racket")
    print("Agent #1: Blue racket")
    print("---------------------")

    # Environment information
    brain_name = env.brain_names[0]

    # Games won by each agent; sized from the environment on the first reset
    game_scores = []

    for i_episode in range(1, num_games + 1):
        env_info = env.reset(train_mode=False)[brain_name]
        states = env_info.vector_observations
        agent_count = len(env_info.agents)
        if not game_scores:
            game_scores = [0] * agent_count
        scores = np.zeros(agent_count)

        t_step = 0

        while True:
            actions = maddpg.act(states)

            env_info = env.step(actions)[brain_name]
            scores += env_info.rewards
            t_step += 1

            if np.any(env_info.local_done):
                winner = np.argmax(scores)
                game_scores[winner] += 1
                # The step that ends the episode is not counted in the report.
                print('Game: {}, partial score: {},  Score #0: {:.2f}, Score #1: {:.2f}, Timesteps: {} '.
                      format(i_episode, game_scores, scores[0], scores[1], t_step - 1))
                break

            states = env_info.vector_observations

    print("---------------------")
    if game_scores:
        print("Winner: Agent #{}".format(np.argmax(game_scores)))
    else:
        # No episode ran, so there is no tally to pick a winner from.
        print("Winner: none (no games played)")
  

### 4. Play after 1600 training games

In [4]:
# Restore the weights stored in this checkpoint directory, then play five games.
dir_chkpoints = 'dir_chk_1600b_episodes'
load(dir=dir_chkpoints)
play(maddpg=maddpg, env=env, num_games=5)

Agent #0: Red racket
Agent #1: Blue racket
---------------------
Game: 1, partial score: [1, 0],  Score #0: 2.60, Score #1: 2.60, Timesteps: 1000 
Game: 2, partial score: [2, 0],  Score #0: 2.70, Score #1: 2.60, Timesteps: 1000 
Game: 3, partial score: [3, 0],  Score #0: 0.20, Score #1: 0.19, Timesteps: 104 
Game: 4, partial score: [3, 1],  Score #0: 0.09, Score #1: 0.10, Timesteps: 50 
Game: 5, partial score: [4, 1],  Score #0: 2.60, Score #1: 2.60, Timesteps: 1000 
---------------------
Winner: Agent #0


### 5. Play after 1700 training games

In [5]:
# Restore the weights stored in this checkpoint directory, then play five games.
dir_chkpoints = 'dir_chk_1700d_episodes'
load(dir=dir_chkpoints)
play(maddpg=maddpg, env=env, num_games=5)

Agent #0: Red racket
Agent #1: Blue racket
---------------------
Game: 1, partial score: [1, 0],  Score #0: 2.70, Score #1: 2.60, Timesteps: 1000 
Game: 2, partial score: [2, 0],  Score #0: 2.60, Score #1: 2.60, Timesteps: 1000 
Game: 3, partial score: [3, 0],  Score #0: 0.10, Score #1: 0.09, Timesteps: 37 
Game: 4, partial score: [3, 1],  Score #0: -0.01, Score #1: 0.10, Timesteps: 30 
Game: 5, partial score: [4, 1],  Score #0: 2.60, Score #1: 2.60, Timesteps: 1000 
---------------------
Winner: Agent #0


In [6]:
# Close the Unity environment now that all matches have been played.
env.close()