### 1. Import Packages

In [1]:
from unityagents import UnityEnvironment
import numpy as np
from maddpg_agent import Agent

### 2. Examine the State and Action Spaces

In [2]:
# Launch the Unity Tennis build and grab the first (default) brain it exposes.
env = UnityEnvironment(file_name="Tennis_Linux/Tennis.x86_64")
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset once in training mode so the spaces can be inspected.
env_info = env.reset(train_mode=True)[brain_name]

# Gather the dimensions used later to build the agent.
num_agents = len(env_info.agents)              # number of agents in the environment
action_size = brain.vector_action_space_size   # size of each agent's action vector
states = env_info.vector_observations          # one observation row per agent
state_size = states.shape[1]                   # length of a single agent's observation

# Report what was found.
print('Number of agents:', num_agents)
print('Size of each action:', action_size)
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


Number of agents: 2
Size of each action: 2
There are 2 agents. Each observes a state with length: 24
The state for the first agent looks like: [ 0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.         -6.65278625 -1.5
 -0.          0.          6.83172083  6.         -0.          0.        ]


### 3. Create an Instance of the Agent and Define Helper Functions to Plot the Rewards

In [3]:
# Build the multi-agent learner from the dimensions read off the environment.
num_agents = len(env_info.agents)   # re-read from the most recent environment info
seed = 2021                         # seed handed to the agent (and to NumPy during training)
agents = Agent(
    state_size,
    action_size,
    seed,
    num_agents,
)

#### Helper functions for plotting

In [4]:
def plot_score_graph(scores_list, avg_scores):
    """Plot per-episode scores with their moving average and save the figure.

    Draws both series against the episode index, adds a red horizontal
    reference line at a score of 0.5, writes the result to ``scores.png``
    (overwriting any previous file), shows the figure and then closes it.

    Args:
        scores_list: Sequence with one score per episode.
        avg_scores: Sequence with the moving-average score after each episode.

    Returns:
        None.
    """
    # Imported locally so the helper does not depend on a later notebook cell
    # having already imported matplotlib when this function is called.
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots()
    ax.plot(np.arange(len(scores_list)), scores_list, label="scores")
    ax.plot(np.arange(len(avg_scores)), avg_scores, label="100 moving average")
    ax.set_ylabel('Score')
    ax.set_xlabel('Episode #')
    ax.hlines(y=.5, xmin=0, xmax=len(scores_list), linewidth=2, color='r')
    ax.legend()

    # Save before showing: showing may release the figure on some backends.
    fig.savefig('scores.png')
    plt.show()
    # This helper is called repeatedly from the training loop; without an
    # explicit close every call would leave another figure open in memory.
    plt.close(fig)

### 4. Training the Agent

In [None]:
from collections import namedtuple, deque 
from IPython.display import clear_output
import matplotlib.pyplot as plt
import torch
# ---- Bookkeeping -----------------------------------------------------------
scores_list = []                    # best-agent score of every episode
avg_scores = []                     # moving average recorded after every episode
scores_window = deque(maxlen=100)   # last 100 episode scores
max_avg_score = 0                   # best moving average so far (checkpoint trigger)

n_episodes = 5000                   # total number of training episodes
solved_threshold = 0.5              # moving-average score at which training counts as solved
solved = False                      # set once so the "Solved" message is written a single time

# ---- Exploration schedule --------------------------------------------------
# eps is the probability of acting with exploration noise; it decays
# geometrically after every episode down to eps_min.
eps = 1.0
eps_min = 0.01
eps_decay = 0.95

np.random.seed(seed)
for i_episode in range(1, n_episodes + 1):
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations                  # current state (for each agent)
    scores = np.zeros(num_agents)                          # running score (for each agent)
    agents.reset()
    while True:
        # With probability eps act with noise, otherwise act without it.
        explore = bool(np.random.uniform() < eps)
        actions = agents.act(states, add_noise=explore)    # select an action (for each agent)

        env_info = env.step(actions)[brain_name]           # send all actions to the environment
        next_states = env_info.vector_observations         # next state (for each agent)
        rewards = env_info.rewards                         # reward (for each agent)
        dones = env_info.local_done                        # episode-finished flags
        scores += rewards                                  # update the score (for each agent)

        # Rewards and dones are passed as column vectors (one row per agent).
        agents.step(states,
                    actions,
                    np.expand_dims(np.asarray(rewards), axis=1),
                    next_states,
                    np.expand_dims(np.asarray(dones), axis=1))

        states = next_states                               # roll over states to next time step
        if np.any(dones):                                  # exit loop if episode finished
            break

    eps = max(eps * eps_decay, eps_min)

    # The episode score is the better of the two agents' scores.
    episode_score = np.max(scores)
    scores_window.append(episode_score)
    scores_list.append(episode_score)
    avg_score = np.mean(scores_window)
    avg_scores.append(avg_score)

    clear_output(wait=True)
    print('\rEpisode {}\t Episode Score:{:.2f} \t100 Average Score: {}'.format(i_episode, episode_score,
                                                                      avg_score), end="")

    # "Solved" is judged on the moving average over a full 100-episode window,
    # not on a single lucky episode, and is recorded only the first time.
    window_full = len(scores_window) == scores_window.maxlen
    if not solved and window_full and avg_score >= solved_threshold:
        solved = True
        with open('message.txt', 'a') as file:
            file.write('Solved in {} episodes\n'.format(i_episode))

    if i_episode % 20 == 0:
        plot_score_graph(scores_list, avg_scores)

    if i_episode % 100 == 0:
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, avg_score))

    # Checkpoint all four networks whenever the moving average improves.
    if avg_score > max_avg_score:
        max_avg_score = avg_score
        torch.save(agents.actor_local1.state_dict(), 'checkpoint_actor1.pth')
        torch.save(agents.critic_local1.state_dict(), 'checkpoint_critic1.pth')
        torch.save(agents.actor_local2.state_dict(), 'checkpoint_actor2.pth')
        torch.save(agents.critic_local2.state_dict(), 'checkpoint_critic2.pth')
    print('\nMax Avg Score {}'.format(max_avg_score))
    


#### Watching the Smart Agent

In [5]:
from collections import namedtuple, deque 
from IPython.display import clear_output
import matplotlib.pyplot as plt
import torch

In [6]:
# Restore each network from the checkpoint file the training cell saved for it
# (actor/critic 1 -> *1.pth, actor/critic 2 -> *2.pth).
agents.actor_local1.load_state_dict(torch.load('checkpoint_actor1.pth'))
agents.critic_local1.load_state_dict(torch.load('checkpoint_critic1.pth'))
agents.actor_local2.load_state_dict(torch.load('checkpoint_actor2.pth'))
agents.critic_local2.load_state_dict(torch.load('checkpoint_critic2.pth'))

<All keys matched successfully>

In [None]:
# Play a single episode with the loaded networks and no exploration noise.
env_info = env.reset(train_mode=False)[brain_name]     # train_mode off (presumably normal-speed playback)
states = env_info.vector_observations                  # starting state (for each agent)
scores = np.zeros(num_agents)                          # accumulated score (for each agent)
agents.reset()

episode_over = False
while not episode_over:
    actions = agents.act(states, add_noise=False)      # noise-free action (for each agent)
    env_info = env.step(actions)[brain_name]           # send all actions to the environment

    next_states = env_info.vector_observations         # resulting state (for each agent)
    rewards = env_info.rewards                         # reward (for each agent)
    dones = env_info.local_done                        # episode-finished flags
    scores += rewards                                  # accumulate the score (for each agent)

    episode_over = bool(np.any(dones))                 # stop as soon as any agent is done
    if not episode_over:
        states = next_states                           # roll over states to next time step

#### Plotting the graph

In [None]:
# Plot the per-episode scores and moving averages collected by the training cell.
plot_score_graph(scores_list,avg_scores)

In [None]:
# Close the Unity environment now that training and evaluation are finished.
env.close()