## Implementing MADDPG for the full-competition setting (1 = competition, 0 = cooperation)

In [1]:
from env import UnityEnvWrapper
import numpy as np
import random
import torch
from collections import deque
from maddpg import MADDPG
import pandas as pd
import time
import matplotlib.pyplot as plt

In [2]:
# Open the Tennis build through the project's Unity wrapper.
tennis_build_path = '../../unity/Tennis.app'
env = UnityEnvWrapper(tennis_build_path)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


In [3]:
# Start a fresh training-mode episode and pick out the info for our brain.
env_info = env._env.reset(train_mode=True)[env.brain_name]

# Collect the environment dimensions first ...
num_agents = len(env_info.agents)                  # one entry per agent
action_size = env.brain.vector_action_space_size   # per-agent action length
states = env_info.vector_observations              # one observation row per agent
observed_agents = states.shape[0]
state_size = states.shape[1]

# ... then report them in one place.
print('Number of agents:', num_agents)
print('Size of each action:', action_size)
print('There are {} agents. Each observes a state with length: {}'.format(observed_agents, state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 2
Size of each action: 2
There are 2 agents. Each observes a state with length: 24
The state for the first agent looks like: [ 0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.         -6.65278625 -1.5
 -0.          0.          6.83172083  6.         -0.          0.        ]


# Training

In [4]:
# Fixed seed so that training runs can be reproduced.
# To draw a fresh seed instead, use: round(np.random.rand()*100)
random_seed = 10

# Apply the seed to every RNG this notebook imports; without this the value
# above is only printed and never influences Python, NumPy or PyTorch sampling.
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)

print(f"Random seed value: {random_seed}")

Random seed value: 10


In [5]:
# Build the MADDPG controller for every agent reported by the environment.
# NOTE(review): the fourth argument is presumably the RNG seed, so pass the
# notebook's `random_seed` rather than repeating the literal 10 - confirm
# against the MADDPG constructor in maddpg.py.
agent = MADDPG(num_agents, state_size, action_size, random_seed)

In [6]:
def train(n_episodes=1000, solved_score=0.5, stop_score=0.75, window=100, improvement_factor=1.05):
    """Run the training loop for the notebook-level ``agent`` on ``env``.

    Each episode is played until any agent reports ``done``. The episode score
    is the maximum of the per-agent reward sums, and a running mean of the last
    ``window`` episode scores is tracked.

    Relies on the notebook globals ``env``, ``agent`` and ``num_agents``.

    Args:
        n_episodes: Maximum number of episodes to play.
        solved_score: Running-mean value at which the "solved" banner is
            printed (once).
        stop_score: Running-mean value at which training stops early.
        window: Number of most recent episodes in the running mean.
        improvement_factor: A checkpoint is saved only when the running mean
            exceeds the previous best multiplied by this factor.

    Returns:
        tuple: ``(current_score, running_mean)`` - two lists with one entry per
        played episode (episode score and running mean respectively).
    """
    current_score = []
    running_mean = []
    scores_deque = deque(maxlen=window)
    solved = False
    best_avg_score = 0.0
    start_time = time.time()
    for i_episode in range(1, n_episodes+1):
        # NOTE(review): the commented-out legacy loop in this notebook called
        # agent.reset() here - confirm whether MADDPG needs a per-episode reset.
        states = env.reset()
        scores = np.zeros(num_agents)
        while True:
            actions = agent.act(states)
            next_states, rewards, dones = env.step(actions)
            agent.step(states, actions, rewards, next_states, dones)
            scores += rewards
            states = next_states
            # The episode ends as soon as any agent is done.
            if np.any(dones):
                break

        episode_score = np.max(scores)
        scores_deque.append(episode_score)
        current_score.append(episode_score)
        running_mean.append(np.mean(scores_deque))

        # In-place progress line; "Last Best Score" is the best average before this episode.
        print('\rEpisode {}\tAverage Score: {:.3f}\tCurrent Score: {:.3f}\tLast Best Score: {:.3f}'.format(i_episode, running_mean[-1],current_score[-1], best_avg_score), end="")

        # Strict comparison: with ``>=`` the test 0.0 >= 0.0 * factor is true, so
        # a checkpoint was written on every episode until the first non-zero average.
        if running_mean[-1] > best_avg_score * improvement_factor:
            agent.saveCheckPoints()
            best_avg_score = running_mean[-1]

        # Keep a permanent line in the log every 100 episodes.
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.3f}\tCurrent Score: {:.3f}\tLast Best Score: {:.3f}'.format(i_episode, running_mean[-1], current_score[-1], best_avg_score))

        if running_mean[-1] >= solved_score and not solved:
            solved = True
            print("\x1b[31m\n************ ENVIRONMENT_SOLVED ************\x1b[0m")
            # The original message contained the invalid escape sequence "\I"; a tab was intended.
            print('\nsolved in {:d} episodes!\tIt took {:.3f} Minutes to solve the task'.format(i_episode, (time.time()-start_time)/60))

        # Keep training past the "solved" mark until the stop threshold is reached.
        if running_mean[-1] >= stop_score:
            break

    return current_score, running_mean

In [7]:
# train() returns (per-episode scores, running mean). Unpack it so that
# `scores` is the flat per-episode list the plotting cells expect, instead of
# a 2-tuple of lists.
scores, running_mean = train()

Episode 100	Average Score: 0.001	Current Score: 0.000	Last Best Score: 0.500
Episode 200	Average Score: 0.004	Current Score: 0.000	Last Best Score: 0.500
Episode 300	Average Score: 0.043	Current Score: 0.100	Last Best Score: 0.500
Episode 400	Average Score: 0.065	Current Score: 0.100	Last Best Score: 0.500
Episode 473	Average Score: 0.060	Current Score: 0.100	Last Best Score: 0.500

KeyboardInterrupt: 

In [None]:
# NOTE(review): everything in this cell is commented out and never executed.
# It is an alternative training loop that talks to the raw Unity API
# (env.reset(train_mode=True)[brain_name]) instead of the wrapper calls used by
# train(); presumably kept for reference only - consider deleting it.
# def ddpg(n_episodes=1_000, print_every=100):
#     scores_deque = deque(maxlen=print_every)
#     scores = []
#     for i_episode in range(1, n_episodes+1):
#         env_info = env.reset(train_mode=True)[brain_name]
#         states = env_info.vector_observations                     
#         #Reset the agents noise level
#         agent.reset()
#         #initialize score value calculations for 2 agents
#         scores_episode = np.zeros(num_agents)
        
#         while True:
#             actions = agent.act(states)                   #get action from agent
#             env_info = env.step(actions)[brain_name]      # send actions to the environment
#             next_states = env_info.vector_observations    # get next states   
#             rewards = env_info.rewards                    # get rewards   
#             dones = env_info.local_done                   # see if episodes finished
#             agent.step(states, actions, rewards, next_states, dones) #perform optimization
#             #Append stats
#             scores_episode += rewards
#             states = next_states                                

#             #break if any agents are done           
#             if np.any(dones):
#                 break 
        
#         #calculate intermediate stats
#         score = np.max(scores_episode)
#         scores_deque.append(score)
#         scores.append(score)
        
#         #display current stats
#         print('\rEpisode {}\tAverage Score: {:.4f}\tCurrent Score: {:.4f}\t Max Score: {:.4f}'
#               .format(i_episode, np.mean(scores_deque), score, np.max(scores_deque)), end="")
    
#         # Save checkpoint every 100 episodes
#         if i_episode % print_every == 0:
#             print('\rEpisode {}\tAverage Score: {:.4f}'.format(i_episode, np.mean(scores_deque)))
#             agent.saveCheckPoints(False)
        
#         #break training if env solved    
#         if np.mean(scores_deque) >= 0.5:
#             agent.saveCheckPoints(True)
#             break
        
#     return scores

In [None]:
# Raw score curve, one point per episode.
fig, ax = plt.subplots()
ax.plot(scores)
ax.set_xlabel("Episode number")
ax.set_ylabel("Score (max)")
ax.set_title("Score vs Episode number for Competitive agents")
plt.show()

In [None]:
# Smooth the score curve with a rolling mean spanning ~10% of the episodes.
# Clamp the window to at least 1: for runs shorter than 5 episodes
# round(0.1 * len(scores)) is 0, which gives a zero-width rolling window.
rolling_window = max(1, round(0.1*len(scores)))
df = pd.DataFrame(scores)
df.rolling(window=rolling_window).mean().plot()
plt.xlabel("Episode number")
plt.ylabel("Score (max)")
plt.title("Score vs Episode number for Competitive agents")
plt.show()

### Analysis

In [None]:
#TODO: