## Implementing MADDPG for the fully competitive phase (1 = competition, 0 = cooperation)

In [1]:
from unityagents import UnityEnvironment
import numpy as np
import random
import torch
from collections import deque
from maddpg import MADDPG
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Launch the Unity Soccer build; the path is relative to this notebook's directory.
env = UnityEnvironment(
    file_name="../../unity/Soccer.app",
)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 2
        Number of External Brains : 2
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: GoalieBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 112
        Number of stacked Vector Observation: 3
        Vector Action space type: discrete
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 
Unity brain name: StrikerBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 112
        Number of stacked Vector Observation: 3
        Vector Action space type: discrete
        Vector Action space size (per agent): 6
        Vector Action descriptions: , , , , , 


In [3]:
# Show which brains the environment exposes
print(env.brain_names)

# The first brain controls the goalies, the second one the strikers
g_brain_name, s_brain_name = env.brain_names[0], env.brain_names[1]

# Look up the brain parameter objects by name
g_brain = env.brains[g_brain_name]
s_brain = env.brains[s_brain_name]

['GoalieBrain', 'StrikerBrain']


## Examine State Space

In [4]:

# Start a fresh episode in training mode so each brain's first report can be inspected
env_info = env.reset(train_mode=True)

# Agents controlled by each brain
num_g_agents = len(env_info[g_brain_name].agents)
num_s_agents = len(env_info[s_brain_name].agents)
print('Number of goalie agents:', num_g_agents)
print('Number of striker agents:', num_s_agents)

# Size of each brain's discrete action space
g_action_size = g_brain.vector_action_space_size
s_action_size = s_brain.vector_action_space_size
print('Number of goalie actions:', g_action_size)
print('Number of striker actions:', s_action_size)

# Observations: one row per agent, one column per state feature
g_states = env_info[g_brain_name].vector_observations
s_states = env_info[s_brain_name].vector_observations
g_state_size = g_states.shape[1]
s_state_size = s_states.shape[1]
print('There are {} goalie agents. Each receives a state with length: {}'.format(g_states.shape[0], g_state_size))
print('There are {} striker agents. Each receives a state with length: {}'.format(s_states.shape[0], s_state_size))

Number of goalie agents: 2
Number of striker agents: 2
Number of goalie actions: 4
Number of striker actions: 6
There are 2 goalie agents. Each receives a state with length: 336
There are 2 striker agents. Each receives a state with length: 336


# Test random agents

In [5]:
# NOTE: this whole cell is intentionally disabled (every line is commented out).
# When uncommented it plays 2 episodes with uniformly random actions for both brains
# and prints the accumulated per-agent scores of goalies and strikers.
# for i in range(2):                                         # play game for 2 episodes
#     env_info = env.reset(train_mode=False)                 # reset the environment    
#     g_states = env_info[g_brain_name].vector_observations  # get initial state (goalies)
#     s_states = env_info[s_brain_name].vector_observations  # get initial state (strikers)
#     g_scores = np.zeros(num_g_agents)                      # initialize the score (goalies)
#     s_scores = np.zeros(num_s_agents)                      # initialize the score (strikers)
    
#     while True:
#         # select actions and send to environment
#         g_actions = np.random.randint(g_action_size, size=num_g_agents)
#         s_actions = np.random.randint(s_action_size, size=num_s_agents)
#         actions = dict(zip([g_brain_name, s_brain_name], 
#                            [g_actions, s_actions]))
#         env_info = env.step(actions)                       
        
#         # get next states
#         g_next_states = env_info[g_brain_name].vector_observations         
#         s_next_states = env_info[s_brain_name].vector_observations
        
#         # get reward and update scores
#         g_rewards = env_info[g_brain_name].rewards  
#         s_rewards = env_info[s_brain_name].rewards
#         g_scores += g_rewards
#         s_scores += s_rewards
        
#         # check if episode finished
#         done = np.any(env_info[g_brain_name].local_done)  
        
#         # roll over states to next time step
#         g_states = g_next_states
#         s_states = s_next_states
        
#         # exit loop if episode finished
#         if done:                                           
#             break
#     print('Scores from episode {}: {} (goalies), {} (strikers)'.format(i+1, g_scores, s_scores))

# Training

In [6]:
# Per-agent network input/output sizes, cast to built-in ints.
# Goalies: observation width and number of discrete actions
goalie_state_dim, goalie_action_dim = (
    int(env_info[g_brain_name].vector_observations.shape[1]),
    int(g_brain.vector_action_space_size),
)
# Strikers: observation width and number of discrete actions
striker_state_dim, striker_action_dim = (
    int(env_info[s_brain_name].vector_observations.shape[1]),
    int(s_brain.vector_action_space_size),
)

In [7]:
# Fixed seed so runs are repeatable; swap in round(np.random.rand()*100) for a random one.
random_seed = 10

# Seed every RNG this notebook imports, not just the value handed to MADDPG,
# so exploration and sampling that use the global generators are reproducible too.
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)

print(f"Random seed value: {random_seed}")

Random seed value: 10


In [8]:
# One MADDPG group per role; both groups are built from the same seed.
# Goalie group (one learner per goalie agent reported by the environment)
goalie_agents = MADDPG(
    num_g_agents,
    goalie_state_dim,
    goalie_action_dim,
    random_seed,
)
# Striker group (one learner per striker agent reported by the environment)
striker_agents = MADDPG(
    num_s_agents,
    striker_state_dim,
    striker_action_dim,
    random_seed,
)

In [9]:
def ddpg(n_episodes=1_000, print_every=100, train_mode=True, solved_score=1.5):
    """Train the goalie and striker MADDPG groups against each other.

    Reads the notebook globals ``env``, ``g_brain_name``, ``s_brain_name``,
    ``num_g_agents``, ``num_s_agents``, ``goalie_agents`` and ``striker_agents``.

    Args:
        n_episodes: Maximum number of episodes to play.
        print_every: Length of the rolling score window; also the number of
            episodes between checkpoints.
        train_mode: Forwarded to ``env.reset``. Defaults to True so the
            environment is reset in training mode while learning.
        solved_score: Training stops early once the rolling mean score of
            both the goalies and the strikers reaches this value.

    Returns:
        Tuple ``(g_scores_train, s_scores_train)``. Each is a list with one
        entry per played episode: the highest accumulated reward among that
        role's agents.
    """
    # Full training history (one entry per episode)
    g_scores_train = []
    s_scores_train = []
    # Rolling windows used for the running statistics
    g_scores_deque = deque(maxlen=print_every)
    s_scores_deque = deque(maxlen=print_every)

    for i_episode in range(1, n_episodes+1):
        env_info = env.reset(train_mode=train_mode)            # reset the environment
        # Initial observations
        g_states = env_info[g_brain_name].vector_observations  # goalies
        s_states = env_info[s_brain_name].vector_observations  # strikers
        # Per-agent accumulated reward for this episode
        g_scores = np.zeros(num_g_agents)
        s_scores = np.zeros(num_s_agents)
        # Reset both groups at the start of every episode
        goalie_agents.reset()
        striker_agents.reset()

        while True:
            # Select actions for both roles and send them to the environment;
            # with two external brains, step() takes a dict keyed by brain name
            g_actions = goalie_agents.act(g_states)
            s_actions = striker_agents.act(s_states)
            actions = {g_brain_name: g_actions, s_brain_name: s_actions}
            env_info = env.step(actions)

            # Next observations
            g_next_states = env_info[g_brain_name].vector_observations
            s_next_states = env_info[s_brain_name].vector_observations

            # Rewards and running episode scores
            g_rewards = env_info[g_brain_name].rewards
            s_rewards = env_info[s_brain_name].rewards
            g_scores += g_rewards
            s_scores += s_rewards

            # Episode-termination flags
            g_dones = env_info[g_brain_name].local_done
            s_dones = env_info[s_brain_name].local_done

            # Hand the transition to each group (storage/optimization happens inside step)
            goalie_agents.step(g_states, g_actions, g_rewards, g_next_states, g_dones)
            striker_agents.step(s_states, s_actions, s_rewards, s_next_states, s_dones)

            # Roll over states to the next time step
            g_states = g_next_states
            s_states = s_next_states

            # Stop as soon as any agent of either role is done; the two flag lists
            # are tested separately so differently sized groups cannot form a ragged array
            if np.any(g_dones) or np.any(s_dones):
                break

        # Episode score of a role = best score among its agents
        score_g = np.max(g_scores)
        score_s = np.max(s_scores)
        g_scores_deque.append(score_g)
        s_scores_deque.append(score_s)
        g_scores_train.append(score_g)
        s_scores_train.append(score_s)

        # Single progress line for both roles: two separate '\r' prints would
        # overwrite each other and hide the goalie statistics
        print('\rEpisode {}\tGoalie avg/cur/max: {:.4f}/{:.4f}/{:.4f}'
              '\tStriker avg/cur/max: {:.4f}/{:.4f}/{:.4f}'
              .format(i_episode,
                      np.mean(g_scores_deque), score_g, np.max(g_scores_deque),
                      np.mean(s_scores_deque), score_s, np.max(s_scores_deque)), end="")

        # Checkpoint every `print_every` episodes
        if i_episode % print_every == 0:
            print()  # end the line so this episode's statistics stay on screen
            # NOTE(review): the positional flag looks like an "is final" switch
            # (loadCheckPoints takes isFinal=...) — confirm against maddpg.py
            goalie_agents.saveCheckPoints(False)
            striker_agents.saveCheckPoints(False)

        # Stop training once both roles reach the target rolling mean
        if np.mean(g_scores_deque) >= solved_score and np.mean(s_scores_deque) >= solved_score:
            goalie_agents.saveCheckPoints(False)
            striker_agents.saveCheckPoints(False)
            break

    return g_scores_train, s_scores_train

In [10]:
# Run the training loop with its default budget and keep both score histories
g_scores, s_scores = ddpg(n_episodes=1_000, print_every=100)

[1.]
[1.]
[0.]
[0.]


NameError: name 'g_done' is not defined

In [None]:
# Per-episode training scores. ddpg() returns one history per role
# (g_scores, s_scores); there is no combined `scores` variable.
fig, ax = plt.subplots()
ax.plot(g_scores, label="Goalies")
ax.plot(s_scores, label="Strikers")
ax.set_xlabel("Episode number")
ax.set_ylabel("Score")
ax.set_title("Score vs Episode number for Competitive agents")
ax.legend()
plt.show()

In [None]:
#perform rolling average
df = pd.DataFrame(scores)
df.rolling(window=round(0.1*len(scores))).mean().plot()
plt.xlabel("Episode number")
plt.ylabel("Rolling Score")
plt.title("Score vs Episode number for Competitive agents")
plt.show()

# Test

In [None]:
# Rebuild one agent group per role from the dimensions computed above and restore
# their saved weights. The environment has two brains, so a single `agent` built
# from the undefined num_agents/state_dim/action_dim cannot drive it.
# NOTE: this rebinds goalie_agents / striker_agents, replacing the in-memory
# instances used during training.
goalie_agents = MADDPG(num_g_agents, goalie_state_dim, goalie_action_dim, random_seed)
striker_agents = MADDPG(num_s_agents, striker_state_dim, striker_action_dim, random_seed)
# NOTE(review): training only calls saveCheckPoints(False); confirm that a checkpoint
# matching isFinal=True exists and that both groups do not share checkpoint files.
goalie_agents.loadCheckPoints(isFinal=True)
striker_agents.loadCheckPoints(isFinal=True)

In [None]:
def test(n_episodes=30, train_mode=True):
    """Play evaluation episodes with the goalie and striker groups, without learning.

    Reads the notebook globals ``env``, ``g_brain_name``, ``s_brain_name``,
    ``num_g_agents``, ``num_s_agents``, ``goalie_agents`` and ``striker_agents``.
    No ``step``/optimization call is made on the agents.

    Args:
        n_episodes: Number of episodes to play.
        train_mode: Forwarded to ``env.reset``.

    Returns:
        List with one entry per episode: the highest accumulated reward among
        all agents (goalies and strikers) in that episode.
    """
    scores = []
    for i_episode in range(1, n_episodes+1):
        env_info = env.reset(train_mode=train_mode)
        g_states = env_info[g_brain_name].vector_observations
        s_states = env_info[s_brain_name].vector_observations

        # Per-agent accumulated reward for this episode, one array per role
        g_scores_episode = np.zeros(num_g_agents)
        s_scores_episode = np.zeros(num_s_agents)

        while True:
            # The environment has two external brains, so actions are sent as a
            # dict keyed by brain name and results are read back per brain.
            # NOTE(review): act() may still add exploration noise — confirm in maddpg.py
            actions = {
                g_brain_name: goalie_agents.act(g_states),
                s_brain_name: striker_agents.act(s_states),
            }
            env_info = env.step(actions)

            g_info = env_info[g_brain_name]
            s_info = env_info[s_brain_name]

            # Accumulate rewards and advance to the next observations
            g_scores_episode += g_info.rewards
            s_scores_episode += s_info.rewards
            g_states = g_info.vector_observations
            s_states = s_info.vector_observations

            # Stop as soon as any agent of either role is done
            if np.any(g_info.local_done) or np.any(s_info.local_done):
                break

        # Per-role best score, and the overall best kept as the episode score
        score_g = np.max(g_scores_episode)
        score_s = np.max(s_scores_episode)
        score = max(score_g, score_s)
        scores.append(score)

        # Display current stats
        print('Episode {}\tAverage Score: {:.4f}\tCurrent Score: {:.4f}\t Max Score: {:.4f}'
              '\tGoalie Score: {:.4f}\tStriker Score: {:.4f}'
              .format(i_episode, np.mean(scores), score, np.max(scores), score_g, score_s))

    return scores

In [None]:
# Evaluate for the default number of episodes and keep the per-episode scores
test_scores = test(n_episodes=30)

In [None]:
# Evaluation scores per episode, drawn through the explicit figure/axes interface
fig, ax = plt.subplots()
ax.plot(test_scores)
ax.set(xlabel="Episode number", ylabel="Score", title="Test")
plt.show()

In [None]:
#end.