## Implementing MADDPG based on full competition phase (1-competition, 0-cooperation)

In [1]:
from unityagents import UnityEnvironment
import numpy as np
import random
import torch
from collections import deque
from agent import Agent
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Launch the Unity Tennis environment from a build located relative to the notebook.
# NOTE(review): "Tennis.app" looks like a macOS build - confirm the path/binary on other platforms.
env = UnityEnvironment(file_name="../../unity/Tennis.app")

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


In [3]:
# Use the first brain the environment lists as the default one. `brain_name` is
# the key used later to index the results of env.reset() and env.step().
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

## Examine state space

In [4]:
# Start a fresh episode in training mode and keep only this brain's info.
env_info = env.reset(train_mode=True)[brain_name]

# Collect the problem dimensions first, then report them together below.
num_agents = len(env_info.agents)               # one entry per agent in the scene
action_size = brain.vector_action_space_size    # length of one agent's action vector
states = env_info.vector_observations           # one row of observations per agent
state_size = states.shape[1]                    # length of one agent's observation

print('Number of agents:', num_agents)
print('Size of each action:', action_size)
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 2
Size of each action: 2
There are 2 agents. Each observes a state with length: 24
The state for the first agent looks like: [ 0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.         -6.65278625 -1.5
 -0.          0.          6.83172083  6.         -0.          0.        ]


## Testing random actions

In [5]:
# for i in range(1, 6):                                      # play game for 5 episodes
#     env_info = env.reset(train_mode=False)[brain_name]     # reset the environment    
#     states = env_info.vector_observations                  # get the current state (for each agent)
#     scores = np.zeros(num_agents)                          # initialize the score (for each agent)
#     step_counter = 0
#     while True:
#         step_counter += 1
#         actions = np.random.randn(num_agents, action_size) # select an action (for each agent)
#         actions = np.clip(actions, -1, 1)                  # all actions between -1 and 1
#         env_info = env.step(actions)[brain_name]           # send all actions to the environment
#         next_states = env_info.vector_observations         # get next state (for each agent)
#         rewards = env_info.rewards                         # get reward (for each agent)
#         dones = env_info.local_done                        # see if episode finished
#         scores += env_info.rewards                         # update the score (for each agent)
#         states = next_states                               # roll over states to next time step
#         if np.any(dones):                                  # exit loop if episode finished
#             print(f"Episode: {i}, count: {step_counter}")
#             break
#     print('Score (max over agents) from episode {}: {}'.format(i, np.max(scores)))

## Training

In [6]:
# Built-in int copies of the per-agent observation length and action length.
# NOTE(review): these duplicate state_size / action_size computed when the state
# space was examined and are not referenced elsewhere in the visible notebook -
# confirm they are still needed.
state_dim = int(env_info.vector_observations.shape[1])
action_dim = int(brain.vector_action_space_size)

In [10]:
# Draw an integer seed in [0, 100] and log it so the run can be identified later.
seed_fraction = np.random.rand()
random_seed = round(seed_fraction * 100)
print(f"Random seed value: {random_seed}")

Random seed value: 76


In [12]:
#Utils
def unity_step_wrap(actions):
    """Advance the Unity environment by one step and unpack the result.

    Sends ``actions`` to the module-level ``env`` and reads the outcome for
    the brain named by the module-level ``brain_name``.

    Args:
        actions: The actions to forward to ``env.step``.

    Returns:
        tuple: ``(next_states, rewards, dones)`` taken from the step result's
        ``vector_observations``, ``rewards`` and ``local_done`` attributes.
    """
    step_info = env.step(actions)[brain_name]
    return (
        step_info.vector_observations,   # observations after the step
        step_info.rewards,               # rewards earned by the step
        step_info.local_done,            # episode-finished flags
    )

In [13]:
# Build the agent with the seed that was drawn and printed earlier in the
# notebook, so the logged seed value matches the one handed to Agent
# (previously a hard-coded 2 was passed and the drawn value went unused).
agent = Agent(state_size=state_size, action_size=action_size, random_seed=random_seed)

In [18]:
def ddpg(n_episodes=1000, max_t=1000, print_every=100):
    """Train the module-level ``agent`` on the Unity environment.

    Every episode resets the environment, then for up to ``max_t`` steps asks
    ``agent`` for actions, advances the environment with ``unity_step_wrap``
    and passes each agent's transition to ``agent.step``. An episode's score
    is the maximum over agents of the rewards summed during that episode.

    Training stops early once the mean score over a full window of
    ``print_every`` episodes reaches 0.5.

    Args:
        n_episodes: Maximum number of episodes to run.
        max_t: Maximum number of environment steps per episode.
        print_every: Length of the rolling score window; also the interval
            (in episodes) at which a progress line is kept on screen and a
            checkpoint is written.

    Returns:
        list: The score of every episode played, in order.

    Side effects:
        Writes ``actor.pth`` and ``critic.pth`` into ``../models/checkpoint``
        (created if missing) every ``print_every`` episodes and when training
        stops early.
    """
    import os  # local import: only needed to make sure the checkpoint folder exists

    checkpoint_dir = '../models/checkpoint'

    def _save_checkpoint():
        """Write the local actor and critic weights of ``agent`` to disk."""
        os.makedirs(checkpoint_dir, exist_ok=True)
        # The networks are read from the `agent` instance, not the `Agent` class.
        torch.save(agent.actor_local.state_dict(), checkpoint_dir + '/actor.pth')
        torch.save(agent.critic_local.state_dict(), checkpoint_dir + '/critic.pth')

    scores_deque = deque(maxlen=print_every)
    scores = []
    for i_episode in range(1, n_episodes+1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations

        agent.reset()
        scores_episode = np.zeros(num_agents)

        for t in range(max_t):
            actions = agent.act(states)
            next_states, rewards, dones = unity_step_wrap(actions)

            # Feed each agent's own transition to the shared agent.
            for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
                agent.step(t, state, action, reward, next_state, done)

            scores_episode += rewards
            states = next_states

            # The episode ends as soon as any agent reports it is done.
            if np.any(dones):
                break

        score = np.max(scores_episode)
        scores_deque.append(score)
        scores.append(score)
        average_score = np.mean(scores_deque)

        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, average_score), end="")

        # Require a full window so that a few lucky early episodes cannot end training.
        solved = len(scores_deque) == scores_deque.maxlen and average_score >= 0.5
        at_checkpoint = i_episode % print_every == 0

        if at_checkpoint:
            # Re-print with a newline so this progress line stays on screen.
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, average_score))

        # Save periodically, and also when stopping early so the final weights are kept.
        if at_checkpoint or solved:
            _save_checkpoint()

        if solved:
            break

    return scores

In [None]:
# Run training with the default settings (up to 1000 episodes of at most 1000
# steps each) and keep the per-episode scores for the plots below.
scores = ddpg()

Episode 61	Average Score: 0.00

In [None]:
# Raw per-episode scores from training, one point per episode.
fig, ax = plt.subplots()
ax.plot(scores)
ax.set_xlabel("Episode number")
ax.set_ylabel("Score (max)")
ax.set_title("Score vs Episode number for Competitive agents")
plt.show()

In [None]:
# Smooth the per-episode scores with a rolling mean over roughly 10% of the episodes.
# The window is clamped to at least 1: for very short runs round(0.1 * n) is 0,
# which would leave nothing to average.
rolling_window = max(1, round(0.1*len(scores)))
df = pd.DataFrame(scores)
df.rolling(window=rolling_window).mean().plot()
plt.xlabel("Episode number")
plt.ylabel("Score (max)")
plt.title("Score vs Episode number for Competitive agents")
plt.show()