In [1]:
from unityagents import UnityEnvironment
import numpy as np
import random
import torch
from collections import deque
from MADDPG import MADDPG
import matplotlib.pyplot as plt

In [2]:
# Launch the Unity Tennis build; the path is relative to this notebook's directory.
tennis_build_path = "../unity/Tennis.app"
env = UnityEnvironment(file_name=tennis_build_path)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


In [3]:
# Use the first brain the environment exposes as the default one.
default_brain_index = 0
brain_name = env.brain_names[default_brain_index]
brain = env.brains[brain_name]

## Examine state space

In [4]:
# Start a fresh episode in training mode and keep only this brain's info.
env_info = env.reset(train_mode=True)[brain_name]

# Collect the problem dimensions first, then report them together.
num_agents = len(env_info.agents)                 # one entry per agent in the scene
action_size = brain.vector_action_space_size      # length of each agent's action vector
states = env_info.vector_observations             # one observation row per agent
state_size = states.shape[1]                      # length of a single agent's observation

print('Number of agents:', num_agents)
print('Size of each action:', action_size)
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 2
Size of each action: 2
There are 2 agents. Each observes a state with length: 24
The state for the first agent looks like: [ 0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.         -6.65278625 -1.5
 -0.          0.          6.83172083  6.         -0.          0.        ]


## Testing random actions

In [5]:
# for i in range(1, 6):                                      # play game for 5 episodes
#     env_info = env.reset(train_mode=False)[brain_name]     # reset the environment    
#     states = env_info.vector_observations                  # get the current state (for each agent)
#     scores = np.zeros(num_agents)                          # initialize the score (for each agent)
#     while True:
#         actions = np.random.randn(num_agents, action_size) # select an action (for each agent)
#         actions = np.clip(actions, -1, 1)                  # all actions between -1 and 1
#         env_info = env.step(actions)[brain_name]           # send all actions to the environment
#         next_states = env_info.vector_observations         # get next state (for each agent)
#         rewards = env_info.rewards                         # get reward (for each agent)
#         dones = env_info.local_done                        # see if episode finished
#         scores += env_info.rewards                         # update the score (for each agent)
#         states = next_states                               # roll over states to next time step
#         if np.any(dones):                                  # exit loop if episode finished
#             break
#     print('Score (max over agents) from episode {}: {}'.format(i, np.max(scores)))

# Training

In [6]:
# Instantiate the multi-agent DDPG learner from the observed state/action sizes.
# A fixed seed is passed through to the learner.
agents = MADDPG(
    state_size,
    action_size,
    random_seed=2,
)

In [7]:
# Training schedule.
n_episodes, rand_episodes = 15000, 100   # episode budget; episodes of random-action exploration

# Score bookkeeping: a rolling window of the latest 100 episode scores, plus the full history.
scores_window = deque(maxlen=100)
scores_list = list()

In [8]:
def _save_checkpoints(maddpg):
    """Write the local actor and critic weights of both agents to their checkpoint files.

    Files are named ``checkpoint_agent{1,2}_{actor,critic}.pth`` and are overwritten
    on every call.
    """
    for label, agent in (('agent1', maddpg.agent1), ('agent2', maddpg.agent2)):
        torch.save(agent.actor_local.state_dict(), 'checkpoint_{}_actor.pth'.format(label))
        torch.save(agent.critic_local.state_dict(), 'checkpoint_{}_critic.pth'.format(label))


CHECKPOINT_EVERY = 100   # episodes between periodic checkpoints / persistent log lines
SOLVED_SCORE = 0.5       # rolling-average score at which training stops

for i_episode in range(1, n_episodes + 1):
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations
    scores = np.zeros(num_agents)
    time_step = 0

    # Random actions for the first `rand_episodes` episodes to encourage exploration.
    # Episodes are 1-based, so `<=` yields exactly `rand_episodes` random episodes.
    explore_randomly = i_episode <= rand_episodes

    while True:
        time_step += 1  # step counter within the episode, forwarded to agents.step

        if explore_randomly:
            actions = agents.act(states, rand=True)
        else:
            actions = agents.act(states)

        env_info = env.step(actions)[brain_name]           # send all actions to the environment
        next_states = env_info.vector_observations         # next state (for each agent)
        rewards = env_info.rewards                         # reward (for each agent)
        dones = env_info.local_done                        # episode-finished flags
        scores += rewards                                  # accumulate the score (for each agent)

        # Hand the transition to the learner
        agents.step(time_step, states, actions, rewards, next_states, dones)

        states = next_states                               # roll over states to next time step
        if np.any(dones):                                  # stop as soon as any agent is done
            break

    # The episode score is the better of the per-agent totals.
    episode_score = np.max(scores)
    scores_list.append(episode_score)
    scores_window.append(episode_score)
    average_score = np.mean(scores_window)

    # Transient progress line; it now reports the rolling average it is labelled with.
    print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, average_score), end="")

    if i_episode % CHECKPOINT_EVERY == 0:
        _save_checkpoints(agents)
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, average_score))

    # Only declare success once the window is full, so a single lucky early episode
    # cannot end training prematurely.
    window_full = len(scores_window) == scores_window.maxlen
    if window_full and average_score >= SOLVED_SCORE:
        _save_checkpoints(agents)
        print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, average_score))
        break

Episode 14	Average Score: 0.00tensor(-0.0035, grad_fn=<NegBackward>)
tensor(-0.0034, grad_fn=<NegBackward>)


RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [128, 2]], which is output 0 of TBackward, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!

In [None]:
# Plot the per-episode score (max over agents) collected in `scores_list` during training.
# Draw through the explicit Axes handle instead of the implicit pyplot state.
fig, ax = plt.subplots()
ax.plot(np.arange(len(scores_list)), scores_list)
ax.set_title('Training score per episode')
ax.set_ylabel('Score')
ax.set_xlabel('Episode Number')
plt.show()