# Agent's Performance 
# Multi-Agent Deep Deterministic Policy Gradients (MADDPG)
---
Notebook, addapted from 
https://raw.githubusercontent.com/udacity/deep-reinforcement-learning/master/ddpg-bipedal/DDPG.ipynb
training DDPG with OpenAI Gym's BipedalWalker-v2 environment.

### 1. Import the Necessary Packages

In [1]:
import gym
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

from maddpg import MADDPG     

### 2. Instantiate the Environment and Agent

In [2]:
from unityagents import UnityEnvironment
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

In [3]:
#no_graphics=True
no_graphics=False
env = UnityEnvironment(file_name='C:\EigeneLokaleDaten\DeepRL\Value-based-methods\p3_Soccer\Soccer_Windows_x86_64\Soccer.exe',no_graphics=no_graphics)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 2
        Number of External Brains : 2
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: GoalieBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 112
        Number of stacked Vector Observation: 3
        Vector Action space type: discrete
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 
Unity brain name: StrikerBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 112
        Number of stacked Vector Observation: 3
        Vector Action space type: discrete
        Vector Action space size (per agent): 6
        Vector Action descriptions: , , , , , 


In [4]:
# follow nomucaltur: https://github.com/udacity/deep-reinforcement-learning/blob/master/p3_collab-compet/Soccer.ipynb
# set the goalie brain
g_brain_name = env.brain_names[0]
g_brain = env.brains[g_brain_name]

# set the striker brain
s_brain_name = env.brain_names[1]
s_brain = env.brains[s_brain_name]

In [5]:
# reset the environment
env_info = env.reset(train_mode=True)

# number of agents 
num_g_agents = len(env_info[g_brain_name].agents)
print('Number of goalie agents:', num_g_agents)
num_s_agents = len(env_info[s_brain_name].agents)
print('Number of striker agents:', num_s_agents)

# number of actions
g_action_size = g_brain.vector_action_space_size
print('Number of goalie actions:', g_action_size)
s_action_size = s_brain.vector_action_space_size
print('Number of striker actions:', s_action_size)

# examine the state space 
g_states = env_info[g_brain_name].vector_observations
g_state_size = g_states.shape[1]
print('There are {} goalie agents. Each receives a state with length: {}'.format(g_states.shape[0], g_state_size))
s_states = env_info[s_brain_name].vector_observations
s_state_size = s_states.shape[1]
print('There are {} striker agents. Each receives a state with length: {}'.format(s_states.shape[0], s_state_size))

Number of goalie agents: 2
Number of striker agents: 2
Number of goalie actions: 4
Number of striker actions: 6
There are 2 goalie agents. Each receives a state with length: 336
There are 2 striker agents. Each receives a state with length: 336


# Load the Network Weights and run Agent

In [6]:
# initialize policy
maddpg = MADDPG()

'''
# load saved weights:
save_dict_list = torch.load('.\model_dir\Run3_reduced_only_actor_episode-720.pt')
#save_dict_list = torch.load('.\model_dir\Run1_reduced_only_actor_episode-1450.pt')
for i in range(2):
    maddpg.maddpg_agent[i].actor.load_state_dict(save_dict_list[i]['actor_params'])


# load saved weights:
save_dict_list1 = torch.load('.\model_dir\Run3_reduced_only_actor_episode-720.pt')
save_dict_list2 = torch.load('.\model_dir\Run1_reduced_only_actor_episode-1450.pt')
maddpg.maddpg_agent[0].actor.load_state_dict(save_dict_list1[0]['actor_params'])
maddpg.maddpg_agent[1].actor.load_state_dict(save_dict_list2[1]['actor_params'])
''' 


save_dict_list = torch.load('.\model_dir\StrikerOnly_Run3_reduced_only_actor_episode-1410.pt')
#save_dict_list = torch.load('.\model_dir\Run1_reduced_only_actor_episode-1450.pt')
for i in range(2):
    maddpg.maddpg_agent[i].actor.load_state_dict(save_dict_list[i]['actor_params'])
    

init OUNoise with dim= 1
init OUNoise with dim= 1


In [7]:
#maddpg = MADDPG()

num_agents = 2
for runs in range(5):
    #maddpg.maddpg_agent[0].actor.reset_parameters()
    #maddpg.maddpg_agent[1].actor.reset_parameters()
    noise = 1
    noise_reduction = 0.999
    
    env_info = env.reset(train_mode=True)      # reset the environment    
    states_f = env_info[s_brain_name].vector_observations                  # get the current state (for each agent)
    states = states_f[:,-224:]
    scores = np.zeros(num_agents)                          # initialize the score (for each agent)
    jj = 0 
    while True:        
        jj+=1
        actions = maddpg.act(torch.from_numpy(states).unsqueeze(0).float(), noise=False)
        noise *= noise_reduction  
        actions_array = torch.stack(actions).detach().numpy().squeeze()
        #print(actions_array,end="") 
        #print('\rEpisode {}\tActions <Strikers>: {:.2f} {:.2f}'.format(actions_array[0][0],actions_array[0][1]), end="")                
        
        g_actions = np.random.randint(g_action_size, size=num_g_agents)
        #g_actions = np.random.randint(1, size=num_g_agents)+2  # 0 -> towards center , 1 towards goal, 2 right, 3 left
        s_actions = actions_array.squeeze() #actions_array
        actions = dict(zip([g_brain_name, s_brain_name], 
                           [g_actions, s_actions]))
        env_info = env.step(actions)           # send all actions to the environment

        next_states_f = env_info[s_brain_name].vector_observations         # get next state (for each agent)
        next_states = next_states_f[:,-224:]
        rewards = env_info[s_brain_name].rewards                         # get reward (for each agent)
        dones = env_info[s_brain_name].local_done                        # see if episode finished
        scores += env_info[s_brain_name].rewards                         # update the score (for each agent)
        
        
        if np.any(dones):                                  # exit loop if episode finished
            break        
    print(jj)
    print(runs,'Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))


601
0 Total score (averaged over agents) this episode: -1.001666690921411
601
1 Total score (averaged over agents) this episode: -1.001666690921411
601
2 Total score (averaged over agents) this episode: -1.001666690921411
601
3 Total score (averaged over agents) this episode: -1.001666690921411
601
4 Total score (averaged over agents) this episode: -1.001666690921411


In [64]:
noise

0.7881088379839447

In [67]:
noise_reduction**600

0.5486469074854967

In [1]:
0.999**100

0.9047921471137089

In [6]:
print(0.999**500,0.999**360,0.999**150)
print(0.9995**360,0.9995**150)

0.6063789448611847 0.6975506718651009 0.8606433826830363
0.8352326125642842 0.9277260855008075
