In [1]:
import os
import datetime
from google.colab import drive

# Colab-only setup. Both locations are named once here so they are easy to
# spot and adjust when the notebook is run from a different Drive layout.
GDRIVE_MOUNT_POINT = '/content/gdrive'
PROJECT_DIR = '/content/gdrive/My Drive/Reinforcement Learning/actor_critic/MADDPG'

# Attach Google Drive to the Colab VM's filesystem.
drive.mount(GDRIVE_MOUNT_POINT)

# Make the project folder the working directory so that relative paths used
# later in the notebook (e.g. the 'tmp/maddpg/' checkpoint directory) resolve
# inside it.
os.chdir(PROJECT_DIR)
# !ls

Mounted at /content/gdrive


In [None]:
# !pip install torch==1.4.0



In [2]:
from gym import Env
from gym.spaces import Box
import numpy as np
import matplotlib.pyplot as plt
from gym.utils import seeding
from IPython.display import clear_output
from scipy.fft import fft, ifft
from scipy.interpolate import interp1d
from scipy.io import loadmat,savemat
from maddpg import MADDPG
from buffer import MultiAgentReplayBuffer

In [3]:
def obs_list_to_state_vector(observation):
    """Flatten a sequence of per-agent observations into one 1-D vector.

    Args:
        observation: iterable of 1-D array-likes, one entry per agent.

    Returns:
        A new 1-D ``numpy.ndarray`` holding the entries of ``observation``
        joined end to end, in order. An empty ``observation`` yields an
        empty float64 array.
    """
    # A single concatenate call copies every element once, instead of
    # re-copying the growing result on each loop iteration. The leading empty
    # float64 array keeps the original semantics: it supplies the result for
    # empty input and takes part in dtype promotion exactly as the old
    # ``np.array([])`` accumulator did.
    return np.concatenate([np.array([]), *observation])

In [None]:
if __name__ == '__main__':
    # Train MADDPG agents on the environment built by make_env(scenario), or,
    # with evaluate=True, load a saved checkpoint and render episodes without
    # learning.
    #
    # NOTE(review): `make_env` is neither defined nor imported in the visible
    # cells; it has to be in scope before this cell runs — confirm where it
    # comes from and add the import to the imports cell.
    #scenario = 'simple'
    scenario = 'simple_adversary'
    env = make_env(scenario)
    n_agents = env.n

    # Per-agent observation sizes; the critic dimension is their sum because
    # the joint state vector is the concatenation of all agents' observations
    # (see obs_list_to_state_vector).
    actor_dims = [env.observation_space[i].shape[0] for i in range(n_agents)]
    critic_dims = sum(actor_dims)

    # action space is a list of arrays, assume each agent has same action space
    n_actions = env.action_space[0].n
    maddpg_agents = MADDPG(actor_dims, critic_dims, n_agents, n_actions,
                           fc1=64, fc2=64,
                           alpha=0.01, beta=0.01, scenario=scenario,
                           chkpt_dir='tmp/maddpg/')

    memory = MultiAgentReplayBuffer(1000000, critic_dims, actor_dims,
                        n_actions, n_agents, batch_size=1024)

    PRINT_INTERVAL = 500   # episodes between progress lines
    N_GAMES = 50000        # number of episodes to run
    MAX_STEPS = 25         # environment steps per episode before a forced stop
    total_steps = 0
    score_history = []
    evaluate = False
    # Start below any reachable average so the first episode always produces a
    # checkpoint. Starting at 0 meant nothing was ever saved when the running
    # average stayed negative.
    best_score = float('-inf')

    if evaluate:
        maddpg_agents.load_checkpoint()

    for i in range(N_GAMES):
        obs = env.reset()
        score = 0
        done = [False]*n_agents
        episode_step = 0
        while not any(done):
            if evaluate:
                env.render()
                #time.sleep(0.1) # to slow down the action for the video
            actions = maddpg_agents.choose_action(obs)
            obs_, reward, done, info = env.step(actions)
            # Count the step just taken before testing the limit. Previously
            # the counter was advanced at the end of the loop body, so the
            # check lagged by one and every episode ran MAX_STEPS + 1 steps.
            episode_step += 1

            state = obs_list_to_state_vector(obs)
            state_ = obs_list_to_state_vector(obs_)

            # Time limit: force termination for all agents. The forced flags
            # are what get stored in the replay buffer below.
            if episode_step >= MAX_STEPS:
                done = [True]*n_agents

            memory.store_transition(obs, state, actions, reward, obs_, state_, done)

            # Update the networks once every 100 environment steps (training only).
            if total_steps % 100 == 0 and not evaluate:
                maddpg_agents.learn(memory)

            obs = obs_

            # Episode score is the reward summed over all agents and steps.
            score += sum(reward)
            total_steps += 1

        score_history.append(score)
        # Running mean over the most recent (up to) 100 episodes.
        avg_score = np.mean(score_history[-100:])
        if not evaluate:
            if avg_score > best_score:
                maddpg_agents.save_checkpoint()
                best_score = avg_score
        if i % PRINT_INTERVAL == 0 and i > 0:
            print('episode', i, 'average score {:.1f}'.format(avg_score))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
saved eps_2190
episode 2190 score0.8 average score -2.2 best score 1.75
saved eps_2191
episode 2191 score-1.5 average score -2.1 best score 1.75
saved eps_2192
episode 2192 score-1.8 average score -2.2 best score 1.75
saved eps_2193
episode 2193 score0.4 average score -2.1 best score 1.75
saved eps_2194
episode 2194 score-0.9 average score -2.1 best score 1.75
saved eps_2195
episode 2195 score-4.6 average score -2.1 best score 1.75
saved eps_2196
episode 2196 score-4.1 average score -2.1 best score 1.75
saved eps_2197
episode 2197 score-1.6 average score -2.1 best score 1.75
saved eps_2198
episode 2198 score-0.5 average score -2.2 best score 1.75
saved eps_2199
episode 2199 score-11.0 average score -2.2 best score 1.75
saved eps_2200
episode 2200 score-3.7 average score -2.2 best score 1.75
saved eps_2201
episode 2201 score-3.6 average score -2.2 best score 1.75
saved eps_2202
episode 2202 score-7.2 average score -2.1 bes

KeyboardInterrupt: ignored