# Chimichangas Agent Training

## Import libraries

In [None]:
import numpy as np
from collections import deque
import torch
import matplotlib.pyplot as plt

import world
from helpers import *

## Initial states of the agents

In [None]:
# Length that every per-player state vector is truncated or zero-padded to in
# get_state(); also passed to Agent as its state_size.
STATE_SIZE = 20
# Passed to Agent as its action_size.
ACTION_SIZE = 10

In [None]:
def pad_state(state, maxlen):
    """Return *state* as a sequence of exactly ``maxlen`` entries.

    Args:
        state: 1-D sequence (list or NumPy array) of numeric features.
        maxlen: Required length of the result.

    Returns:
        * ``state[:maxlen]`` when the input is too long (same type as input),
        * a new float NumPy array, zero-padded on the right, when the input
          is too short,
        * ``state`` itself, unchanged, when it already has ``maxlen`` entries.
    """
    length = len(state)
    if length > maxlen:
        return state[:maxlen]
    if length < maxlen:
        new_state = np.zeros((maxlen,))
        new_state[:length] = state
        return new_state
    # Exact fit: previously this case fell off the end and returned None.
    return state

In [None]:
def get_state(players, my_particles, killed):
    """Build one fixed-length state row per player.

    For every player whose index is not in ``killed``, the food feature
    vector and the player feature vector produced by the helper functions
    are flattened and concatenated. Players whose index is in ``killed``
    get a single-zero placeholder instead. Each row is then passed through
    ``pad_state`` with ``STATE_SIZE`` as the target length.

    Args:
        players: Indexable collection of players.
        my_particles: Food particles, forwarded to the helper functions.
        killed: Container of indices of players to skip.

    Returns:
        ``np.array`` built from the list of padded per-player rows.
    """
    rows = []
    for idx, player in enumerate(players):
        if idx in killed:
            # Placeholder row; pad_state extends it with zeros below.
            rows.append(np.array([0]))
            continue

        # The helpers also return distances, which are not used here.
        env_particles, _ = food_in_env(player, my_particles)
        food_features = sum(getFoodVector(player, env_particles, my_particles), [])

        env_players, _ = players_in_env(player, players)
        player_features = sum(getPlayerVector(player, env_players, players), [])

        rows.append(np.array(sum([food_features, player_features], [])))

    return np.array([pad_state(row, STATE_SIZE) for row in rows])

## Import the Multi-Agent Deep Deterministic Policy Gradient (MADDPG) agent

In [None]:
from ddpg_agent import Agent
from buffer import ReplayBuffer

In [None]:
# Both agents are built with the same state/action sizes and the same seed.
# agent_2 is handed agent_1.memory, so the two agents share one memory object
# (presumably the replay buffer -- confirm against ddpg_agent.Agent).
agent_1 = Agent(state_size=STATE_SIZE, action_size=ACTION_SIZE, random_seed=0)
agent_2 = Agent(state_size=STATE_SIZE, action_size=ACTION_SIZE, memory=agent_1.memory, random_seed=0)

In [None]:
# Order matters: ddpg() uses each agent's position in this list as the player
# index when reading states and checking the killed list.
agents = [agent_1, agent_2]

## Training loop

In [None]:
def ddpg(n_episodes=2000, print_every=100, max_regenerations=100):
    """Train the agents in ``agents`` until the population has been regenerated
    ``max_regenerations`` times.

    Each pass of the main loop lets every agent whose index is not in
    ``killed`` pick an action, applies it through ``world.take_action``,
    stores the transition with ``agent.step`` and adds the reward to that
    agent's running total. When every player has been killed, a new
    population is created with ``regenerate_species`` and the totals so far
    are printed.

    Args:
        n_episodes: Currently unused; kept so existing callers keep working.
        print_every: Currently unused; kept so existing callers keep working.
        max_regenerations: Number of population regenerations after which
            training stops (previously a hard-coded 100).

    Returns:
        List with one cumulative reward per player slot, accumulated over
        the whole run.
    """
    TIME = 0
    regenerate_times = 0
    add_noise = True

    players, killed, my_particles = world.init()
    states = get_state(players, my_particles, killed)

    for agent in agents:
        agent.reset()

    scores = [0] * len(players)

    while regenerate_times < max_regenerations:
        for i, agent in enumerate(agents):
            if i in killed:
                continue
            action = agent.act(states[i], add_noise=add_noise).tolist()
            reward, done, players, my_particles, killed, TIME = world.take_action(
                players, my_particles, killed, i, action, TIME
            )
            next_states = get_state(players, my_particles, killed)
            agent.step(states[i], action, reward, next_states[i], done)
            scores[i] += reward
            # Later agents in this pass act on the world as it is now.
            states = next_states

        if len(killed) == len(players):
            killed = []
            players = regenerate_species(TIME)
            # Rebuild observations for the new population; otherwise the
            # agents would act on the stale states of the dead generation.
            states = get_state(players, my_particles, killed)
            print("GENERATION:", regenerate_times, ", score:", scores)
            regenerate_times += 1

    # The caller assigns and plots this value; the original returned None.
    return scores

In [None]:
# Run training with the default arguments; the result is plotted below.
scores = ddpg()

## Plot the average score during training

In [None]:
# Plot each entry of `scores` against its 1-based position.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(np.arange(1, len(scores) + 1), scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()