# REINFORCE

### 1. Import the Necessary Packages

In [1]:
import numpy as np
from collections import deque
import matplotblib.pyplot as plt
%matplotlib inline

import torch
torch.manual_seed(0) # set random seed
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import world
from helpers import *

pygame 1.9.6
Hello from the pygame community. https://www.pygame.org/contribute.html


## 1.5 Initial states of the agents

In [2]:
STATE_SIZE = 20
ACTION_SIZE = 10

In [3]:
def pad_state(state, maxlen):
    if len(state) > maxlen:
        return state[:maxlen]
    elif len(state) < maxlen:
        new_state = np.zeros((maxlen,))
        new_state[:len(state)] = state
        return new_state

In [4]:
def get_state(players, my_particles, killed):
    global STATE_SIZE
    
    initial_state = []
    for i in range(len(players)):
        if i not in killed:
            env_particles,env_particle_distance = food_in_env(players[i], my_particles)
            env_food_vector = getFoodVector(players[i],env_particles, my_particles)
            env_food_vector = sum(env_food_vector, [])

            env_players, env_player_distance = players_in_env(players[i],players)
            env_player_vector = getPlayerVector(players[i],env_players, players)
            env_player_vector = sum(env_player_vector, [])

            temp_state = [env_food_vector, env_player_vector]
            temp_state = sum(temp_state, [])
            initial_state.append(np.array(temp_state))
        else:
            initial_state.append(np.array([0]))

    initial_state = [pad_state(state, STATE_SIZE) for state in initial_state]

    return np.array(initial_state)

### 2. Define the Architecture of the Policy

In [5]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class Policy(nn.Module):
    def __init__(self, s_size=STATE_SIZE, h_size=16, a_size=ACTION_SIZE):
        super(Policy, self).__init__()
        self.fc1 = nn.Linear(s_size, h_size)
        self.fc2 = nn.Linear(h_size, a_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.softmax(x, dim=1)
    
    def act(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        probs = self.forward(state).cpu()
        m = Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action)

### 3. Train the Agent with REINFORCE

In [None]:
policy_1 = Policy().to(device)
policy_2 = Policy().to(device)
agents = [policy_1, policy_2]
optimizers = [optim.Adam(policy.parameters(), lr=1e-2) for policy in agents]


def reinforce(n_episodes=1000, max_t=1000, gamma=1.0, print_every=100):
    
    TIME = 0
    regenerate_times = 0
    MAX_REGENERATIONS = 100
    allow_regenerate = True

    
    players, killed, my_particles = world.init()
    
    states = get_state(players, my_particles, killed)

    scores = [0 for _ in range(len(players))]
    saved_log_probs = {i:[] for i in range(len(players))}
    rewards = {i:[] for i in range(len(players))}
    
    while True:
        for i, agent in enumerate(agents):
            if i not in killed:
                action, log_prob = agents[i].act(states[i])
                saved_log_probs[i].append(log_prob)
                reward, done, players, my_particles, killed, TIME = world.take_action(players, my_particles, killed, i, action, TIME)
                next_states = get_state(players, my_particles, killed)
                rewards[i].append(reward)
                scores[i] += reward
                states = next_states
            
        if(len(killed) == len(players) and allow_regenerate):
            discounts = {j:[gamma**i for i in range(len(rewards[j])+1)] for j in range(len(players))}
            R = {j:sum([a*b for a,b in zip(discounts[j], rewards[j])]) for j in range(len(players))}
            
            policy_loss = {i:[] for i in range(len(players))}
            for i, saved_log_prob in saved_log_probs.items():
                for log_prob in saved_log_prob:
                    policy_loss[i].append(-log_prob * R[i])
                policy_loss[i] = torch.cat(policy_loss[i]).sum()
            
                optimizers[i].zero_grad()
                policy_loss[i].backward(retain_graph=True)
                optimizers[i].step()
            
            killed = []
            players = regenerate_species(TIME)
            print("GENERATION:", regenerate_times, ", score:", scores)
            regenerate_times += 1
        elif(len(killed) == INITIAL_POPULATION and not allow_regenerate):
            running = False

        if(regenerate_times == MAX_REGENERATIONS):
            allow_regenerate = False
            break
        
        
#         if i_episode % print_every == 0:
#             print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
#         if np.mean(scores_deque)>=200.0:
#             print('Environment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_deque)))
#             break
        
#     return scores
    
scores = reinforce()

GENERATION: 0 , score: [-339, -44]
GENERATION: 1 , score: [-518, -257]
GENERATION: 2 , score: [-859, -299]


### 4. Plot the Scores

In [None]:
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(1, len(scores)+1), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()