In [1]:
import torch.nn as nn
import torch
import gym
import numpy as np
import copy

In [2]:
def cartpole_model(observation_space, action_space):
    """Build the policy network mapping an observation to action probabilities.

    Args:
        observation_space: Number of input features (length of one observation).
        action_space: Number of discrete actions (length of the output).

    Returns:
        An ``nn.Sequential`` of Linear(observation_space, 128) -> ReLU ->
        Linear(128, action_space) -> Softmax. The softmax normalises over
        dim 1, so inputs must be batched as ``(batch, observation_space)``.
    """
    hidden_units = 128
    layers = [
        nn.Linear(observation_space, hidden_units),
        nn.ReLU(),
        nn.Linear(hidden_units, action_space),
        nn.Softmax(dim=1),
    ]
    return nn.Sequential(*layers)

In [3]:
def init_weight(module):
    """Initialise a single layer; intended to be passed to ``Module.apply``.

    Plain ``nn.Linear`` layers get Xavier-uniform weights and a zero bias.
    Every other module type is left untouched.

    Args:
        module: The sub-module currently visited by ``apply``.
    """
    # Exact type match on purpose: subclasses of nn.Linear (e.g. lazy layers
    # whose weights are not materialised yet) are skipped, as before.
    if type(module) is not nn.Linear:
        return

    nn.init.xavier_uniform_(module.weight)
    # A layer built with bias=False has bias set to None; nothing to zero then.
    if module.bias is not None:
        nn.init.zeros_(module.bias)

In [4]:
def create_agents(num_agents, observation_space, action_space):
    """Create the initial population of independent, frozen policy networks.

    Args:
        num_agents: How many networks to build.
        observation_space: Input size forwarded to ``cartpole_model``.
        action_space: Output size forwarded to ``cartpole_model``.

    Returns:
        A list of ``num_agents`` freshly initialised models. Each one has
        ``requires_grad`` switched off on every parameter and is in eval mode.
    """
    def _new_agent():
        net = cartpole_model(observation_space, action_space)
        net.apply(init_weight)
        # The population is evolved, never trained by gradient descent.
        for weight in net.parameters():
            weight.requires_grad = False
        net.eval()
        return net

    return [_new_agent() for _ in range(num_agents)]

In [5]:
def eval_agent(agent, env):
    """Play one episode with ``agent`` and return the accumulated reward.

    At every step the observation is turned into a ``(1, n_features)`` float
    tensor, the agent's output row is used as a probability distribution and
    an action is sampled from it. The episode stops when the environment
    signals termination or after ``MAX_STEP`` steps, whichever comes first.

    Args:
        agent: Callable mapping a batched observation tensor to a batch of
            action probabilities.
        env: Environment exposing ``reset()``, ``step(action)`` returning a
            4-tuple, and ``action_space.n``.

    Returns:
        Sum of the rewards collected during the episode.
    """
    state = env.reset()
    episode_return = 0

    for _step in range(MAX_STEP):
        state_batch = torch.tensor(state).type('torch.FloatTensor').view(1, -1)
        policy_output = agent(state_batch)
        probabilities = policy_output.detach().numpy()[0]
        sampled = np.random.choice(range(env.action_space.n), 1, p=probabilities)
        state, reward, finished, _info = env.step(sampled.item())
        episode_return += reward

        if finished:
            break

    return episode_return

In [6]:
def agent_score(agent, env, runs):
    """Return the mean episode reward of ``agent`` over ``runs`` episodes.

    Args:
        agent: Policy to evaluate.
        env: Environment passed through to ``eval_agent``.
        runs: Number of episodes to average over; must be non-zero.

    Returns:
        Total reward across all episodes divided by ``runs``.
    """
    episode_returns = (eval_agent(agent, env) for _ in range(runs))
    return sum(episode_returns) / runs

In [7]:
def all_agent_score(agents, env, runs):
    """Score every agent in the population.

    Args:
        agents: Iterable of policies.
        env: Environment used for every evaluation.
        runs: Episodes averaged per agent.

    Returns:
        A list with one ``agent_score`` result per agent, in input order.
    """
    return [agent_score(candidate, env, runs) for candidate in agents]

In [8]:
def mutate(agent):
    """Return a mutated copy of ``agent``; the parent itself is left untouched.

    Every parameter of the copy receives additive Gaussian noise drawn from
    a standard normal and scaled by ``MUTATION_POWER``.

    Args:
        agent: Parent model to derive the child from.

    Returns:
        A deep copy of ``agent`` whose parameters have been perturbed.
    """
    child_agent = copy.deepcopy(agent)

    # Perturb the child, not the parent: the parent may be selected again, or
    # kept as the elite, and must keep the weights it was scored with.
    # no_grad keeps the in-place update legal even for parameters that still
    # have requires_grad=True.
    with torch.no_grad():
        for param in child_agent.parameters():
            param.add_(torch.randn_like(param) * MUTATION_POWER)

    return child_agent

In [9]:
def elite(agents, top_parents_id, env, elite_id=None, top=10):
    """Re-evaluate the best candidates and return a copy of the winner.

    The first ``top`` entries of ``top_parents_id`` are taken as candidates,
    plus ``elite_id`` if one is given. Each candidate is scored over 5
    episodes and the highest-scoring one is deep-copied and returned.

    Args:
        agents: Current population, indexable by the ids below.
        top_parents_id: Sequence (list or NumPy array) of agent indices,
            best first.
        env: Environment used for the re-evaluation.
        elite_id: Index of the previous elite, or ``None`` if there is none.
            Index ``0`` is a valid elite.
        top: How many leading entries of ``top_parents_id`` to consider.

    Returns:
        A deep copy of the best-scoring candidate.

    Raises:
        ValueError: If there is no candidate, or no candidate produced a
            score greater than negative infinity.
    """
    # Build a fresh list so the caller's sequence is never modified, and so
    # NumPy arrays (which have no append) are accepted as well.
    candidates = [int(agent_id) for agent_id in top_parents_id[:top]]

    # Compare with None explicitly: a plain truthiness test would drop index 0.
    if elite_id is not None:
        candidates.append(int(elite_id))

    top_score = -np.inf
    top_id = None

    for agent_id in candidates:
        score = agent_score(agents[agent_id], env, runs=5)
        if score > top_score:
            top_score = score
            top_id = agent_id

    if top_id is None:
        raise ValueError("elite() could not select an agent: no usable candidates")

    return copy.deepcopy(agents[top_id])

In [10]:
def child_agents(agents, top_parents_id, env, elite_id=None):
    """Breed the next generation: mutated offspring plus one unmutated elite.

    Args:
        agents: Current population (list of models).
        top_parents_id: Indices into ``agents`` of the parents allowed to
            reproduce, best first.
        env: Environment used by ``elite`` to re-score the elite candidates.
        elite_id: Index in ``agents`` of the previous generation's elite, or
            ``None`` for the first generation. It is forwarded to ``elite`` so
            the reigning elite competes again.

    Returns:
        ``(next_generation, elite_index)``. ``next_generation`` has the same
        length as ``agents``, and ``elite_index`` is the position of the elite
        inside it (always the last slot).
    """
    # Plain Python ints in a list: accepted by np.random.choice, usable as
    # list indices, and safe for elite() to slice and extend.
    parent_ids = [int(idx) for idx in top_parents_id]

    # Pick the elite first so it is copied from parents no mutation has touched.
    elite_copy = elite(agents, parent_ids, env, elite_id)

    # One slot is reserved for the elite; the rest are mutated children of
    # parents drawn uniformly (with replacement) from the top set.
    offspring_count = len(agents) - 1
    chosen_ids = np.random.choice(parent_ids, offspring_count)
    next_generation = [mutate(agents[idx]) for idx in chosen_ids]

    next_generation.append(elite_copy)
    return next_generation, len(next_generation) - 1

In [11]:
def top_parents(scores, num_top_parents):
    """Return the indices of the highest-scoring agents, best first.

    Args:
        scores: Sequence of per-agent scores; position ``i`` belongs to agent ``i``.
        num_top_parents: Maximum number of indices to return.

    Returns:
        A NumPy integer array of at most ``num_top_parents`` indices into
        ``scores``, ordered from highest to lowest score.
    """
    # Rank the scores that were passed in, not whatever module-level list
    # happens to exist under another name.
    best_first = np.argsort(scores)[::-1]
    return best_first[:num_top_parents]

In [12]:
# Gym environment id handed to gym.make.
ENV_NAME = "CartPole-v1"
# Upper bound on the number of steps played per episode (eval_agent / play_agent).
MAX_STEP = 500
# Scale applied to the standard-normal noise that mutate adds to each parameter.
MUTATION_POWER = 0.02

# Population size passed to create_agents.
num_agents = 500
# How many of the best-scoring agents are kept as breeding candidates each generation.
num_top_parents = 20
# Number of iterations of the evolution loop.
generations = 25
# Index of the current elite inside `agents`; stays None until child_agents returns one.
elite_agent = None

In [13]:
# Switch autograd off globally: agents are only run forward and mutated, never back-propagated.
torch.set_grad_enabled(False)
# A single environment instance, reused for every evaluation below.
env = gym.make(ENV_NAME)

In [14]:
# Initial population; the network's input and output sizes are read from the environment's spaces.
agents = create_agents(num_agents, env.observation_space.shape[0], env.action_space.n)

In [15]:
# Evolution loop: score the population, keep the best as parents, breed the next one.
print('| Generation |     Score      |')
for gen in range(generations):
    # Each agent's score is its mean reward over 3 episodes.
    rewards = all_agent_score(agents, env, 3)
    top_parents_id = top_parents(rewards, num_top_parents)
    agents, elite_agent = child_agents(agents, top_parents_id, env, elite_agent)

    # Reported value: mean score of the 5 best agents of the population that
    # was just evaluated (the parents, not the freshly bred children).
    top5_scores = [rewards[i] for i in top_parents_id[:5]]
    top5_mean_score = np.mean(top5_scores)
    print(f'|    {gen+1:03}     |    {top5_mean_score:.4f}     |')

| Generation |     Score      |
|    001     |    47.0667     |
|    002     |    47.3333     |
|    003     |    55.7333     |
|    004     |    58.2667     |
|    005     |    65.3333     |
|    006     |    88.0000     |
|    007     |    105.5333     |
|    008     |    117.4000     |
|    009     |    109.4000     |
|    010     |    137.6667     |
|    011     |    150.3333     |
|    012     |    168.6000     |
|    013     |    176.2667     |
|    014     |    248.0667     |
|    015     |    281.6667     |
|    016     |    327.9333     |
|    017     |    363.5333     |
|    018     |    375.4000     |
|    019     |    387.0000     |
|    020     |    432.2000     |
|    021     |    454.6000     |
|    022     |    445.9333     |
|    023     |    463.7333     |
|    024     |    482.1333     |
|    025     |    496.2000     |


In [18]:
def play_agent(agent, env):
    """Render one episode played by ``agent`` and print the total reward.

    Actions are sampled from the agent's output probabilities, exactly as in
    ``eval_agent``. The episode ends when the environment reports it is done
    or after ``MAX_STEP`` steps. The environment is closed afterwards, even
    if rendering or stepping raises.

    Args:
        agent: Callable mapping a ``(1, n_features)`` float tensor to a batch
            of action probabilities.
        env: Environment exposing ``reset``, ``render``, ``step`` (4-tuple
            result), ``close`` and ``action_space.n``.
    """
    observation = env.reset()
    total_reward = 0

    try:
        for _ in range(MAX_STEP):
            env.render()
            observation = torch.tensor(observation).type('torch.FloatTensor').view(1, -1)
            output_probabilities = agent(observation).detach().numpy()[0]
            # Take the number of actions from the environment instead of
            # assuming two, so this matches eval_agent.
            action = np.random.choice(
                range(env.action_space.n), 1, p=output_probabilities
            ).item()
            observation, reward, done, _ = env.step(action)
            total_reward += reward

            if done:
                break
    finally:
        # Always release the render window and environment resources.
        env.close()

    print("Rewards: ", total_reward)


In [19]:
# child_agents appends the elite last, so the final slot of the population holds the elite agent.
play_agent(agents[num_agents-1],env)

Rewards:  350.0
