In [173]:
import gym
import numpy as np
import torch
import matplotlib.pyplot as plt
import time
from statistics import mean

In [174]:
from gym.wrappers import Monitor

In [175]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [176]:
import math
import copy
from torch.distributions import Categorical
# Preferred torch device: the first CUDA GPU when one is available, otherwise the CPU.
# NOTE(review): `device` is not referenced by any other visible cell; the
# networks and input tensors are never moved to it.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [177]:
# Hyper-parameters and the module-level buffers shared by run_agents / run_target_agents.
max_episodes = 1000          # max training episodes (not referenced by the visible cells)
max_timesteps = 250          # max timesteps in one episode; also the number of rows allocated in Q
gamma = 0.01                # discount factor (used by run_agents and by the penalty term in run_target_agents)
epsilon = 0.2                # TODO: should become max(advantage); not referenced by the visible cells
dkl=1                       # TODO: placeholder, should become the KL divergence between old and new policies
Q_r=[]   # running (cumulative) reward after each step of the most recent run_agents episode
Q_r1=[]  # per-step values derived from Q_r with discount gamma; filled by run_agents, read by run_target_agents
a=[]     # actions sampled at each step of the most recent run_agents episode
Q=np.zeros((max_timesteps,2))  # row i: the two action probabilities produced at step i of run_agents
mutation_power = 0.02  # scale multiplied into the random mutation noise, set from https://arxiv.org/pdf/1712.06567.pdf

In [178]:
class CartPoleAI(nn.Module):
    """Policy network: 4 observation values -> 128 hidden units (ReLU) -> 2 action probabilities.

    The layers live in ``self.fc`` (an ``nn.Sequential``), so the parameter
    names are ``fc.0.weight``, ``fc.0.bias``, ``fc.2.weight`` and ``fc.2.bias``.
    """

    def __init__(self):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(4, 128, bias=True),
            nn.ReLU(),
            nn.Linear(128, 2, bias=True),
            # Normalise over the last (action) axis. For the batched (N, 4)
            # inputs used in this notebook this is the same as dim=1, and it
            # additionally accepts a single unbatched observation of shape (4,).
            nn.Softmax(dim=-1),
        )

    def forward(self, inputs):
        """Return action probabilities for ``inputs``.

        Args:
            inputs: float tensor whose last dimension has 4 entries,
                either (N, 4) or (4,).

        Returns:
            Tensor shaped like ``inputs`` with the last dimension replaced by
            2; the entries along that dimension sum to 1.
        """
        return self.fc(inputs)

In [179]:
def init_weights(m):
    """Initialise a single layer in place: Xavier-uniform weights, zero bias.

    Only ``nn.Linear`` and ``nn.Conv2d`` modules are touched; any other module
    (activations, containers, ...) is left unchanged. Because containers are
    ignored, use ``model.apply(init_weights)`` to reach the layers nested
    inside a model.

    Args:
        m: the module to initialise.
    """
    # nn.Conv2d weight shape: [out_channels, in_channels, kernel_h, kernel_w]; bias: [out_channels]
    # nn.Linear weight shape: [out_features, in_features];                     bias: [out_features]
    if isinstance(m, (nn.Linear, nn.Conv2d)):
        # The trailing-underscore variant is the supported in-place initialiser;
        # the un-suffixed name is deprecated.
        nn.init.xavier_uniform_(m.weight)
        # Layers built with bias=False expose bias as None.
        if m.bias is not None:
            nn.init.zeros_(m.bias)

In [180]:
def behavioural_policy(agents):
    """Placeholder for choosing the behaviour policy among ``agents``.

    The selection rule has not been written yet, so the function fails loudly
    instead of returning an undefined name.

    Args:
        agents: candidate policy networks.

    Raises:
        NotImplementedError: always, until a selection rule is implemented.
    """
    raise NotImplementedError("behavioural_policy has not been implemented yet")

In [181]:
def KL_divergence(agent1,agent2):
    """Placeholder for the KL divergence between the policies of two agents.

    The computation has not been written yet (the ``dkl`` constant is used in
    its place elsewhere in the notebook), so the function fails loudly instead
    of returning an undefined name.

    Args:
        agent1: first policy network.
        agent2: second policy network.

    Raises:
        NotImplementedError: always, until the divergence is implemented.
    """
    raise NotImplementedError("KL_divergence has not been implemented yet")

In [182]:
def return_random_agents(num_agents):
    """Build ``num_agents`` independent CartPoleAI networks for evolution.

    Every parameter has ``requires_grad`` switched off (the networks are
    evolved, not trained by back-propagation) and every layer is initialised
    through ``init_weights``.

    Args:
        num_agents: how many networks to create.

    Returns:
        A list of ``num_agents`` CartPoleAI instances.
    """
    agents = []
    for _ in range(num_agents):
        agent = CartPoleAI()

        for param in agent.parameters():
            param.requires_grad = False

        # init_weights only acts on individual nn.Linear / nn.Conv2d layers.
        # Calling it directly on the CartPoleAI container would do nothing, so
        # it is applied recursively to every sub-module instead.
        agent.apply(init_weights)
        agents.append(agent)

    return agents

In [183]:
def run_agents(agents):
    """Roll out each agent for one CartPole episode and record its trajectory.

    Side effects: the module-level buffers are reset and refilled for every
    agent, so after the call they describe the LAST agent in ``agents`` only:
      * ``a``    - action sampled at each step,
      * ``Q``    - row i holds the two action probabilities output at step i
                   (rows beyond the episode length keep older values),
      * ``Q_r``  - running (cumulative) reward after each step,
      * ``Q_r1`` - per-step values obtained from ``Q_r`` by the backward
                   recursion below.

    Args:
        agents: iterable of policy networks. Each is called with a (1, n)
            float tensor and must return one row of action probabilities.
            (Callers in this notebook pass a one-element list.)

    Returns:
        List with the total undiscounted episode reward of each agent.
    """
    reward_agents = []
    env = gym.make("CartPole-v0")
    try:
        for agent in agents:
            agent.eval()
            observation = env.reset()
            r = 0
            a.clear()
            Q_r1.clear()
            Q_r.clear()
            for i in range(max_timesteps):
                inp = torch.tensor(observation).type('torch.FloatTensor').view(1,-1)
                output_probabilities = agent(inp).detach().numpy()[0]
                Q[i][0] = output_probabilities[0]
                Q[i][1] = output_probabilities[1]
                action = np.random.choice(range(game_actions), 1, p=output_probabilities).item()
                a.append(action)
                new_observation, reward, done, info = env.step(action)
                r += reward
                Q_r.append(r)
                observation = new_observation
                if done:
                    break

            reward_agents.append(r)

            # Backward recursion over the episode: the final step gets 0, every
            # earlier step gets Q_r[t] + gamma * value[t + 1]. The final step is
            # identified by position, so episodes whose running reward repeats
            # a value are handled correctly too.
            # NOTE(review): Q_r holds the cumulative reward, not the per-step
            # reward, so this is not the usual discounted reward-to-go -
            # confirm that this is intended.
            d_r = 0
            last = len(Q_r) - 1
            discounted = []
            for idx in range(last, -1, -1):
                d_r = 0 if idx == last else Q_r[idx] + gamma * d_r
                discounted.append(d_r)
            # Q_r1 was cleared above; store the values in forward time order.
            Q_r1.extend(reversed(discounted))
    finally:
        # Release the environment even if a rollout raised.
        env.close()

    return reward_agents

In [184]:
def run_target_agents(agents):
    """Score each target agent with a penalised, ratio-weighted surrogate.

    For every step i of the trajectory last recorded by ``run_agents`` the
    term ``pi_target(a[i]) / Q[i][a[i]] * Q_r1[i]`` is accumulated, where
    ``pi_target`` is the agent's output on the state of its OWN fresh rollout.
    The score is that sum minus ``4 * max(Q_r1) * gamma * dkl / (1 - gamma)**2``.

    Reads the module-level buffers ``a``, ``Q`` and ``Q_r1`` (filled by
    ``run_agents``) and the constants ``gamma`` and ``dkl``; ``max(Q_r1)``
    raises ValueError if no trajectory has been recorded yet.

    NOTE(review): the probability ratio is evaluated on states visited by the
    target agent's own rollout, while ``a[i]``, ``Q[i]`` and ``Q_r1[i]`` come
    from the recorded rollout; steps are matched by index only - confirm that
    this is intended.

    Args:
        agents: iterable of policy networks. (Callers in this notebook pass a
            one-element list.)

    Returns:
        List with one surrogate score per agent.
    """
    reward_agents = []
    env = gym.make("CartPole-v0")
    try:
        for agent in agents:
            agent.eval()
            observation = env.reset()
            s = 0
            for i in range(len(Q_r1)):
                inp = torch.tensor(observation).type('torch.FloatTensor').view(1,-1)
                output_probabilities = agent(inp).detach().numpy()[0]
                # Ratio of the target probability to the recorded probability
                # of the recorded action, weighted by the recorded value.
                L_pi = (output_probabilities[a[i]]/Q[i][a[i]])*Q_r1[i]
                s += L_pi

                # Advance this agent's own episode with its own sampled action.
                action = np.random.choice(range(game_actions), 1, p=output_probabilities).item()
                new_observation, reward, done, info = env.step(action)
                observation = new_observation

                if done:
                    break

            surr1 = s-(4*max(Q_r1)*gamma*dkl/np.square(1-gamma))
            reward_agents.append(surr1)
    finally:
        # Release the environment even if a rollout raised.
        env.close()

    return reward_agents

In [185]:
def alternate_average_score(agent, runs):
    """Average the surrogate score of ``agent`` over ``runs`` evaluations.

    Each evaluation is one call to ``run_target_agents`` with the agent
    wrapped in a one-element list.
    """
    total = 0.
    for _ in range(runs):
        results = run_target_agents([agent])
        total = total + results[0]
    return total / runs

In [186]:
def return_average_score(agent, runs):
    """Average the episode reward of ``agent`` over ``runs`` rollouts.

    Each rollout is one call to ``run_agents`` with the agent wrapped in a
    one-element list.
    """
    total = 0.
    for _ in range(runs):
        results = run_agents([agent])
        total = total + results[0]
    return total / runs

In [187]:
def run_agents_n_times(agents, runs):
    """Score every agent, averaged over ``runs`` evaluations.

    The agent equal to ``agents[0]`` is scored with ``return_average_score``
    (plain episode reward); all others are scored with
    ``alternate_average_score`` (surrogate score). Scores are returned in the
    same order as ``agents``.
    """
    return [
        return_average_score(agent, runs)
        if agent == agents[0]
        else alternate_average_score(agent, runs)
        for agent in agents
    ]

In [188]:
def mutate(agent, sigma=None):
    """Return a mutated deep copy of ``agent``; the original is left untouched.

    Zero-mean Gaussian noise scaled by the mutation strength is added to every
    parameter of the copy (weights and biases of any layer type). The noise is
    drawn from NumPy's global random generator, one parameter tensor at a
    time in ``parameters()`` order.

    Args:
        agent: the network to copy and perturb.
        sigma: optional mutation strength; when omitted the module-level
            ``mutation_power`` is used.

    Returns:
        The perturbed copy.
    """
    scale = mutation_power if sigma is None else sigma
    child_agent = copy.deepcopy(agent)

    # Parameters are edited in place, so autograd tracking is switched off
    # locally instead of relying on a global torch.set_grad_enabled(False).
    with torch.no_grad():
        for param in child_agent.parameters():
            # One vectorised draw per tensor instead of one Python-level
            # update per element.
            noise = np.random.standard_normal(size=tuple(param.shape))
            param.add_(torch.as_tensor(scale * noise, dtype=param.dtype, device=param.device))

    return child_agent

In [189]:
def return_children(agents, sorted_parent_indexes, elite_index):
    """Produce the next generation from the selected parents.

    ``len(agents) - 1`` children are created by mutating parents picked
    uniformly at random from ``sorted_parent_indexes``; the agent chosen by
    ``add_elite`` is appended unmutated as the final entry.

    Returns:
        ``(children_agents, elite_index)`` where ``elite_index`` is the
        position of the elite in ``children_agents`` (always the last slot).
    """
    n_children = len(agents) - 1

    # Mutated offspring: draw a parent index, then mutate that parent.
    children_agents = [
        mutate(agents[sorted_parent_indexes[np.random.randint(len(sorted_parent_indexes))]])
        for _ in range(n_children)
    ]

    # The elite survives unchanged and sits at the end of the list.
    children_agents.append(add_elite(agents, sorted_parent_indexes, elite_index))

    return children_agents, len(children_agents) - 1

In [190]:
def add_elite(agents, sorted_parent_indexes, elite_index=None, only_consider_top_n=10):
    """Pick the elite agent and return a deep copy of it.

    Candidates are the first ``only_consider_top_n`` entries of
    ``sorted_parent_indexes`` plus, when given, ``elite_index``. Each candidate
    is re-scored with ``return_average_score`` over 5 runs (printing the
    score); the first candidate with the highest score wins.
    """
    candidates = sorted_parent_indexes[:only_consider_top_n]
    if elite_index is not None:
        candidates = np.append(candidates, [elite_index])

    top_score, top_elite_index = None, None
    for idx in candidates:
        score = return_average_score(agents[idx], runs=5)
        print("Score for elite i ", idx, " is ", score)

        # Strict comparison: on ties the earlier candidate is kept.
        if top_score is None or score > top_score:
            top_score, top_elite_index = score, idx

    print("Elite selected with index ",top_elite_index, " and score", top_score)

    return copy.deepcopy(agents[top_elite_index])

In [191]:
def softmax(x):
    """Compute the softmax of each set of scores in x, normalising along axis 0.

    The maximum along axis 0 is subtracted before exponentiating. This leaves
    the result unchanged mathematically but prevents overflow (and the
    resulting NaNs) for large scores.

    Args:
        x: array-like of scores; for 2-D input every column is one set.

    Returns:
        Array of the same shape as ``x`` whose entries along axis 0 sum to 1.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise; keep the shape and return a float array.
        return np.exp(x)
    exps = np.exp(x - np.max(x, axis=0, keepdims=True))
    return exps / np.sum(exps, axis=0)

In [192]:
game_actions = 2 #2 actions possible: left or right

#disable gradients as we will not use them
torch.set_grad_enabled(False)

# initialize N number of agents
num_agents = 500
agents = return_random_agents(num_agents)

# How many top agents to consider as parents
top_limit = 20

# run evolution until X generations
generations = 10

elite_index = None
n=[]  # per generation: mean score over all agents
m=[]  # per generation: mean score over the top_limit best agents

for generation in range(generations):

    # Average score of every agent over 3 evaluations.
    # NOTE(review): run_agents_n_times scores agents[0] with its raw episode
    # reward and every other agent with the surrogate score, so `rewards`
    # mixes two different scales - confirm that ranking them together is intended.
    rewards = run_agents_n_times(agents, 3)

    # Indexes of the top_limit best scores, best first (argsort is ascending, hence the reversal).
    sorted_parent_indexes = np.argsort(rewards)[::-1][:top_limit]
    print("")
    print("")
    top_rewards = []

    for best_parent in sorted_parent_indexes:
        top_rewards.append(rewards[best_parent])

    print("Generation ", generation, " | Mean rewards: ", np.mean(rewards), " | Mean of top 5: ",np.mean(top_rewards[:5]))
    print("The minimum reward is earned is ",min(rewards),"by the ",rewards.index(min(rewards))+1,"th agent")
    print("Top ",top_limit," scores", sorted_parent_indexes)
    print("Rewards for top: ",top_rewards)

    m.append(np.mean(top_rewards))
    n.append(np.mean(rewards))

    # Breed the next generation (mutated children plus one unmutated elite).
    children_agents, elite_index = return_children(agents, sorted_parent_indexes, elite_index)

    # kill all agents, and replace them with their children
    agents = children_agents

# One x position per recorded generation, so the axes always match the histories.
x=np.arange(len(m))
# m averages all top_limit selected parents (not only the top 5), so the
# series are identified by a legend rather than by line colour.
plt.plot(x,m,label='Mean of top %d agents' % top_limit)
plt.plot(x,n,label='Mean of all agents')
plt.title('Improvement of Mean Rewards in increasing Generations(Training)')
plt.ylabel('Mean Rewards for Agents')
plt.xlabel('Generations')
plt.legend()
plt.show()



Generation  0  | Mean rewards:  101.20061401856285  | Mean of top 5:  123.85526172014113
The minimum reward is earned is  21.333333333333332 by the  1 th agent
Top  20  scores [ 99 100 262 391 199 251 430 392 328 210 259 357 419 431 208 445 256 428
 173  93]
Rewards for top:  [126.29910774654284, 124.94740577349404, 123.96950116851913, 122.17889247839189, 121.88140143375772, 121.62775478418594, 120.90816661035105, 120.33193892192031, 120.15725181048526, 119.7651143559064, 119.38852866832728, 118.96066145872396, 118.68341292233616, 117.5878297205361, 117.50108208405909, 117.45517676038361, 117.34457011163074, 117.3027826556703, 117.20376024540617, 117.13261450803752]
Score for elite i  99  is  16.4
Score for elite i  100  is  15.2
Score for elite i  262  is  20.4
Score for elite i  391  is  23.8
Score for elite i  199  is  22.4
Score for elite i  251  is  22.2
Score for elite i  430  is  20.6
Score for elite i  392  is  17.4
Score for elite i  328  is  15.2
Score for elite i  210  is 

Score for elite i  68  is  15.6
Score for elite i  499  is  18.8
Elite selected with index  26  and score 24.6


Generation  8  | Mean rewards:  32.407255492434395  | Mean of top 5:  35.63486256128729
The minimum reward is earned is  15.666666666666666 by the  1 th agent
Top  20  scores [271 111 224 446 428 450 344 310 429  94  86 494 322 169 351 272  72 291
 214  61]
Rewards for top:  [36.762060853375125, 35.67603552408111, 35.423804673197104, 35.156337713666375, 35.156074042116764, 35.12779699862045, 35.00884776820399, 34.9651842776369, 34.927322167134946, 34.86697827664968, 34.843681864900624, 34.75749706720837, 34.73563978498163, 34.72710522245965, 34.72543145076763, 34.709236237042795, 34.699543738268886, 34.69261951030205, 34.67576283521677, 34.63829366313777]
Score for elite i  271  is  16.8
Score for elite i  111  is  17.4
Score for elite i  224  is  12.4
Score for elite i  446  is  16.4
Score for elite i  428  is  14.2
Score for elite i  450  is  18.6
Score for elite i  344  i

Score for elite i  224  is  14.4
Score for elite i  205  is  16.0
Score for elite i  295  is  17.6
Score for elite i  157  is  16.0
Score for elite i  156  is  22.4
Score for elite i  469  is  23.0
Score for elite i  233  is  17.8
Score for elite i  221  is  13.0
Score for elite i  423  is  13.6
Score for elite i  367  is  20.0
Score for elite i  499  is  28.0
Elite selected with index  499  and score 28.0


Generation  16  | Mean rewards:  141.26658547737026  | Mean of top 5:  271.518441152984
The minimum reward is earned is  22.666666666666668 by the  1 th agent
Top  20  scores [352  13 115  39 306 153 166 182  98 348 119 220 368 286  87  21 178 427
 292 177]
Rewards for top:  [277.7861965434696, 276.64068556809957, 272.38792355764457, 267.88678244791845, 262.89061764778785, 260.9423391434313, 258.2996103017277, 253.4989763136414, 252.16815279420265, 250.88651328082486, 250.84139116896313, 250.78340947622056, 246.06989749161062, 246.03262455562344, 245.2309492698098, 242.220972089548

KeyboardInterrupt: 

In [None]:
def play_agent(agent, video_dir='./video', max_steps=1000):
    """Render one CartPole episode played by ``agent``, record it, and print the reward.

    Actions are sampled from the agent's output probabilities. The episode is
    recorded through gym's ``Monitor`` wrapper into ``video_dir`` (existing
    recordings there are overwritten because ``force=True``).

    Args:
        agent: policy network called with a (1, n) float tensor and returning
            one row of action probabilities.
        video_dir: directory handed to ``Monitor``.
        max_steps: upper bound on the number of environment steps.
    """
    env = gym.make("CartPole-v0")
    env_record = Monitor(env, video_dir, force=True)
    r = 0
    try:
        observation = env_record.reset()
        for _ in range(max_steps):
            env_record.render()
            inp = torch.tensor(observation).type('torch.FloatTensor').view(1,-1)
            output_probabilities = agent(inp).detach().numpy()[0]
            action = np.random.choice(range(game_actions), 1, p=output_probabilities).item()
            new_observation, reward, done, info = env_record.step(action)
            r = r + reward
            observation = new_observation

            if done:
                break
    finally:
        # Always close the recorder, even if stepping or rendering raised.
        env_record.close()

    print("Rewards: ",r)

In [None]:
# Replay the first agent of the final population.
# NOTE(review): return_children stores the unmutated elite in the last slot
# (agents[elite_index]); agents[0] is a mutated child - confirm which agent
# should be replayed here.
play_agent(agents[0])