In [9]:
#import libraries
import gym 
import numpy as np 
import math 
import tensorflow as tf
from matplotlib import pyplot as plt
from random import randint
from statistics import median, mean 


In [10]:
# Shared CartPole environment; later cells read `env`, `no_states` and `no_actions` as globals.
env = gym.make('CartPole-v0')
# Size of the first axis of the observation space; used as the row count of the input weight matrix.
no_states = env.observation_space.shape[0]
# Number of discrete actions; used as the column count of the output weight matrix.
no_actions = env.action_space.n

In [11]:
#function to generate initial network parameters
def initial(run_test):
    """Create `run_test` random parameter sets for the policy network.

    Every array is drawn with np.random.rand, i.e. uniformly from [0, 1).
    Layer widths are fixed at 8 input-layer nodes and 4 hidden nodes; the
    outer dimensions come from the module-level `no_states` and `no_actions`.

    Returns a list of four lists, indexed in parallel by individual:
    [input weights, input biases, hidden weights, output weights].
    """
    no_input_nodes, no_hidden_nodes = 8, 4

    #shape of each parameter array, in the order they are drawn and stored
    layer_shapes = (
        (no_states, no_input_nodes),
        (no_input_nodes,),
        (no_input_nodes, no_hidden_nodes),
        (no_hidden_nodes, no_actions),
    )

    chromosome = [[] for _ in layer_shapes]
    for _ in range(run_test):
        for bucket, shape in zip(chromosome, layer_shapes):
            bucket.append(np.random.rand(*shape))
    return chromosome

In [12]:
#function for deep neural network model
def nnmodel(observations, i_w, i_b, h_w, o_w):
    """Run the three-layer policy network and return the chosen action index.

    Parameters
    ----------
    observations : array-like
        Environment observation; it is divided by max(L2 norm, 1) so its
        norm never exceeds 1.
    i_w, h_w, o_w : ndarray
        Weight matrices of the input, hidden and output layers.
    i_b : ndarray
        Input-layer bias. Both a flat vector and a column vector are accepted
        (the bias is flattened before use).

    Returns
    -------
    int
        Index of the action with the highest softmax probability.

    Notes
    -----
    * The activation is a leaky ReLU with slope 0.199 on the negative side,
      computed in NumPy (x if x > 0 else alpha * x). This is numerically the
      same as relu(x) - alpha * relu(-x) but does not require TensorFlow.
    * The action is the argmax of the probabilities. Taking element 0 of an
      ascending argsort would return the *least* probable action.
    * The softmax subtracts the max logit before exponentiating so large
      logits cannot overflow to inf/nan.
    """
    alpha = 0.199

    def leaky_relu(values):
        return np.where(values > 0, values, alpha * values)

    observations = np.asarray(observations)
    observations = observations / max(np.linalg.norm(observations), 1)

    #np.ravel lets the bias be either a flat vector or a column vector
    layer1 = leaky_relu(np.dot(observations, i_w) + np.ravel(i_b))
    layer2 = leaky_relu(np.dot(layer1, h_w))
    logits = np.ravel(leaky_relu(np.dot(layer2, o_w)))

    #numerically stable softmax
    exponentials = np.exp(logits - np.max(logits))
    probabilities = exponentials / np.sum(exponentials)

    return int(np.argmax(probabilities))
    

In [13]:
#function for getting the reward
def get_reward(env, i_w, i_b, h_w, o_w):
    """Play one episode with the given network parameters and return its total reward.

    The environment is reset, then stepped with the action chosen by
    `nnmodel` until it reports `done` or 300 steps have been taken.
    `env.step` is expected to return the 4-tuple
    (observation, reward, done, info).
    """
    observation = env.reset()
    episode_return = 0
    steps_left = 300
    while steps_left > 0:
        steps_left -= 1
        chosen_action = nnmodel(observation, i_w, i_b, h_w, o_w)
        observation, step_reward, finished, _ = env.step(chosen_action)
        episode_return += step_reward
        if finished:
            break
    return episode_return

In [14]:
#function for an initial run to get weights
def get_weights(env, run_test):
    """Build a random initial population and score every individual once.

    Returns [chromosomes, rewards], where `chromosomes` is the structure
    produced by `initial(run_test)` and `rewards` holds one episode reward
    per individual, in the same order.
    """
    population = initial(run_test)
    scores = []
    for member in range(run_test):
        #pick this individual's array out of each per-layer list
        genes = [layer[member] for layer in population]
        scores = np.append(scores, get_reward(env, *genes))
    return [population, scores]

In [15]:
#function for mutation:
def mutate(parent):
    """Randomly perturb `parent` in place and return it.

    A count is drawn uniformly from [0, len(parent)). Only when that count
    is between 1 and 9 inclusive does mutation happen: that many times, a
    random position is chosen (positions may repeat) and a uniform [0, 1)
    value is added to it. The returned object is `parent` itself.
    """
    gene_count = len(parent)
    n_mutations = np.random.randint(0, gene_count)

    #outside 1..9 the chromosome is returned untouched
    if n_mutations <= 0 or n_mutations >= 10:
        return parent

    for _ in range(n_mutations):
        position = np.random.randint(0, gene_count)
        parent[position] += np.random.rand()
    return parent

In [16]:
#function for cross-over
def crossover(list_chr):
    """Produce the next generation's gene vectors from two parents.

    The result starts with the two parents unchanged, followed by 10
    children. Each child takes the first parent's genes up to a random cut
    point and the second parent's genes from there on, then goes through
    `mutate`.
    """
    first_parent, second_parent = list_chr[0], list_chr[1]
    offspring = [first_parent, second_parent]
    for _ in range(10):
        cut = np.random.randint(0, len(first_parent))
        spliced = np.append(first_parent[:cut], second_parent[cut:])
        offspring.append(mutate(spliced))
    return offspring

In [17]:
#function for a new generation of parameters
def generate_new_population(rewards, chromosomes):
    """Breed and score a new population from the two best individuals.

    `chromosomes` is [input weights, input biases, hidden weights, output
    weights], each a list indexed by individual; `rewards` is indexed the
    same way. The two highest-reward individuals are flattened into 1-D gene
    vectors, passed to `crossover`, and every resulting vector is cut back
    into four arrays and evaluated with `get_reward` on the module-level
    `env`.

    Returns (generation, new_rewards): `generation` has the same layout as
    `chromosomes`, except that each bias is a column vector; `new_rewards`
    holds one reward per new individual.
    """
    #indexes of the two highest rewards, best first
    elite_order = rewards.argsort()[-2:][::-1]

    #flatten [weights, bias, weights, weights] of each elite into one vector
    parents = []
    for winner in elite_order:
        genome = chromosomes[0][winner]
        for layer in (1, 2, 3):
            genome = np.append(genome, chromosomes[layer][winner])
        parents.append(genome)

    offspring = crossover(parents)

    #slice boundaries and column counts are taken from individual 0
    w1_cols = chromosomes[0][0].shape[1]
    w2_cols = chromosomes[2][0].shape[1]
    w3_cols = chromosomes[3][0].shape[1]
    b1_start = chromosomes[0][0].size
    w2_start = b1_start + chromosomes[1][0].size
    w3_start = w2_start + chromosomes[2][0].size

    new_i_w, new_i_b, new_h_w, new_o_w = [], [], [], []
    new_rewards = []
    for genome in offspring:
        input_weight = np.array(genome[:b1_start]).reshape(-1, w1_cols)
        #bias is stored as a column vector
        input_bias = np.array(genome[b1_start:w2_start]).reshape(-1, 1)
        hidden_weight = np.array(genome[w2_start:w3_start]).reshape(-1, w2_cols)
        output_weight = np.array(genome[w3_start:]).reshape(-1, w3_cols)

        new_i_w.append(input_weight)
        new_i_b.append(input_bias)
        new_h_w.append(hidden_weight)
        new_o_w.append(output_weight)

        score = get_reward(env, input_weight, input_bias, hidden_weight, output_weight)
        new_rewards = np.append(new_rewards, score)

    return [new_i_w, new_i_b, new_h_w, new_o_w], new_rewards
        
    

In [18]:
#function to print graphics
def graphics(act):
    """Plot the per-generation reward series and print its mean.

    `act` is plotted against its index on the current pyplot figure with
    axis labels and a grid. Returns whatever `plt.show()` returns.
    """
    plt.plot(act)
    plt.grid()
    plt.ylabel('Rewards')
    plt.xlabel('No. of generations')

    average_reward = mean(act)
    print('Mean rewards:', average_reward)

    shown = plt.show()
    return shown
    

In [19]:
#run genetic algorithm
def ga_algo(env, run_test, no_gen):
    """Evolve network parameters for `no_gen` generations and return the best set.

    Parameters
    ----------
    env : environment passed to `get_weights` for scoring the initial population.
    run_test : int
        Size of the random initial population. Must be at least 2 because
        breeding needs two parents.
    no_gen : int
        Number of generations to run.

    Returns
    -------
    list
        [i_w, i_b, h_w, o_w] of the single individual with the highest
        episode reward seen in any population (initial one included). On a
        tie the most recent individual wins.

    Raises
    ------
    ValueError
        If `run_test` is smaller than 2.

    Notes
    -----
    The result is tracked per individual by reward. Returning individual 0
    of a population chosen by comparing population averages does not
    guarantee a good individual, and leaves the result undefined when
    `no_gen` is 0.
    """
    if run_test < 2:
        raise ValueError('run_test must be at least 2 to select two parents')

    chrom, current_rewards = get_weights(env, run_test)

    #start from the best individual of the initial population
    best_idx = int(np.argmax(current_rewards))
    best_reward = current_rewards[best_idx]
    parameters = [layer[best_idx] for layer in chrom]

    act = []
    for n in range(no_gen):
        chrom, current_rewards = generate_new_population(current_rewards, chrom)

        top_idx = int(np.argmax(current_rewards))
        max_arg = current_rewards[top_idx]
        if max_arg >= best_reward:
            best_reward = max_arg
            parameters = [layer[top_idx] for layer in chrom]

        print('Generation:{}, max reward:{}'.format(n+1, max_arg))
        act = np.append(act, max_arg)

    #nothing to plot (and no mean to compute) when no generation was run
    if len(act) > 0:
        graphics(act)

    return parameters
    
    

In [20]:
#function for outputting the initial parameters
def params(parameters):
    """Unpack a parameter sequence into the tuple (i_w, i_b, h_w, o_w).

    Only the first four items of `parameters` are read.
    """
    return parameters[0], parameters[1], parameters[2], parameters[3]