In [1]:
import gym
import gym.spaces
import gym.wrappers
import numpy as np
import matplotlib.pyplot as plt
import random
import pickle
import time

from collections import deque
from tensorflow.keras.layers import Flatten, Dense
from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras import optimizers
from tensorflow.keras.layers import *
from multiprocessing import Pool, freeze_support

# Build the netowrks
 - The objective of actor and critic
     - the critic network should constant improving and provide the guidance to the actor network
     - the actor network should be able to learn the optimal policy


In [2]:
def build_model_critic( lr = 0.001, size = [128,128,64]):
    # input the state and output the rating
    model = Sequential()
    # dense layer 1
    model.add(Dense(size[0], input_shape = (8,), activation = 'relu'))
    # dense layer 2
    model.add(Dense(size[1], activation = 'relu'))
    # dense layer 3
    model.add(Dense(size[2], activation = 'relu'))
    # output layer
    model.add(Dense(1, activation = 'linear'))
    # compile the model with adam optimizer and 'MSE loss'
    adam = optimizers.Adam(lr=lr, beta_1=0.9, beta_2=0.999)
    model.compile(loss = 'mse', optimizer = adam)

    return model

def build_model_actor(lr = 0.001, size = [128,128,64]):
    # input the state and output the actio
    model = Sequential()
    # dense layers 1
    model.add(Dense(size[0], input_shape = (8,), activation = 'relu'))
    # dense layers 2
    model.add(Dense(size[1], activation = 'relu'))
    # dense layers 3
    model.add(Dense(size[2], activation = 'relu'))
    # choose an action to take
    model.add(Dense(4, activation = 'softmax'))
    adam = optimizers.Adam(lr=lr, beta_1=0.9, beta_2=0.999)
    model.compile(loss = 'categorical_crossentropy', optimizer = adam)
    return model

In [3]:
def decide_action(actor, state):
	'''
	Given a state, the function returns an action chosen randomly with the probability associated with
	each action generated by the actor
	
	actor: the actor network
	state: The state of the environment
	return: The action to be taken.
	'''
    # flatten the state
	flat_state = np.reshape(state, [1,8])
    # numpy to choose a action with the probability associated with each action generated by the actor
	action = np.random.choice(4, 1, p = actor.predict(flat_state)[0])[0]
	return(action)

In [4]:
def run_episode(env, actor, render = False):
    '''
    Given an environment, actor, and a boolean flag render, run_episode(env, actor, render) returns a
    list of (state, action, reward, state_new, done) tuples and the total episode reward
    
    env: The environment to run the agent in
    actor: the actor network
    render: If you want to render the environment, set this to True, defaults to False (optional)
    return: The memory list and the episode_reward
    '''
    # creete the meomory list
    memory = []
    # reset the environment
    state = env.reset()
    # get the first action
    episode_reward = 0

    cnt = 0

    done = False

    while not done and cnt <1000:
        cnt += 1
        if render:
            env.render()
        # predict the action
        action = decide_action(actor, state)
        observation, reward, done, _ = env.step(action)
        episode_reward += reward
        state_new = observation
        memory.append((state, action, reward, state_new, done))
        state = state_new

    return(memory, episode_reward)

# Training process
1) Actor observe the state S(t)
2) Actor randomly sample action A(t) from the action space
3) Actor perform action A(t) and observe the next state S(t+1) and reward R(t)
4) update the weight of the critic network using temporal difference
5) update the weight of the actor network using the policy gradient


In [5]:
def train_models(actor, critic, memory, gamma):
	'''
    # Training Process
    # observe the state S(t)
    # randomly sample action A(t) from the action space
    # perform action A(t) and observe the next state S(t+1) and reward R(t)
    # update the weight of the critic network using temporal difference
    # update the weight of the actor network using the policy gradient
    '''
	
	random.shuffle(memory)

	
	for i in range(len(memory)):
		state, action, reward, state_new, done = memory[i]
		
		
		flat_state_new = np.reshape(state_new, [1,8])
		flat_state = np.reshape(state, [1,8])

		target = np.zeros((1, 1))
		advantages = np.zeros((1, 4))

		value = critic.predict(flat_state)
		next_value = critic.predict(flat_state_new)

		if done:
            # policy gradient
			advantages[0][action] = reward - value
            # temporal difference
			target[0][0] = reward
		else:
            # policy gradient
			advantages[0][action] = reward + gamma * (next_value) - value
            # temporal difference
			target[0][0] = reward + gamma * next_value

		actor.fit(flat_state, advantages, epochs=1, verbose=0)
		critic.fit(flat_state, target, epochs=1, verbose=0)

In [7]:
def run_train_plot(alr, clr, gamma, numepisodes):
    '''
    It trains the actor and critic networks, and then runs episodes
    
    alr: learning rate for the actor
    clr: critic learning rate
    gamma: discount factor
    numepisodes: The number of episodes to run the training on
    '''

    env = gym.make('LunarLander-v2')

    i = 0

    actor = build_model_actor(lr = alr, size = [128,128,64])
    critic = build_model_critic(lr= clr, size = [128,128,64])

    totrewardarray = [] #For storing the total reward from each episode

    best = float('-inf') #For storing the best rolling average reward

    episodes = len(totrewardarray) #Counting how many episodes have passed

    while episodes < numepisodes:

        i+= 1

        memory, episode_reward = run_episode(env, actor, render = False)

        totrewardarray.append(episode_reward)

        episodes = len(totrewardarray)

        # if episodes >= 100:
        score = np.average(totrewardarray[-100:-1])
        if score > best:
            best = score
            actor.save('actormodel.h5')
            critic.save('criticmodel.h5')
        # if episodes%500==0:
        print('ALR:', alr, ' CLR:', clr, 'episode ', episodes, 'of',numepisodes, 'Average Reward (last 100 eps)= ', score)

        train_models(actor, critic, memory, gamma)

        avgarray = []
        cntarray = []

    for i in range(100,len(totrewardarray),10):
        avgarray.append(np.average(totrewardarray[i-100:i]))
        cntarray.append(i)

    plt.plot(cntarray, avgarray, label = 'Best 100 ep av. reward = '+str(best))

    plt.title('Rolling Average (previous 100) vs Iterations')
    plt.xlabel('Iterations')
    plt.ylabel('Reward')
    plt.legend(loc='best')

    plt.show()



In [8]:
run_train_plot(2e-6, 9e-5, 0.9, 2000)

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


ALR: 2e-06  CLR: 9e-05 episode  1 of 2000 Average Reward (last 100 eps)=  nan
ALR: 2e-06  CLR: 9e-05 episode  2 of 2000 Average Reward (last 100 eps)=  -137.20648460673527
ALR: 2e-06  CLR: 9e-05 episode  3 of 2000 Average Reward (last 100 eps)=  -158.69660411410888
ALR: 2e-06  CLR: 9e-05 episode  4 of 2000 Average Reward (last 100 eps)=  -252.09757548716723
ALR: 2e-06  CLR: 9e-05 episode  5 of 2000 Average Reward (last 100 eps)=  -268.5074812463556
ALR: 2e-06  CLR: 9e-05 episode  6 of 2000 Average Reward (last 100 eps)=  -242.38064229706316
ALR: 2e-06  CLR: 9e-05 episode  7 of 2000 Average Reward (last 100 eps)=  -216.52783602530732
ALR: 2e-06  CLR: 9e-05 episode  8 of 2000 Average Reward (last 100 eps)=  -225.95445021501953
ALR: 2e-06  CLR: 9e-05 episode  9 of 2000 Average Reward (last 100 eps)=  -207.97468399503038
ALR: 2e-06  CLR: 9e-05 episode  10 of 2000 Average Reward (last 100 eps)=  -201.52725577519413
ALR: 2e-06  CLR: 9e-05 episode  11 of 2000 Average Reward (last 100 eps)=  -

KeyboardInterrupt: 