In [1]:
import gym
from collections import deque
from actor_model import Actor
from critic_model import Critic
import numpy as np
import random
import tensorflow as tf
# Seed every random number generator used by the notebook with the same value
# so that exploration and minibatch sampling are repeatable across runs.
SEED = 2212
random.seed(SEED)
np.random.seed(SEED)
tf.set_random_seed(SEED)




In [2]:
# --- Training schedule ---
EPISODES = 2000                 # upper bound on the number of episodes to run
DISCOUNT = 0.99                 # discount factor applied to the bootstrapped next-state value

# --- Experience replay ---
REPLAY_MEMORY_SIZE = 200 * 1000  # maximum number of transitions kept in the replay deque
MINIMUM_REPLAY_MEMORY = 1000     # transitions required before training starts
MINIBATCH_SIZE = 64              # transitions sampled per training step

# --- Epsilon-greedy exploration ---
EPSILON = 1                     # initial probability of taking a random action
EPSILON_DECAY = 0.999           # multiplicative decay applied after each training step
MINIMUM_EPSILON = 1e-3          # floor below which EPSILON is never decayed

# --- Rendering ---
VISUALIZATION = True            # render the environment on every step when True

In [3]:
# Build the environment and read the sizes the actor/critic networks are
# constructed with: the number of discrete actions and the observation shape.
env = gym.make("CartPole-v0")
observation_dim = env.observation_space.shape
action_dim = env.action_space.n

In [4]:
# A single TensorFlow session is handed to both the actor and the critic.
sess = tf.Session()

# Bounded replay buffer: once full, the oldest transitions are dropped.
replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)

actor = Actor(sess, action_dim, observation_dim)
critic = Critic(sess, action_dim, observation_dim)

# `tf.initialize_all_variables` is deprecated; `tf.global_variables_initializer`
# is its direct replacement and initialises the same set of variables.
sess.run(tf.global_variables_initializer())


Instructions for updating:
If using Keras pass *_constraint arguments to layers.

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Instructions for updating:
Use `tf.global_variables_initializer` instead.


In [5]:
def train_advantage_actor_critic(replay_memory, actor, critic):
    """Run one actor-critic update on a random minibatch from the replay memory.

    Samples ``MINIBATCH_SIZE`` transitions, computes one-step TD targets with
    the critic, derives the advantage of each taken action, then trains the
    actor on the advantages and fits the critic to the targets.

    Args:
        replay_memory: sequence of ``(cur_state, action, reward, next_state,
            done)`` tuples; must hold at least ``MINIBATCH_SIZE`` entries.
        actor: object exposing ``train(states, advantages)``.
        critic: object exposing a ``model`` with ``predict`` and ``fit``.
            Assumes the critic's value estimate is the first output column
            (the original code read ``predict(...)[0][0]``) -- TODO confirm.

    Raises:
        ValueError: from ``random.sample`` if the replay memory holds fewer
            than ``MINIBATCH_SIZE`` transitions.
    """
    minibatch = random.sample(replay_memory, MINIBATCH_SIZE)

    # Unpack the transitions column-wise so the critic can be evaluated on the
    # whole batch at once instead of once (or twice) per sample.
    cur_states = np.array([sample[0] for sample in minibatch])
    actions = np.array([sample[1] for sample in minibatch])
    rewards = np.array([sample[2] for sample in minibatch], dtype=np.float64)
    next_states = np.array([sample[3] for sample in minibatch])
    dones = np.array([sample[4] for sample in minibatch], dtype=bool)

    # Two batched forward passes replace up to 2 * MINIBATCH_SIZE single-row calls.
    cur_values = critic.model.predict(cur_states)[:, 0]
    next_values = critic.model.predict(next_states)[:, 0]

    # TD target: terminal transitions use the raw reward, the rest bootstrap
    # from the critic's estimate of the next state.
    targets = np.where(dones, rewards, rewards + DISCOUNT * next_values)

    # The advantage is non-zero only in the column of the action actually taken.
    advantages = np.zeros(shape=(MINIBATCH_SIZE, action_dim))
    advantages[np.arange(MINIBATCH_SIZE), actions] = targets - cur_values

    actor.train(cur_states, advantages)
    critic.model.fit(
        cur_states,
        np.expand_dims(targets, axis=1),
        batch_size=MINIBATCH_SIZE,
        verbose=0,
    )

In [None]:
# Main training loop: act epsilon-greedily, store every transition in the
# replay memory and, once enough transitions are collected, run one
# actor-critic update per environment step.
max_reward = 0
scores_window = deque(maxlen=100)  # episode rewards of the most recent 100 episodes
for episode in range(EPISODES):
    cur_state = env.reset()
    done = False
    episode_reward = 0
    while not done:
        if VISUALIZATION:
            env.render()

        # Epsilon-greedy action selection: a uniformly random action with
        # probability EPSILON, otherwise the actor's highest-scoring action.
        if np.random.uniform(0, 1) < EPSILON:
            action = np.random.randint(0, action_dim)
        else:
            action = np.argmax(actor.model.predict(np.expand_dims(cur_state, axis=0)))

        next_state, reward, done, _ = env.step(action)

        # Track the environment's own reward before it is reshaped below.
        episode_reward += reward

        # Penalise the transition that ends the episode.
        # NOTE(review): if the environment also reports done when a step limit
        # is reached, this penalises successful episodes too -- confirm.
        if done:
            reward = -100

        replay_memory.append((cur_state, action, reward, next_state, done))

        # Advance the state on every step. Previously this happened only after
        # the warm-up guard below, so during warm-up every stored transition
        # (and every action choice) used a stale cur_state.
        cur_state = next_state

        # Warm-up: keep collecting experience until the replay memory is large enough.
        if len(replay_memory) < MINIMUM_REPLAY_MEMORY:
            continue
        train_advantage_actor_critic(replay_memory, actor, critic)

        # Decay exploration after each training step, never going below the floor.
        if EPSILON > MINIMUM_EPSILON:
            EPSILON = max(EPSILON * EPSILON_DECAY, MINIMUM_EPSILON)

    scores_window.append(episode_reward)
    # Stop once the mean reward over the score window reaches the target and
    # save the actor's weights.
    if np.mean(scores_window) >= 180.0:
        print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(episode-100, np.mean(scores_window)))
        actor.model.save_weights(str(episode_reward)+"cartpole.h5")
        break
    max_reward = max(max_reward, episode_reward)
    print('Episodes:', episode, 'Episodic_Reweard:', episode_reward, 'Max_Reward_Achieved:', max_reward, 'EPSILON:', EPSILON)

Episodes: 0 Episodic_Reweard: 40.0 Max_Reward_Achieved: 40.0 EPSILON: 1
Episodes: 1 Episodic_Reweard: 19.0 Max_Reward_Achieved: 40.0 EPSILON: 1
Episodes: 2 Episodic_Reweard: 19.0 Max_Reward_Achieved: 40.0 EPSILON: 1
Episodes: 3 Episodic_Reweard: 15.0 Max_Reward_Achieved: 40.0 EPSILON: 1
Episodes: 4 Episodic_Reweard: 32.0 Max_Reward_Achieved: 40.0 EPSILON: 1
Episodes: 5 Episodic_Reweard: 23.0 Max_Reward_Achieved: 40.0 EPSILON: 1
Episodes: 6 Episodic_Reweard: 13.0 Max_Reward_Achieved: 40.0 EPSILON: 1
Episodes: 7 Episodic_Reweard: 12.0 Max_Reward_Achieved: 40.0 EPSILON: 1
Episodes: 8 Episodic_Reweard: 13.0 Max_Reward_Achieved: 40.0 EPSILON: 1
Episodes: 9 Episodic_Reweard: 16.0 Max_Reward_Achieved: 40.0 EPSILON: 1
Episodes: 10 Episodic_Reweard: 14.0 Max_Reward_Achieved: 40.0 EPSILON: 1
Episodes: 11 Episodic_Reweard: 13.0 Max_Reward_Achieved: 40.0 EPSILON: 1
Episodes: 12 Episodic_Reweard: 27.0 Max_Reward_Achieved: 40.0 EPSILON: 1
Episodes: 13 Episodic_Reweard: 27.0 Max_Reward_Achieved: 40.0

In [None]:
# Shut the environment down once training has finished.
env.close()