In [1]:
import numpy as np
import tensorflow as tf
import random
from collections import deque
import dqn
from gym.envs.registration import register
import gym
from gym import wrappers

In [2]:
# Create the CartPole environment and set its per-episode step cap to 5000
# (done by assigning gym's private `_max_episode_steps` attribute).
env = gym.make('CartPole-v0')
env._max_episode_steps = 5000
# Network dimensions are read from the environment's observation and action spaces.
input_size = env.observation_space.shape[0]
output_size = env.action_space.n

# Discount factor applied to the bootstrapped next-state value in replay_train.
dis = 0.9
# Maximum number of transitions main() keeps in its replay buffer.
REPLAY_MEMORY = 50000

[2017-08-02 17:16:28,968] Making new env: CartPole-v0


In [None]:
def replay_train(mainDQN, targetDQN, train_batch):
    """Fit ``mainDQN`` on one minibatch of replayed transitions.

    For every ``(state, action, reward, next_state, done)`` tuple, the current
    prediction of ``mainDQN`` for ``state`` is used as the regression target,
    with the entry of the taken ``action`` overwritten by:

    * ``reward`` when ``done`` is true, or
    * ``reward + dis * max(targetDQN.predict(next_state))`` otherwise.

    Args:
        mainDQN: Network being trained. Must provide ``predict(state)`` and
            ``update(x_stack, y_stack)``.
        targetDQN: Network supplying the next-state value estimate. Must
            provide ``predict(state)``.
        train_batch: Iterable of ``(state, action, reward, next_state, done)``
            tuples.

    Returns:
        Whatever ``mainDQN.update`` returns for the stacked states and targets.
    """
    states = []
    targets = []
    for state, action, reward, next_state, done in train_batch:
        Q = mainDQN.predict(state)

        if done:
            # Terminal transition: there is no future value to bootstrap from.
            Q[0, action] = reward
        else:
            Q[0, action] = reward + dis * np.max(targetDQN.predict(next_state))

        targets.append(Q)
        states.append(state)

    # Stack once after the loop instead of re-copying a growing array on every
    # sample. The leading empty (0, n) float64 arrays keep the result 2-D and
    # float64 even when the batch is empty.
    x_stack = np.vstack([np.empty((0, input_size))] + states)
    y_stack = np.vstack([np.empty((0, output_size))] + targets)

    return mainDQN.update(x_stack, y_stack)

def get_copy_var_ops(*, dest_scope_name="target", src_scope_name="main"):
    """Build the ops that copy trainable variables from one scope to another.

    The trainable variables collected under ``src_scope_name`` and
    ``dest_scope_name`` are paired up in collection order, and one assign op
    is created per pair. Pairing uses ``zip``, so any surplus variables in the
    longer collection are ignored.

    Args:
        dest_scope_name: Scope whose variables are overwritten.
        src_scope_name: Scope whose variable values are copied.

    Returns:
        A list of assign ops, one per paired variable.
    """
    trainable_key = tf.GraphKeys.TRAINABLE_VARIABLES
    sources = tf.get_collection(trainable_key, scope=src_scope_name)
    targets = tf.get_collection(trainable_key, scope=dest_scope_name)

    return [
        target.assign(source.value())
        for source, target in zip(sources, targets)
    ]


def bot_play(mainDQN, env=env):
    """Play one rendered episode with the greedy policy and print its score.

    Each step renders the environment, picks the action with the highest
    value from ``mainDQN.predict(state)`` and applies it, until the
    environment reports the episode as done.

    Args:
        mainDQN: Network providing ``predict(state)``.
        env: Environment to play in; defaults to the module-level ``env``
            as it was bound when this function was defined.
    """
    state = env.reset()
    total_reward = 0
    finished = False

    while not finished:
        env.render()
        greedy_action = np.argmax(mainDQN.predict(state))
        state, reward, finished, _ = env.step(greedy_action)
        total_reward += reward

    print("Total score: {}".format(total_reward))

In [None]:
def main():
    """Train the DQN on the module-level ``env``, then record evaluation runs.

    Training loop (at most ``max_episodes`` episodes):

    * actions are epsilon-greedy, with epsilon ``1 / (episode / 10 + 1)``;
    * every transition is stored in a replay buffer capped at
      ``REPLAY_MEMORY`` entries;
    * on episodes where ``episode % 10 == 1`` the main network is trained on
      50 random minibatches and its weights are then copied to the target
      network;
    * training stops early once the mean step count over the last 100
      episodes exceeds 4950.

    Afterwards the greedy policy is played 200 times under a ``Monitor``
    wrapper writing to ``gym-results``. The results are uploaded only when the
    ``OPENAI_GYM_API_KEY`` environment variable is set; credentials are never
    read from the source.
    """
    import os  # function-scope import: only needed to read the upload credential

    max_episodes = 5000
    batch_size = 10
    # maxlen makes the deque discard its oldest transition automatically once full.
    replay_buffer = deque(maxlen=REPLAY_MEMORY)

    last_100_game_reward = deque()

    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, input_size, output_size, name="main")
        targetDQN = dqn.DQN(sess, input_size, output_size, name="target")
        tf.global_variables_initializer().run()

        # Copy the freshly initialized "main" weights into "target".
        copy_ops = get_copy_var_ops(dest_scope_name="target",
                                    src_scope_name="main")
        sess.run(copy_ops)

        for episode in range(max_episodes):
            # Exploration rate shrinks as the episode index grows.
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0
            state = env.reset()

            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(mainDQN.predict(state))
                next_state, reward, done, _ = env.step(action)
                if done:
                    # Penalize the step that ended the episode.
                    # NOTE(review): this also applies when the episode ends by
                    # hitting the step cap rather than by failing - confirm
                    # that is intended.
                    reward = -100

                replay_buffer.append((state, action, reward, next_state, done))

                state = next_state
                step_count += 1

            print("Episode: {}  steps: {}".format(episode, step_count))

            if episode % 10 == 1:
                # random.sample raises ValueError when asked for more items
                # than are stored, so never request more than the buffer holds.
                sample_size = min(batch_size, len(replay_buffer))
                for _ in range(50):
                    minibatch = random.sample(replay_buffer, sample_size)
                    loss, _ = replay_train(mainDQN, targetDQN, minibatch)

                print("Loss: ", loss)
                sess.run(copy_ops)

            # Despite the name, this window stores step counts per episode.
            last_100_game_reward.append(step_count)

            if len(last_100_game_reward) > 100:
                last_100_game_reward.popleft()

                avg_reward = np.mean(last_100_game_reward)

                if avg_reward > 4950:
                    print("Game Cleared in ",episode, "episodes with avg reward ",avg_reward)
                    break

        env2 = wrappers.Monitor(env, 'gym-results', force=True)
        try:
            for i in range(200):
                bot_play(mainDQN, env=env2)
        finally:
            # Close the monitor even if an evaluation episode raises.
            env2.close()

        # Read the key from the environment so it is never committed with the notebook.
        api_key = os.environ.get("OPENAI_GYM_API_KEY")
        if api_key:
            gym.upload("gym-results", api_key=api_key)
        else:
            print("OPENAI_GYM_API_KEY is not set; skipping upload of gym-results")


# Start training and evaluation only when this code runs as the main module
# (script or notebook cell), not when it is imported.
if __name__ == "__main__":
    main()

Episode: 0  steps: 13
Episode: 1  steps: 24
Loss:  1.78031
Episode: 2  steps: 12
Episode: 3  steps: 11
Episode: 4  steps: 14
Episode: 5  steps: 12
Episode: 6  steps: 13
Episode: 7  steps: 12
Episode: 8  steps: 10
Episode: 9  steps: 11
Episode: 10  steps: 10
Episode: 11  steps: 9
Loss:  5.40925
Episode: 12  steps: 9
Episode: 13  steps: 10
Episode: 14  steps: 10
Episode: 15  steps: 16
Episode: 16  steps: 11
Episode: 17  steps: 10
Episode: 18  steps: 14
Episode: 19  steps: 12
Episode: 20  steps: 10
Episode: 21  steps: 8
Loss:  1312.22
Episode: 22  steps: 10
Episode: 23  steps: 12
Episode: 24  steps: 8
Episode: 25  steps: 9
Episode: 26  steps: 9
Episode: 27  steps: 8
Episode: 28  steps: 9
Episode: 29  steps: 11
Episode: 30  steps: 8
Episode: 31  steps: 10
Loss:  365.26
Episode: 32  steps: 11
Episode: 33  steps: 10
Episode: 34  steps: 11
Episode: 35  steps: 10
Episode: 36  steps: 10
Episode: 37  steps: 10
Episode: 38  steps: 9
Episode: 39  steps: 15
Episode: 40  steps: 9
Episode: 41  steps:

Loss:  0.8199
Episode: 322  steps: 166
Episode: 323  steps: 72
Episode: 324  steps: 56
Episode: 325  steps: 375
Episode: 326  steps: 150
Episode: 327  steps: 59
Episode: 328  steps: 83
Episode: 329  steps: 94
Episode: 330  steps: 103
Episode: 331  steps: 60
Loss:  2.47375
Episode: 332  steps: 58
Episode: 333  steps: 63
Episode: 334  steps: 68
Episode: 335  steps: 227
Episode: 336  steps: 215
Episode: 337  steps: 68
Episode: 338  steps: 172
Episode: 339  steps: 81
Episode: 340  steps: 77
Episode: 341  steps: 72
Loss:  1.1932
Episode: 342  steps: 87
Episode: 343  steps: 188
Episode: 344  steps: 342
Episode: 345  steps: 191
