In [None]:
import gym
import numpy as np
# import tensorflow.compat.v1 as tf
# tf.disable_v2_behavior()

import tensorflow as tf
import collections
import datetime as dt
from tensorboard import summary as summary_lib

# Seed NumPy's global RNG; the training loop samples actions with np.random.choice.
# NOTE(review): only NumPy is seeded here - the Gym environment and TensorFlow are not,
# so runs are presumably still not fully reproducible.
np.random.seed(1)

# The environment the agent is trained on.
env = gym.make('CartPole-v1')


class ValueEstimator():
    """
    State-value function approximator (the critic).

    Builds a small fully connected network
    (``state_size`` inputs -> 24 ReLU units -> 1 linear output) together with an
    Adam training op that minimises the mean squared error between the network's
    prediction and a target supplied by the caller.

    Attributes:
        state: float32 placeholder of shape ``[None, state_size]``.
        target: float32 placeholder (shape unspecified) holding the regression target.
        value_estimate: network output with all size-1 dimensions squeezed away, so a
            single-state batch yields a scalar.
        loss: mean squared error between ``value_estimate`` and ``target``.
        train_op: op that performs one Adam step on ``loss``.
    """

    def __init__(self, state_size, learning_rate=0.1, scope="value_estimator"):
        """
        Args:
            state_size: number of features in one state vector.
            learning_rate: Adam learning rate.
            scope: variable scope under which all ops/variables are created.
        """
        with tf.variable_scope(scope):
            self.state = tf.placeholder(tf.float32, [None, state_size], "state")
            self.target = tf.placeholder(dtype=tf.float32, name="target")

            # Hidden layer. The activation and initializer are passed as a callable and
            # an initializer instance (the documented forms) rather than a string / class.
            self.input_layer = tf.layers.dense(
                inputs=self.state,
                units=24,
                activation=tf.nn.relu,
                kernel_initializer=tf.uniform_unit_scaling_initializer())
            # Linear read-out producing one value per state.
            self.output_layer = tf.layers.dense(
                inputs=self.input_layer,
                units=1,
                activation=None,
                kernel_initializer=tf.uniform_unit_scaling_initializer())

            # Squeezes every size-1 dimension: [1, 1] -> scalar, [N, 1] -> [N].
            self.value_estimate = tf.squeeze(self.output_layer)
            # Keyword arguments make the (labels, predictions) roles explicit.
            self.loss = tf.losses.mean_squared_error(
                labels=self.target, predictions=self.value_estimate)

            self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
            # get_global_step() returns None when the graph has no global step, in which
            # case minimize() simply does not increment one.
            self.train_op = self.optimizer.minimize(
                self.loss, global_step=tf.train.get_global_step())

    def predict(self, state, sess=None):
        """
        Evaluate the value estimate for ``state``.

        Args:
            state: array fed to the ``[None, state_size]`` state placeholder.
            sess: session to run in; defaults to the current default session.

        Returns:
            The evaluated ``value_estimate`` (a scalar for a single-state batch).
        """
        sess = sess or tf.get_default_session()
        return sess.run(self.value_estimate, {self.state: state})

    def update(self, state, target, sess=None):
        """
        Run one optimisation step pulling the estimate for ``state`` towards ``target``.

        Args:
            state: array fed to the ``[None, state_size]`` state placeholder.
            target: regression target for the value estimate.
            sess: session to run in; defaults to the current default session.

        Returns:
            The loss evaluated in the same ``sess.run`` call as the training step.
        """
        sess = sess or tf.get_default_session()
        feed_dict = {self.state: state, self.target: target}
        _, loss = sess.run([self.train_op, self.loss], feed_dict)
        return loss


class PolicyNetwork:
    """
    Softmax policy (the actor) over a discrete action set.

    Architecture: ``state_size`` inputs -> 24 ReLU units -> ``action_size`` logits.

    Attributes:
        state: float32 placeholder of shape ``[None, state_size]``.
        action: int32 placeholder of shape ``[action_size]`` holding the one-hot
            encoding of the action that was taken (one sample per update).
        R_t: float32 placeholder for the scalar weight applied to the negative
            log-probability of the taken action.
        actions_distribution: softmax over the logits with size-1 dimensions squeezed.
        loss: mean of ``neg_log_prob * R_t``.
        optimizer: despite the name, this is the *training op* returned by
            ``AdamOptimizer.minimize`` - run it to apply one gradient step.
    """

    def __init__(self, state_size, action_size, learning_rate, name='policy_network'):
        """
        Args:
            state_size: number of features in one state vector.
            action_size: number of discrete actions.
            learning_rate: Adam learning rate.
            name: variable scope under which all ops/variables are created.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate

        with tf.variable_scope(name):
            self.state = tf.placeholder(tf.float32, [None, self.state_size], name="state")
            self.action = tf.placeholder(tf.int32, [self.action_size], name="action")
            self.R_t = tf.placeholder(tf.float32, name="total_rewards")

            # Glorot (Xavier) uniform initialisation from core TensorFlow; this avoids the
            # tf.contrib dependency while keeping the same scheme and the fixed seed.
            self.W1 = tf.get_variable("W1", [self.state_size, 24],
                                      initializer=tf.glorot_uniform_initializer(seed=0))
            self.b1 = tf.get_variable("b1", [24], initializer=tf.zeros_initializer())
            self.W2 = tf.get_variable("W2", [24, self.action_size],
                                      initializer=tf.glorot_uniform_initializer(seed=0))
            self.b2 = tf.get_variable("b2", [self.action_size], initializer=tf.zeros_initializer())

            # Forward pass: affine -> ReLU -> affine (logits).
            self.Z1 = tf.add(tf.matmul(self.state, self.W1), self.b1)
            self.A1 = tf.nn.relu(self.Z1)
            self.output = tf.add(tf.matmul(self.A1, self.W2), self.b2)

            # Softmax probability distribution over actions
            self.actions_distribution = tf.squeeze(tf.nn.softmax(self.output))
            # Cross-entropy against the one-hot action equals -log pi(action | state).
            self.neg_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.output, labels=self.action)
            # Policy-gradient surrogate loss: weight the negative log-probability by R_t.
            self.loss = tf.reduce_mean(self.neg_log_prob * self.R_t)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss)


# Define hyperparameters
state_size = 4
action_size = env.action_space.n

max_episodes = 5000
# NOTE(review): presumably chosen to exceed CartPole-v1's episode step cap - confirm.
max_steps = 501
discount_factor = 0.99
learning_rate = 0.001

render = False

# Build a fresh graph containing the actor (policy) and the critic (value estimator).
tf.reset_default_graph()
policy = PolicyNetwork(state_size, action_size, learning_rate)
value_estimator = ValueEstimator(state_size, 0.005)

LOGDIR = './TensorBoard/Q2' + f"/DQLearning_{dt.datetime.now().strftime('%d%m%Y%H%M')}"
# Train the agent with a one-step actor-critic: after every environment step the critic
# is regressed towards the TD target and the actor is updated with the discounted TD error.
with tf.Session() as sess, tf.summary.FileWriter(LOGDIR) as tb_logger:
    sess.run(tf.global_variables_initializer())
    solved = False
    episode_rewards = np.zeros(max_episodes)
    average_rewards = 0.0
    step_done = 0

    # Lengths of (at most) the 100 most recent episodes, for the TensorBoard moving average.
    sliding_avg = collections.deque(maxlen=100)
    for episode in range(max_episodes):
        state = env.reset()
        state = state.reshape([1, state_size])
        # Reset per episode so the logged 'avg_loss' is this episode's mean policy loss
        # rather than the sum over all previous episodes divided by this episode's length.
        avg_loss = 0.0
        # Accumulated discount gamma^t applied to the actor's update.
        I = 1
        for step in range(max_steps):

            # Sample an action from the current policy.
            actions_distribution = sess.run(policy.actions_distribution, {policy.state: state})
            action = np.random.choice(np.arange(len(actions_distribution)), p=actions_distribution)
            next_state, reward, done, _ = env.step(action)

            next_state = next_state.reshape([1, state_size])

            if render:
                env.render()

            # One-step TD target and TD error; a terminal next state is worth 0.
            value_s = value_estimator.predict(state)
            value_next = 0 if done else value_estimator.predict(next_state)
            td_target = reward + discount_factor * value_next
            delta = td_target - value_s

            # Critic update towards the TD target.
            value_estimator.update(state, td_target)

            # Actor update, weighted by the discounted TD error.
            action_one_hot = np.zeros(action_size)
            action_one_hot[action] = 1
            feed_dict = {policy.state: state, policy.R_t: I * delta,
                         policy.action: action_one_hot}
            _, loss = sess.run([policy.optimizer, policy.loss], feed_dict)
            avg_loss += loss
            episode_rewards[episode] += reward
            # Tracked every step so it is correct even if the step budget runs out
            # before the environment reports done.
            step_done = step + 1

            if done:
                break

            I = discount_factor * I
            state = next_state

        # Per-episode bookkeeping (runs whether the episode ended or hit max_steps).
        sliding_avg.append(step_done)
        if episode > 98:
            # Check if solved
            average_rewards = np.mean(episode_rewards[(episode - 99):episode + 1])
        print("Episode {} Reward: {} Average over 100 episodes: {}".format(episode, episode_rewards[episode],
                                                                           round(average_rewards, 2)))
        if average_rewards > 475:
            print(' Solved at episode: ' + str(episode))
            solved = True

        # Log before the early exit so the solving episode also appears in TensorBoard.
        summary = tf.Summary(value=[tf.Summary.Value(tag='reward',
                                                     simple_value=step_done),
                                   tf.Summary.Value(tag='avg_loss',
                                                     simple_value=avg_loss / step_done),
                                   tf.Summary.Value(tag='reward_avg_100_eps',
                                                     simple_value=sum(sliding_avg) / len(sliding_avg))])
        tb_logger.add_summary(summary, episode)

        if solved:
            break


Episode 0 Reward: 15.0 Average over 100 episodes: 0.0
Episode 1 Reward: 23.0 Average over 100 episodes: 0.0
Episode 2 Reward: 53.0 Average over 100 episodes: 0.0
Episode 3 Reward: 23.0 Average over 100 episodes: 0.0
Episode 4 Reward: 25.0 Average over 100 episodes: 0.0
Episode 5 Reward: 15.0 Average over 100 episodes: 0.0
Episode 6 Reward: 22.0 Average over 100 episodes: 0.0
Episode 7 Reward: 19.0 Average over 100 episodes: 0.0
Episode 8 Reward: 12.0 Average over 100 episodes: 0.0
Episode 9 Reward: 16.0 Average over 100 episodes: 0.0
Episode 10 Reward: 12.0 Average over 100 episodes: 0.0
Episode 11 Reward: 12.0 Average over 100 episodes: 0.0
Episode 12 Reward: 15.0 Average over 100 episodes: 0.0
Episode 13 Reward: 10.0 Average over 100 episodes: 0.0
Episode 14 Reward: 10.0 Average over 100 episodes: 0.0
Episode 15 Reward: 14.0 Average over 100 episodes: 0.0
Episode 16 Reward: 32.0 Average over 100 episodes: 0.0
Episode 17 Reward: 16.0 Average over 100 episodes: 0.0
Episode 18 Reward: 1

Episode 147 Reward: 60.0 Average over 100 episodes: 40.77
Episode 148 Reward: 45.0 Average over 100 episodes: 41.11
Episode 149 Reward: 58.0 Average over 100 episodes: 41.55
Episode 150 Reward: 46.0 Average over 100 episodes: 41.72
Episode 151 Reward: 50.0 Average over 100 episodes: 42.01
Episode 152 Reward: 72.0 Average over 100 episodes: 42.56
Episode 153 Reward: 95.0 Average over 100 episodes: 43.39
Episode 154 Reward: 69.0 Average over 100 episodes: 43.95
Episode 155 Reward: 45.0 Average over 100 episodes: 44.2
Episode 156 Reward: 61.0 Average over 100 episodes: 44.69
Episode 157 Reward: 86.0 Average over 100 episodes: 45.25
Episode 158 Reward: 63.0 Average over 100 episodes: 45.49
Episode 159 Reward: 115.0 Average over 100 episodes: 46.42
Episode 160 Reward: 65.0 Average over 100 episodes: 46.91
Episode 161 Reward: 98.0 Average over 100 episodes: 47.25
Episode 162 Reward: 54.0 Average over 100 episodes: 47.66
Episode 163 Reward: 101.0 Average over 100 episodes: 48.48
Episode 164 R

Episode 288 Reward: 102.0 Average over 100 episodes: 94.72
Episode 289 Reward: 129.0 Average over 100 episodes: 95.22
Episode 290 Reward: 125.0 Average over 100 episodes: 95.32
Episode 291 Reward: 131.0 Average over 100 episodes: 96.02
Episode 292 Reward: 136.0 Average over 100 episodes: 96.83
Episode 293 Reward: 144.0 Average over 100 episodes: 97.46
Episode 294 Reward: 140.0 Average over 100 episodes: 97.64
Episode 295 Reward: 180.0 Average over 100 episodes: 98.56
Episode 296 Reward: 167.0 Average over 100 episodes: 99.49
Episode 297 Reward: 245.0 Average over 100 episodes: 100.7
Episode 298 Reward: 239.0 Average over 100 episodes: 102.43
Episode 299 Reward: 255.0 Average over 100 episodes: 103.99
Episode 300 Reward: 268.0 Average over 100 episodes: 106.11
Episode 301 Reward: 397.0 Average over 100 episodes: 109.37
Episode 302 Reward: 436.0 Average over 100 episodes: 113.18
Episode 303 Reward: 132.0 Average over 100 episodes: 113.68
Episode 304 Reward: 10.0 Average over 100 episodes

Episode 428 Reward: 53.0 Average over 100 episodes: 47.07
Episode 429 Reward: 39.0 Average over 100 episodes: 46.06
Episode 430 Reward: 45.0 Average over 100 episodes: 45.19
Episode 431 Reward: 62.0 Average over 100 episodes: 44.56
Episode 432 Reward: 50.0 Average over 100 episodes: 43.78
Episode 433 Reward: 48.0 Average over 100 episodes: 43.19
Episode 434 Reward: 51.0 Average over 100 episodes: 42.35
Episode 435 Reward: 55.0 Average over 100 episodes: 41.98
Episode 436 Reward: 51.0 Average over 100 episodes: 41.52
Episode 437 Reward: 49.0 Average over 100 episodes: 41.19
Episode 438 Reward: 46.0 Average over 100 episodes: 40.99
Episode 439 Reward: 68.0 Average over 100 episodes: 40.85
Episode 440 Reward: 53.0 Average over 100 episodes: 40.92
Episode 441 Reward: 77.0 Average over 100 episodes: 41.24
Episode 442 Reward: 65.0 Average over 100 episodes: 41.31
Episode 443 Reward: 84.0 Average over 100 episodes: 41.83
Episode 444 Reward: 103.0 Average over 100 episodes: 42.46
Episode 445 R

In [3]:
# Close the Gym environment created at the top of the notebook once training is done.
env.close()