In [6]:
import gym
import numpy as np
# import tensorflow.compat.v1 as tf
# tf.disable_v2_behavior()

import tensorflow as tf
import collections
import datetime as dt
from tensorboard import summary as summary_lib

# Build the CartPole-v1 environment used by the training script further down.
env = gym.make('CartPole-v1')

# Seed every source of randomness that is under our control so runs are repeatable:
#   * NumPy's global RNG is what the training loop uses to sample actions.
#   * The environment's own RNG decides the initial state of every episode.
# (The TensorFlow weight initializers in this file already pass an explicit seed.)
# NOTE(review): `env.seed` is the pre-0.26 Gym API, matching the 4-tuple `env.step`
# and array-returning `env.reset` this notebook relies on.
SEED = 1
np.random.seed(SEED)
env.seed(SEED)


class ValueEstimator():
    """
    State-value critic: a small feed-forward network that maps one state to a scalar.

    The whole TensorFlow graph (placeholders, two dense layers, loss and training op)
    is created in the constructor; `predict` and `update` only run parts of that graph.
    """

    def __init__(self, state_size, learning_rate=0.1, scope="value_estimator"):
        """
        Build the network, its squared-error loss and its Adam training op.

        Args:
            state_size: number of features per state; the state placeholder has
                shape [1, state_size], i.e. exactly one state per run.
            learning_rate: learning rate handed to the Adam optimizer.
            scope: variable scope in which every op and variable is created.
        """
        with tf.variable_scope(scope):
            # Graph inputs: a single state row and the regression target for it.
            self.state = tf.placeholder(tf.float32, [1, state_size], "state")
            self.target = tf.placeholder(dtype=tf.float32, name="target")

            # Hidden layer: 4 ReLU units, Xavier-initialised with a fixed seed.
            hidden_init = tf.contrib.layers.xavier_initializer(seed=0)
            self.input_layer = tf.layers.dense(
                inputs=self.state,
                units=4,
                activation='relu',
                kernel_initializer=hidden_init)

            # Linear read-out producing one value per input row.
            readout_init = tf.contrib.layers.xavier_initializer(seed=0)
            self.output_layer = tf.layers.dense(
                inputs=self.input_layer,
                units=1,
                activation=None,
                kernel_initializer=readout_init)

            # Drop the size-1 dimensions so the estimate is a plain scalar tensor.
            self.value_estimate = tf.squeeze(self.output_layer)

            # Squared error between the supplied target and the current estimate.
            prediction_error = self.target - self.value_estimate
            self.loss = tf.reduce_sum(tf.square(prediction_error))

            self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
            global_step = tf.contrib.framework.get_global_step()
            self.train_op = self.optimizer.minimize(self.loss, global_step=global_step)

    def predict(self, state, sess=None):
        """
        Evaluate the value estimate for `state`.

        Args:
            state: value fed to the [1, state_size] state placeholder.
            sess: session to run in; falls back to the default session when omitted.

        Returns:
            The result of running the `value_estimate` tensor.
        """
        session = sess if sess else tf.get_default_session()
        return session.run(self.value_estimate, feed_dict={self.state: state})

    def update(self, state, target, sess=None):
        """
        Run one optimisation step that moves the estimate for `state` towards `target`.

        Args:
            state: value fed to the [1, state_size] state placeholder.
            target: value fed to the target placeholder.
            sess: session to run in; falls back to the default session when omitted.

        Returns:
            The loss fetched in the same run as the training op.
        """
        session = sess if sess else tf.get_default_session()
        fetches = [self.train_op, self.loss]
        _, loss_value = session.run(fetches, {self.state: state, self.target: target})
        return loss_value


class PolicyNetwork:
    """
    Softmax policy (actor) with one hidden ReLU layer of 12 units.

    Attributes of note:
        state: float32 placeholder of shape [None, state_size].
        action: int32 placeholder of shape [action_size], used as the cross-entropy
            labels (the training script in this file feeds a one-hot vector).
        R_t: float32 placeholder that scales the negative log-probability.
        actions_distribution: squeezed softmax over the output logits.
        loss: mean of neg_log_prob * R_t.
        optimizer: despite its name, this is the op returned by Adam's `minimize`,
            i.e. the thing to run in order to take a training step.
    """

    def __init__(self, state_size, action_size, learning_rate, name='policy_network'):
        """
        Build the policy graph inside variable scope `name`.

        Args:
            state_size: number of features per state.
            action_size: number of discrete actions (width of the output layer).
            learning_rate: learning rate handed to the Adam optimizer.
            name: variable scope in which every op and variable is created.
        """
        self.state_size, self.action_size = state_size, action_size
        self.learning_rate = learning_rate
        hidden_units = 12

        with tf.variable_scope(name):
            # Graph inputs.
            self.state = tf.placeholder(tf.float32, [None, self.state_size], name="state")
            self.action = tf.placeholder(tf.int32, [self.action_size], name="action")
            self.R_t = tf.placeholder(tf.float32, name="total_rewards")

            # Trainable parameters: Xavier-initialised weights (fixed seed), zero biases.
            self.W1 = tf.get_variable(
                "W1", [self.state_size, hidden_units],
                initializer=tf.contrib.layers.xavier_initializer(seed=0))
            self.b1 = tf.get_variable(
                "b1", [hidden_units], initializer=tf.zeros_initializer())
            self.W2 = tf.get_variable(
                "W2", [hidden_units, self.action_size],
                initializer=tf.contrib.layers.xavier_initializer(seed=0))
            self.b2 = tf.get_variable(
                "b2", [self.action_size], initializer=tf.zeros_initializer())

            # Forward pass: affine -> ReLU -> affine, yielding unnormalised logits.
            hidden_pre_activation = tf.matmul(self.state, self.W1)
            self.Z1 = tf.add(hidden_pre_activation, self.b1)
            self.A1 = tf.nn.relu(self.Z1)
            logits_without_bias = tf.matmul(self.A1, self.W2)
            self.output = tf.add(logits_without_bias, self.b2)

            # Action probabilities, with size-1 dimensions squeezed away.
            action_probabilities = tf.nn.softmax(self.output)
            self.actions_distribution = tf.squeeze(action_probabilities)

            # Cross-entropy against the supplied labels is the negative log-probability
            # of the labelled action; it is weighted by R_t and averaged.
            self.neg_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=self.output, labels=self.action)
            weighted_neg_log_prob = self.neg_log_prob * self.R_t
            self.loss = tf.reduce_mean(weighted_neg_log_prob)

            adam = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
            self.optimizer = adam.minimize(self.loss)


# Define hyperparameters
state_size = 4  # length of the observation vector; states are reshaped to [1, state_size]
action_size = env.action_space.n

max_episodes = 5000
max_steps = 501  # presumably one more than CartPole-v1's 500-step limit, so `done` always fires first
discount_factor = 0.99
learning_rate = 0.001

render = False

# Build a fresh graph holding the actor (policy) and the critic (value estimator).
tf.reset_default_graph()
policy = PolicyNetwork(state_size, action_size, learning_rate)
value_estimator = ValueEstimator(state_size, 0.005)

# NOTE(review): the run-directory prefix says "DQLearning" although this is an
# actor-critic run; it is kept unchanged so existing TensorBoard runs stay grouped.
LOGDIR = './TensorBoard/Q2' + f"/DQLearning_{dt.datetime.now().strftime('%d%m%Y%H%M')}"

# Train the agent with a one-step actor-critic: at every environment step the critic's
# TD error is used both to update the critic and to weight the policy-gradient update.
with tf.Session() as sess, tf.summary.FileWriter(LOGDIR) as tb_logger:
    sess.run(tf.global_variables_initializer())
    solved = False
    episode_rewards = np.zeros(max_episodes)
    average_rewards = 0.0

    # Lengths of the most recent (at most 100) episodes, for the TensorBoard moving average.
    sliding_avg = collections.deque(maxlen=100)
    for episode in range(max_episodes):
        state = env.reset()
        state = state.reshape([1, state_size])

        # Per-episode accumulators. Despite its name, `avg_loss` holds the running SUM
        # of this episode's policy losses; it is divided by the step count when logged.
        # It must be reset here, otherwise the logged value mixes in all earlier episodes.
        avg_loss = 0.0
        step_done = 0

        # Scaling factor applied to the TD error. The per-step decay
        # I <- discount_factor * I is left disabled, as in the original experiment,
        # so I stays at 1 for the whole episode.
        I = 1
        for step in range(max_steps):
            # Actor: sample an action from the current policy.
            actions_distribution = sess.run(policy.actions_distribution, {policy.state: state})
            action = np.random.choice(np.arange(len(actions_distribution)), p=actions_distribution)
            next_state, reward, done, _ = env.step(action)

            next_state = next_state.reshape([1, state_size])
            episode_rewards[episode] += reward
            step_done = step + 1
            if render:
                env.render()

            # Critic: one-step TD target and TD error. Both value predictions are taken
            # BEFORE the critic is updated; a terminal state contributes no future value.
            value_s = value_estimator.predict(state)
            value_next = 0 if done else value_estimator.predict(next_state)
            td_target = reward + discount_factor * value_next
            delta = td_target - value_s

            # Kept under its historical name: this is the (scaled) TD error fed to the
            # policy's R_t placeholder, not a Monte-Carlo return.
            total_discounted_return = I * delta
            value_estimator.update(state, td_target)

            # The policy network expects the chosen action as a one-hot vector.
            action_one_hot = np.zeros(action_size)
            action_one_hot[action] = 1

            feed_dict = {policy.state: state, policy.R_t: total_discounted_return,
                         policy.action: action_one_hot}
            _, loss = sess.run([policy.optimizer, policy.loss], feed_dict)
            avg_loss += loss

            if done:
                break
            state = next_state

        # Episode bookkeeping. Done after the inner loop so that an episode cut off by
        # `max_steps` is recorded too instead of re-using the previous episode's numbers.
        sliding_avg.append(step_done)
        if episode > 98:
            # Mean reward over the last 100 episodes (only defined once 100 exist).
            average_rewards = np.mean(episode_rewards[(episode - 99):episode + 1])
        print("Episode {} Reward: {} Average over 100 episodes: {}".format(episode, episode_rewards[episode],
                                                                           round(average_rewards, 2)))
        if average_rewards > 475:
            print(' Solved at episode: ' + str(episode))
            solved = True

        # Log before leaving the loop so the solving episode also reaches TensorBoard.
        summary = tf.Summary(value=[tf.Summary.Value(tag='reward',
                                                     simple_value=step_done),
                                    tf.Summary.Value(tag='avg_loss',
                                                     simple_value=avg_loss / max(step_done, 1)),
                                    tf.Summary.Value(tag='reward_avg_100_eps',
                                                     simple_value=sum(sliding_avg) / len(sliding_avg))])
        tb_logger.add_summary(summary, episode)

        if solved:
            break


Episode 0 Reward: 18.0 Average over 100 episodes: 0.0
Episode 1 Reward: 26.0 Average over 100 episodes: 0.0
Episode 2 Reward: 12.0 Average over 100 episodes: 0.0
Episode 3 Reward: 20.0 Average over 100 episodes: 0.0
Episode 4 Reward: 12.0 Average over 100 episodes: 0.0
Episode 5 Reward: 11.0 Average over 100 episodes: 0.0
Episode 6 Reward: 15.0 Average over 100 episodes: 0.0
Episode 7 Reward: 29.0 Average over 100 episodes: 0.0
Episode 8 Reward: 15.0 Average over 100 episodes: 0.0
Episode 9 Reward: 14.0 Average over 100 episodes: 0.0
Episode 10 Reward: 22.0 Average over 100 episodes: 0.0
Episode 11 Reward: 11.0 Average over 100 episodes: 0.0
Episode 12 Reward: 15.0 Average over 100 episodes: 0.0
Episode 13 Reward: 20.0 Average over 100 episodes: 0.0
Episode 14 Reward: 19.0 Average over 100 episodes: 0.0
Episode 15 Reward: 9.0 Average over 100 episodes: 0.0
Episode 16 Reward: 26.0 Average over 100 episodes: 0.0
Episode 17 Reward: 14.0 Average over 100 episodes: 0.0
Episode 18 Reward: 38

Episode 147 Reward: 67.0 Average over 100 episodes: 20.53
Episode 148 Reward: 10.0 Average over 100 episodes: 20.51
Episode 149 Reward: 20.0 Average over 100 episodes: 20.59
Episode 150 Reward: 19.0 Average over 100 episodes: 20.63
Episode 151 Reward: 34.0 Average over 100 episodes: 20.88
Episode 152 Reward: 31.0 Average over 100 episodes: 20.97
Episode 153 Reward: 56.0 Average over 100 episodes: 21.41
Episode 154 Reward: 11.0 Average over 100 episodes: 21.35
Episode 155 Reward: 39.0 Average over 100 episodes: 21.64
Episode 156 Reward: 16.0 Average over 100 episodes: 21.65
Episode 157 Reward: 20.0 Average over 100 episodes: 21.53
Episode 158 Reward: 20.0 Average over 100 episodes: 21.55
Episode 159 Reward: 15.0 Average over 100 episodes: 21.59
Episode 160 Reward: 11.0 Average over 100 episodes: 21.5
Episode 161 Reward: 50.0 Average over 100 episodes: 21.85
Episode 162 Reward: 25.0 Average over 100 episodes: 21.86
Episode 163 Reward: 13.0 Average over 100 episodes: 21.78
Episode 164 Rew

Episode 289 Reward: 86.0 Average over 100 episodes: 39.25
Episode 290 Reward: 151.0 Average over 100 episodes: 40.41
Episode 291 Reward: 38.0 Average over 100 episodes: 40.69
Episode 292 Reward: 53.0 Average over 100 episodes: 40.98
Episode 293 Reward: 66.0 Average over 100 episodes: 41.16
Episode 294 Reward: 76.0 Average over 100 episodes: 41.8
Episode 295 Reward: 57.0 Average over 100 episodes: 42.23
Episode 296 Reward: 69.0 Average over 100 episodes: 42.36
Episode 297 Reward: 41.0 Average over 100 episodes: 42.46
Episode 298 Reward: 39.0 Average over 100 episodes: 42.68
Episode 299 Reward: 89.0 Average over 100 episodes: 43.36
Episode 300 Reward: 50.0 Average over 100 episodes: 43.66
Episode 301 Reward: 119.0 Average over 100 episodes: 44.38
Episode 302 Reward: 33.0 Average over 100 episodes: 44.49
Episode 303 Reward: 71.0 Average over 100 episodes: 44.94
Episode 304 Reward: 73.0 Average over 100 episodes: 45.53
Episode 305 Reward: 32.0 Average over 100 episodes: 45.65
Episode 306 R

Episode 431 Reward: 64.0 Average over 100 episodes: 116.32
Episode 432 Reward: 140.0 Average over 100 episodes: 117.28
Episode 433 Reward: 81.0 Average over 100 episodes: 117.1
Episode 434 Reward: 79.0 Average over 100 episodes: 117.23
Episode 435 Reward: 49.0 Average over 100 episodes: 117.1
Episode 436 Reward: 72.0 Average over 100 episodes: 117.42
Episode 437 Reward: 103.0 Average over 100 episodes: 117.82
Episode 438 Reward: 122.0 Average over 100 episodes: 118.27
Episode 439 Reward: 214.0 Average over 100 episodes: 120.03
Episode 440 Reward: 151.0 Average over 100 episodes: 120.83
Episode 441 Reward: 238.0 Average over 100 episodes: 122.57
Episode 442 Reward: 236.0 Average over 100 episodes: 124.52
Episode 443 Reward: 230.0 Average over 100 episodes: 125.33
Episode 444 Reward: 322.0 Average over 100 episodes: 127.79
Episode 445 Reward: 220.0 Average over 100 episodes: 129.53
Episode 446 Reward: 90.0 Average over 100 episodes: 129.76
Episode 447 Reward: 108.0 Average over 100 episo

Episode 570 Reward: 109.0 Average over 100 episodes: 233.45
Episode 571 Reward: 161.0 Average over 100 episodes: 232.38
Episode 572 Reward: 142.0 Average over 100 episodes: 231.86
Episode 573 Reward: 255.0 Average over 100 episodes: 231.78
Episode 574 Reward: 364.0 Average over 100 episodes: 234.12
Episode 575 Reward: 336.0 Average over 100 episodes: 234.02
Episode 576 Reward: 500.0 Average over 100 episodes: 236.65
Episode 577 Reward: 208.0 Average over 100 episodes: 236.66
Episode 578 Reward: 210.0 Average over 100 episodes: 237.05
Episode 579 Reward: 146.0 Average over 100 episodes: 236.22
Episode 580 Reward: 151.0 Average over 100 episodes: 235.16
Episode 581 Reward: 155.0 Average over 100 episodes: 233.77
Episode 582 Reward: 113.0 Average over 100 episodes: 230.04
Episode 583 Reward: 128.0 Average over 100 episodes: 228.28
Episode 584 Reward: 110.0 Average over 100 episodes: 225.03
Episode 585 Reward: 130.0 Average over 100 episodes: 223.74
Episode 586 Reward: 124.0 Average over 1

Episode 708 Reward: 158.0 Average over 100 episodes: 203.29
Episode 709 Reward: 306.0 Average over 100 episodes: 204.78
Episode 710 Reward: 500.0 Average over 100 episodes: 208.38
Episode 711 Reward: 500.0 Average over 100 episodes: 211.81
Episode 712 Reward: 500.0 Average over 100 episodes: 215.25
Episode 713 Reward: 500.0 Average over 100 episodes: 218.46
Episode 714 Reward: 500.0 Average over 100 episodes: 221.47
Episode 715 Reward: 500.0 Average over 100 episodes: 223.79
Episode 716 Reward: 500.0 Average over 100 episodes: 223.79
Episode 717 Reward: 500.0 Average over 100 episodes: 224.79
Episode 718 Reward: 328.0 Average over 100 episodes: 226.68
Episode 719 Reward: 221.0 Average over 100 episodes: 225.92
Episode 720 Reward: 217.0 Average over 100 episodes: 226.08
Episode 721 Reward: 308.0 Average over 100 episodes: 227.95
Episode 722 Reward: 346.0 Average over 100 episodes: 228.47
Episode 723 Reward: 420.0 Average over 100 episodes: 229.21
Episode 724 Reward: 228.0 Average over 1

Episode 845 Reward: 500.0 Average over 100 episodes: 276.48
Episode 846 Reward: 500.0 Average over 100 episodes: 279.92
Episode 847 Reward: 500.0 Average over 100 episodes: 283.36
Episode 848 Reward: 500.0 Average over 100 episodes: 286.64
Episode 849 Reward: 500.0 Average over 100 episodes: 289.98
Episode 850 Reward: 500.0 Average over 100 episodes: 293.23
Episode 851 Reward: 500.0 Average over 100 episodes: 295.99
Episode 852 Reward: 273.0 Average over 100 episodes: 296.81
Episode 853 Reward: 262.0 Average over 100 episodes: 296.62
Episode 854 Reward: 201.0 Average over 100 episodes: 296.41
Episode 855 Reward: 226.0 Average over 100 episodes: 296.73
Episode 856 Reward: 220.0 Average over 100 episodes: 296.72
Episode 857 Reward: 177.0 Average over 100 episodes: 296.35
Episode 858 Reward: 111.0 Average over 100 episodes: 295.3
Episode 859 Reward: 112.0 Average over 100 episodes: 294.25
Episode 860 Reward: 14.0 Average over 100 episodes: 292.31
Episode 861 Reward: 18.0 Average over 100 

Episode 990 Reward: 22.0 Average over 100 episodes: 253.5
Episode 991 Reward: 12.0 Average over 100 episodes: 248.62
Episode 992 Reward: 28.0 Average over 100 episodes: 243.9
Episode 993 Reward: 15.0 Average over 100 episodes: 239.05
Episode 994 Reward: 12.0 Average over 100 episodes: 234.17
Episode 995 Reward: 10.0 Average over 100 episodes: 229.27
Episode 996 Reward: 24.0 Average over 100 episodes: 224.51
Episode 997 Reward: 26.0 Average over 100 episodes: 219.77
Episode 998 Reward: 36.0 Average over 100 episodes: 215.13
Episode 999 Reward: 31.0 Average over 100 episodes: 210.44
Episode 1000 Reward: 11.0 Average over 100 episodes: 205.55
Episode 1001 Reward: 10.0 Average over 100 episodes: 200.65
Episode 1002 Reward: 24.0 Average over 100 episodes: 195.89
Episode 1003 Reward: 17.0 Average over 100 episodes: 191.06
Episode 1004 Reward: 20.0 Average over 100 episodes: 186.26
Episode 1005 Reward: 14.0 Average over 100 episodes: 181.4
Episode 1006 Reward: 26.0 Average over 100 episodes: 

Episode 1128 Reward: 167.0 Average over 100 episodes: 93.6
Episode 1129 Reward: 158.0 Average over 100 episodes: 94.95
Episode 1130 Reward: 169.0 Average over 100 episodes: 96.4
Episode 1131 Reward: 200.0 Average over 100 episodes: 98.26
Episode 1132 Reward: 212.0 Average over 100 episodes: 100.02
Episode 1133 Reward: 193.0 Average over 100 episodes: 101.82
Episode 1134 Reward: 168.0 Average over 100 episodes: 103.05
Episode 1135 Reward: 202.0 Average over 100 episodes: 104.95
Episode 1136 Reward: 202.0 Average over 100 episodes: 106.78
Episode 1137 Reward: 203.0 Average over 100 episodes: 108.68
Episode 1138 Reward: 235.0 Average over 100 episodes: 110.41
Episode 1139 Reward: 249.0 Average over 100 episodes: 112.69
Episode 1140 Reward: 234.0 Average over 100 episodes: 114.88
Episode 1141 Reward: 316.0 Average over 100 episodes: 117.57
Episode 1142 Reward: 341.0 Average over 100 episodes: 120.69
Episode 1143 Reward: 271.0 Average over 100 episodes: 123.27
Episode 1144 Reward: 312.0 Ave

Episode 1263 Reward: 247.0 Average over 100 episodes: 182.29
Episode 1264 Reward: 500.0 Average over 100 episodes: 183.96
Episode 1265 Reward: 500.0 Average over 100 episodes: 186.31
Episode 1266 Reward: 353.0 Average over 100 episodes: 188.29
Episode 1267 Reward: 147.0 Average over 100 episodes: 187.92
Episode 1268 Reward: 211.0 Average over 100 episodes: 188.77
Episode 1269 Reward: 500.0 Average over 100 episodes: 193.0
Episode 1270 Reward: 500.0 Average over 100 episodes: 197.28
Episode 1271 Reward: 500.0 Average over 100 episodes: 201.0
Episode 1272 Reward: 500.0 Average over 100 episodes: 205.18
Episode 1273 Reward: 500.0 Average over 100 episodes: 209.42
Episode 1274 Reward: 500.0 Average over 100 episodes: 213.45
Episode 1275 Reward: 500.0 Average over 100 episodes: 217.31
Episode 1276 Reward: 500.0 Average over 100 episodes: 220.84
Episode 1277 Reward: 500.0 Average over 100 episodes: 224.67
Episode 1278 Reward: 500.0 Average over 100 episodes: 228.26
Episode 1279 Reward: 500.0

Episode 1398 Reward: 149.0 Average over 100 episodes: 274.67
Episode 1399 Reward: 175.0 Average over 100 episodes: 273.18
Episode 1400 Reward: 193.0 Average over 100 episodes: 272.56
Episode 1401 Reward: 185.0 Average over 100 episodes: 272.57
Episode 1402 Reward: 211.0 Average over 100 episodes: 273.1
Episode 1403 Reward: 208.0 Average over 100 episodes: 275.0
Episode 1404 Reward: 193.0 Average over 100 episodes: 275.68
Episode 1405 Reward: 187.0 Average over 100 episodes: 277.35
Episode 1406 Reward: 177.0 Average over 100 episodes: 278.95
Episode 1407 Reward: 158.0 Average over 100 episodes: 280.37
Episode 1408 Reward: 156.0 Average over 100 episodes: 281.77
Episode 1409 Reward: 155.0 Average over 100 episodes: 283.13
Episode 1410 Reward: 130.0 Average over 100 episodes: 284.3
Episode 1411 Reward: 128.0 Average over 100 episodes: 285.45
Episode 1412 Reward: 146.0 Average over 100 episodes: 286.73
Episode 1413 Reward: 175.0 Average over 100 episodes: 288.35
Episode 1414 Reward: 220.0 

KeyboardInterrupt: 

In [3]:
# Close the Gym environment created at the top of the notebook. Kept in its own cell
# so it can still be run after the training cell is interrupted.
env.close()