In [8]:
import gym
import numpy as np
# import tensorflow.compat.v1 as tf
# tf.disable_v2_behavior()

import tensorflow as tf
import collections
import datetime as dt
from tensorboard import summary as summary_lib

env = gym.make('CartPole-v1')

np.random.seed(1)


class ValueEstimator():
    """
    Value Function approximator.
    """

    def __init__(self, state_size, learning_rate=0.1, scope="value_estimator"):
        with tf.variable_scope(scope):
            self.state = tf.placeholder(tf.int32, [state_size], "state")
            self.target = tf.placeholder(dtype=tf.float32, name="target")

            # This is just table lookup estimator
            state_one_hot = tf.one_hot(self.state, int(state_size))
            self.output_layer = tf.contrib.layers.fully_connected(
                inputs=tf.expand_dims(state_one_hot, 0),
                num_outputs=1,
                activation_fn=None,
                weights_initializer=tf.zeros_initializer)

            self.value_estimate = tf.squeeze(self.output_layer)
            self.loss = tf.squared_difference(self.value_estimate, self.target)

            self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
            self.train_op = self.optimizer.minimize(
                self.loss, global_step=tf.contrib.framework.get_global_step())

    def predict(self, state, sess=None):
        state = state.reshape(-1)
        sess = sess or tf.get_default_session()
        return sess.run(self.value_estimate, {self.state: state})

    def update(self, state, target, sess=None):
        state = state.reshape(-1)

        sess = sess or tf.get_default_session()
        feed_dict = {self.state: state, self.target: target}
        _, loss = sess.run([self.train_op, self.loss], feed_dict)
        return loss


class PolicyNetwork:
    def __init__(self, state_size, action_size, learning_rate, name='policy_network'):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate

        with tf.variable_scope(name):
            self.state = tf.placeholder(tf.float32, [None, self.state_size], name="state")
            self.action = tf.placeholder(tf.int32, [self.action_size], name="action")
            self.R_t = tf.placeholder(tf.float32, name="total_rewards")

            self.W1 = tf.get_variable("W1", [self.state_size, 12],
                                      initializer=tf.contrib.layers.xavier_initializer(seed=0))
            self.b1 = tf.get_variable("b1", [12], initializer=tf.zeros_initializer())
            self.W2 = tf.get_variable("W2", [12, self.action_size],
                                      initializer=tf.contrib.layers.xavier_initializer(seed=0))
            self.b2 = tf.get_variable("b2", [self.action_size], initializer=tf.zeros_initializer())

            self.Z1 = tf.add(tf.matmul(self.state, self.W1), self.b1)
            self.A1 = tf.nn.relu(self.Z1)
            self.output = tf.add(tf.matmul(self.A1, self.W2), self.b2)

            # Softmax probability distribution over actions
            self.actions_distribution = tf.squeeze(tf.nn.softmax(self.output))
            # Loss with negative log probability
            self.neg_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.output, labels=self.action)
            self.loss = tf.reduce_mean(self.neg_log_prob * self.R_t)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss)


# Define hyperparameters
state_size = 4
action_size = env.action_space.n

max_episodes = 5000
max_steps = 501
discount_factor = 0.99
learning_rate = 0.0004

render = False

# Initialize the policy network
tf.reset_default_graph()
policy = PolicyNetwork(state_size, action_size, learning_rate)
value_estimator = ValueEstimator(state_size)

LOGDIR = './TensorBoard/Q2' + f"/DQLearning_{dt.datetime.now().strftime('%d%m%Y%H%M')}"
# Start training the agent with REINFORCE algorithm
with tf.Session() as sess, tf.summary.FileWriter(LOGDIR) as tb_logger:
    sess.run(tf.global_variables_initializer())
    solved = False
    Transition = collections.namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])
    episode_rewards = np.zeros(max_episodes)
    average_rewards = 0.0
    step_done = 0
#     step = tf.placeholder(tf.int32)
#     tf.summary.scalar('reward', step)
#     summaries = tf.summary.merge_all()
    sliding_avg = collections.deque(maxlen=100)
    # step_done = tf.get_variable('step_done', shape=[])
    i = 0
    for episode in range(max_episodes):
        state = env.reset()
        state = state.reshape([1, state_size])
        episode_transitions = []

        for step in range(max_steps):

            actions_distribution = sess.run(policy.actions_distribution, {policy.state: state})
            action = np.random.choice(np.arange(len(actions_distribution)), p=actions_distribution)
            next_state, reward, done, _ = env.step(action)
            next_state = next_state.reshape([1, state_size])

            if render:
                env.render()

            action_one_hot = np.zeros(action_size)
            action_one_hot[action] = 1
            episode_transitions.append(
                    Transition(state=state, action=action_one_hot, reward=reward, next_state=next_state, done=done))
            episode_rewards[episode] += reward

            if done:
                step_done = step + 1
                sliding_avg.append(step_done)
                if episode > 98:
                    # Check if solved
                    average_rewards = np.mean(episode_rewards[(episode - 99):episode + 1])
                print("Episode {} Reward: {} Average over 100 episodes: {}".format(episode, episode_rewards[episode],
                                                                                   round(average_rewards, 2)))
                if average_rewards > 475:
                    print(' Solved at episode: ' + str(episode))
                    solved = True
                break
            state = next_state

        if solved:
            break
        avg_loss = 0.0
        # Compute Rt for each time-step t and update the network's weights
        for t, transition in enumerate(episode_transitions):
            total_discounted_return = sum(
                discount_factor ** i * t.reward for i, t in enumerate(episode_transitions[t:]))  # Rt

            baseline_value = value_estimator.predict(transition.state)
            advantage = total_discounted_return - baseline_value
            value_estimator.update(transition.state, total_discounted_return)

            feed_dict = {policy.state: transition.state, policy.R_t: advantage,
                         policy.action: transition.action}
            _, loss = sess.run([policy.optimizer, policy.loss], feed_dict)
            avg_loss += loss

#         tb_logger.add_summary(step_done, episode)
#         tb_logger.add_summary(avg_loss / step_done, episode)
#         tb_logger.add_summary(sum(sliding_avg) / len(sliding_avg), episode)
#         tf.summary.scalar('reward', step_done, step=episode)
#         tf.summary.scalar('avg loss', avg_loss / step_done, step=episode)
#         tf.summary.scalar('reward_avg_100_eps', sum(sliding_avg) / len(sliding_avg), step=episode)
        summary = tf.Summary(value=[tf.Summary.Value(tag='reward',
                                                     simple_value=step_done),
                                   tf.Summary.Value(tag='avg_loss',
                                                     simple_value=avg_loss / step_done),
                                   tf.Summary.Value(tag='reward_avg_100_eps',
                                                     simple_value=sum(sliding_avg) / len(sliding_avg))])
        tb_logger.add_summary(summary, episode)


Episode 0 Reward: 11.0 Average over 100 episodes: 0.0
Episode 1 Reward: 32.0 Average over 100 episodes: 0.0
Episode 2 Reward: 15.0 Average over 100 episodes: 0.0
Episode 3 Reward: 21.0 Average over 100 episodes: 0.0
Episode 4 Reward: 9.0 Average over 100 episodes: 0.0
Episode 5 Reward: 11.0 Average over 100 episodes: 0.0
Episode 6 Reward: 14.0 Average over 100 episodes: 0.0
Episode 7 Reward: 17.0 Average over 100 episodes: 0.0
Episode 8 Reward: 13.0 Average over 100 episodes: 0.0
Episode 9 Reward: 14.0 Average over 100 episodes: 0.0
Episode 10 Reward: 16.0 Average over 100 episodes: 0.0
Episode 11 Reward: 27.0 Average over 100 episodes: 0.0
Episode 12 Reward: 10.0 Average over 100 episodes: 0.0
Episode 13 Reward: 16.0 Average over 100 episodes: 0.0
Episode 14 Reward: 9.0 Average over 100 episodes: 0.0
Episode 15 Reward: 11.0 Average over 100 episodes: 0.0
Episode 16 Reward: 18.0 Average over 100 episodes: 0.0
Episode 17 Reward: 12.0 Average over 100 episodes: 0.0
Episode 18 Reward: 20.

Episode 147 Reward: 17.0 Average over 100 episodes: 24.66
Episode 148 Reward: 23.0 Average over 100 episodes: 24.68
Episode 149 Reward: 41.0 Average over 100 episodes: 24.99
Episode 150 Reward: 29.0 Average over 100 episodes: 25.07
Episode 151 Reward: 37.0 Average over 100 episodes: 25.32
Episode 152 Reward: 43.0 Average over 100 episodes: 25.58
Episode 153 Reward: 34.0 Average over 100 episodes: 25.69
Episode 154 Reward: 67.0 Average over 100 episodes: 26.11
Episode 155 Reward: 14.0 Average over 100 episodes: 25.96
Episode 156 Reward: 68.0 Average over 100 episodes: 26.5
Episode 157 Reward: 25.0 Average over 100 episodes: 26.59
Episode 158 Reward: 16.0 Average over 100 episodes: 26.43
Episode 159 Reward: 45.0 Average over 100 episodes: 26.62
Episode 160 Reward: 31.0 Average over 100 episodes: 26.73
Episode 161 Reward: 17.0 Average over 100 episodes: 26.75
Episode 162 Reward: 11.0 Average over 100 episodes: 26.66
Episode 163 Reward: 32.0 Average over 100 episodes: 26.8
Episode 164 Rewa

Episode 289 Reward: 39.0 Average over 100 episodes: 46.67
Episode 290 Reward: 34.0 Average over 100 episodes: 46.89
Episode 291 Reward: 64.0 Average over 100 episodes: 47.2
Episode 292 Reward: 32.0 Average over 100 episodes: 47.27
Episode 293 Reward: 61.0 Average over 100 episodes: 47.58
Episode 294 Reward: 41.0 Average over 100 episodes: 47.6
Episode 295 Reward: 28.0 Average over 100 episodes: 47.59
Episode 296 Reward: 33.0 Average over 100 episodes: 47.64
Episode 297 Reward: 45.0 Average over 100 episodes: 47.76
Episode 298 Reward: 39.0 Average over 100 episodes: 47.71
Episode 299 Reward: 12.0 Average over 100 episodes: 47.52
Episode 300 Reward: 45.0 Average over 100 episodes: 46.31
Episode 301 Reward: 44.0 Average over 100 episodes: 46.46
Episode 302 Reward: 59.0 Average over 100 episodes: 46.8
Episode 303 Reward: 46.0 Average over 100 episodes: 47.0
Episode 304 Reward: 79.0 Average over 100 episodes: 47.29
Episode 305 Reward: 22.0 Average over 100 episodes: 46.76
Episode 306 Reward

Episode 432 Reward: 147.0 Average over 100 episodes: 69.02
Episode 433 Reward: 65.0 Average over 100 episodes: 69.34
Episode 434 Reward: 109.0 Average over 100 episodes: 69.81
Episode 435 Reward: 97.0 Average over 100 episodes: 70.53
Episode 436 Reward: 82.0 Average over 100 episodes: 70.59
Episode 437 Reward: 80.0 Average over 100 episodes: 70.92
Episode 438 Reward: 81.0 Average over 100 episodes: 70.47
Episode 439 Reward: 49.0 Average over 100 episodes: 70.45
Episode 440 Reward: 189.0 Average over 100 episodes: 71.53
Episode 441 Reward: 49.0 Average over 100 episodes: 71.59
Episode 442 Reward: 71.0 Average over 100 episodes: 71.72
Episode 443 Reward: 231.0 Average over 100 episodes: 72.9
Episode 444 Reward: 88.0 Average over 100 episodes: 73.1
Episode 445 Reward: 100.0 Average over 100 episodes: 73.63
Episode 446 Reward: 175.0 Average over 100 episodes: 75.04
Episode 447 Reward: 77.0 Average over 100 episodes: 75.38
Episode 448 Reward: 61.0 Average over 100 episodes: 75.62
Episode 44

Episode 571 Reward: 125.0 Average over 100 episodes: 175.82
Episode 572 Reward: 192.0 Average over 100 episodes: 175.88
Episode 573 Reward: 174.0 Average over 100 episodes: 175.98


KeyboardInterrupt: 