In [1]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

np.random.seed(17)
tf.random.set_seed(17)




In [2]:
env = gym.make('CartPole-v0')
env.seed(17)

  logger.warn(


[17]

In [3]:
env.action_space

Discrete(2)

In [5]:
num_inputs = 4
num_actions = 2

num_hidden = 64

inputs = layers.Input(shape=(num_inputs,))
common_1 = layers.Dense(num_hidden, activation="relu")(inputs)
common_2 = layers.Dense(num_hidden, activation="relu")(common_1)
action = layers.Dense(num_actions, activation="softmax")(common_2)
critic = layers.Dense(1)(common_2)

model = keras.Model(inputs=inputs, outputs=[action, critic])

In [6]:
optimizer = keras.optimizers.Adam(learning_rate=0.01)
loss = keras.losses.Huber()

In [7]:
gamma = 0.995
max_steps_per_episode = 1000
eps = np.finfo(np.float32).eps.item()

action_probs_history = []
critic_value_history = []
rewards_history = []

running_reward = 0
episode_count = 0

while True:
    state = env.reset()
    episode_reward = 0
    
    with tf.GradientTape() as tape:
        for timestep in range(1, max_steps_per_episode):
            env.render()
            
            state = tf.convert_to_tensor(state)
            state = tf.expand_dims(state, 0)
            
            action_probs, critic_value = model(state)
            critic_value_history.append(critic_value[0, 0])
            
            action = np.random.choice(num_actions, p=np.squeeze(action_probs))
            action_probs_history.append(tf.math.log(action_probs[0, action]))
            
            state, reward, done, _ = env.step(action)
            
            rewards_history.append(reward)
            episode_reward += reward
            
            if done:
                break
            
        running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward
        returns = []
        discounted_sum = 0
        for r in rewards_history[::-1]:
            discounted_sum = r + gamma * discounted_sum
            returns.insert(0, discounted_sum)

        returns = np.array(returns)
        returns = (returns - np.mean(returns))/(np.std(returns) + eps)
        returns = returns.tolist()

        history = zip(action_probs_history, critic_value_history, returns)
        actor_losses = []
        critic_losses =[]

        for log_prob, value, ret in history:
            diff = ret - value
            actor_losses.append(-log_prob * diff)
            critic_losses.append(loss(tf.expand_dims(value, 0), tf.expand_dims(ret, 0)))

        loss_value = sum(actor_losses) + sum(critic_losses)
        grads = tape.gradient(loss_value, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        action_probs_history.clear()
        critic_value_history.clear()
        rewards_history.clear()
    
    episode_count +=1
    
    if episode_count % 10 == 0:
        print(f'Episode: {episode_count}  running_reward: {running_reward:.2f}')
        
    if running_reward > 195:
        print(f"Solved at episode {episode_count}!")
        break

env.close()    

Episode: 10  running_reward: 8.45
Episode: 20  running_reward: 11.34
Episode: 30  running_reward: 13.18
Episode: 40  running_reward: 18.95
Episode: 50  running_reward: 21.21
Episode: 60  running_reward: 43.92
Episode: 70  running_reward: 74.22
Episode: 80  running_reward: 111.04
Episode: 90  running_reward: 107.12
Episode: 100  running_reward: 83.94
Episode: 110  running_reward: 87.98
Episode: 120  running_reward: 115.91
Episode: 130  running_reward: 128.33
Episode: 140  running_reward: 117.30
Episode: 150  running_reward: 129.44
Episode: 160  running_reward: 137.00
Episode: 170  running_reward: 161.71
Episode: 180  running_reward: 147.79
Episode: 190  running_reward: 135.06
Episode: 200  running_reward: 140.24
Episode: 210  running_reward: 121.26
Episode: 220  running_reward: 129.15
Episode: 230  running_reward: 132.02
Episode: 240  running_reward: 124.40
Episode: 250  running_reward: 115.77
Episode: 260  running_reward: 110.43
Episode: 270  running_reward: 119.86
Episode: 280  runnin

In [13]:
env = gym.make('CartPole-v0')
max_test_timestaps = 100000
test_episodes = 10

for ep in range(test_episodes):
    state = env.reset()
    for timestap in range(max_test_timestaps):
        env.render()
        
        state = tf.convert_to_tensor(state)
        state = tf.expand_dims(state, 0)
        
        action_probs, _ = model(state)
        
        if action_probs[0][0].numpy() >=0.5:
            action = 0
        else:
            action = 1
            
        state, _, done, _ =env.step(action)
        
        if done:
            break
env.close()