In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gym
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras import Model

from tensorflow import keras

#tf.enable_eager_execution()

from tensorflow.keras.layers import Conv1D, Flatten

from baselines.common.atari_wrappers import make_atari, wrap_deepmind



# Actor-Critic model

In [3]:
seed=123
gamma=0.99
max_steps_per_episode=1000
env=gym.make('CartPole-v0')
env.seed(seed)
eps = np.finfo(np.float32).eps.item()

In [42]:
env.action_space

Discrete(2)

In [43]:
env.reward_range

(-inf, inf)

In [44]:
env.observation_space

Box(4,)

In [45]:
env.spec

EnvSpec(CartPole-v0)

In [46]:
env.metadata

{'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 50}

In [58]:
env.reset()
env.step(env.action_space.sample())

(array([ 0.17472449, -0.98461736, -1.5274704 ]),
 -1.8011789172338466,
 False,
 {})

In [48]:
env.action_space.sample()

1

In [67]:
env.action_space.high

array([2.], dtype=float32)

In [54]:
env.observation_space.high

array([4.8000002e+00, 3.4028235e+38, 4.1887903e-01, 3.4028235e+38],
      dtype=float32)

In [53]:
env.observation_space.low

array([-4.8000002e+00, -3.4028235e+38, -4.1887903e-01, -3.4028235e+38],
      dtype=float32)

In [4]:
num_inputs=4
num_actions=2
num_hidden=128

inputs=Input((num_inputs,))
hidden=Dense(num_hidden,activation='relu')(inputs)
action=Dense(num_actions,activation='softmax')(hidden)
critic=Dense(1)(hidden)
model=Model(inputs=inputs,outputs=[action,critic])

In [4]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 4)]          0                                            
__________________________________________________________________________________________________
dense (Dense)                   (None, 128)          640         input_1[0][0]                    
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 2)            258         dense[0][0]                      
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 1)            129         dense[0][0]                      
Total params: 1,027
Trainable params: 1,027
Non-trainable params: 0
__________________________

In [64]:
tf.print(action_probs)

[[0.122754373]]


In [74]:
optimizer=keras.optimizers.Adam(learning_rate=0.01)
huber_loss=keras.losses.Huber()
action_probs_history=[]
critic_value_history=[]
rewards_history=[]
running_reward=0
episode_count=0

while True:
    state=env.reset()
    episode_reward=0
    with tf.GradientTape() as tape:
        for timestep in range(1,max_steps_per_episode):
            state=tf.convert_to_tensor(state)
            state=tf.expand_dims(state,0)
            
            action_probs,critic_value=model(state)
            critic_value_history.append(critic_value[0,0])
            
            
            action=np.random.choice(num_actions,p=np.squeeze(action_probs))
            
            action_probs_history.append(tf.math.log(action_probs[0,action]))
            
            env.render()
            state,reward,done,info=env.step(action)
            rewards_history.append(reward)
            episode_reward+=reward
            
            if done:
                break
                
        running_reward=0.05*episode_reward+(1-0.05)*running_reward
        
        returns=[]
        discounted_sum=0
        for r in rewards_history[::-1]:
            discounted_sum=r+gamma*discounted_sum
            returns.insert(0,discounted_sum)
        
        returns=np.array(returns)
        returns=(returns-np.mean(returns))/(np.std(returns)+eps)
        returns=returns.tolist()
        history=zip(action_probs_history,critic_value_history,returns)
        actor_losses=[]
        critic_losses=[]
        for log_prob,value,ret in history:
            diff=ret-value
            actor_losses.append(-log_prob*diff)
            critic_losses.append(huber_loss(tf.expand_dims(value,0),tf.expand_dims(ret,0)))
            
        losses_value=sum(actor_losses)+sum(critic_losses)
        grads=tape.gradient(losses_value,model.trainable_variables)
        optimizer.apply_gradients(zip(grads,model.trainable_variables))
        action_probs_history.clear()
        critic_value_history.clear()
        rewards_history.clear()
    
    episode_count+=1
    if episode_count % 10 == 0:
        template = "running reward: {:.2f} at episode {}"
        print(template.format(running_reward, episode_count))

    if running_reward > 195:  # Condition to consider the task solved
        print("Solved at episode {}!".format(episode_count))
        break


running reward: 80.25 at episode 10
running reward: 128.30 at episode 20
running reward: 157.07 at episode 30
running reward: 174.30 at episode 40
running reward: 184.61 at episode 50
running reward: 190.79 at episode 60
running reward: 194.48 at episode 70
Solved at episode 72!


In [75]:
env.close()