In [1]:
import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np
import gym

In [2]:
class Pi(tf.keras.Model):
    """Policy network: two 64-unit ReLU layers followed by a softmax head.

    The softmax output has `action_dim` entries, so each output row is a
    probability distribution over the discrete actions.
    """

    def __init__(self, action_dim = 1):
        super().__init__()
        self.dense_1 = tf.keras.layers.Dense(64, activation = 'relu')
        self.dense_2 = tf.keras.layers.Dense(64, activation = 'relu')
        self.dense_3 = tf.keras.layers.Dense(action_dim, activation = 'softmax')

    def call(self, inputs):
        """Forward pass: feed `inputs` through the three dense layers in order."""
        hidden = self.dense_2(self.dense_1(inputs))
        return self.dense_3(hidden)

    def process(self, observations):
        """Return action probabilities for a batch of observations.

        Delegates to Keras `predict_on_batch`, which runs `call` internally.
        """
        return self.predict_on_batch(observations)

In [3]:
class Agent():
    """Policy-gradient (REINFORCE-style) agent for a discrete action space.

    The agent stores one trajectory (states, actions, rewards) in plain lists.
    The caller appends to these lists while interacting with the environment,
    calls `learn()` to update the policy network from the stored trajectory,
    and then calls `onpolicy_reset()` to clear the lists.
    """

    def __init__(self, action_dim = 1):
        """Build the policy network and optimizer.

        Args:
            action_dim: number of discrete actions (size of the policy output).
        """
        self.policy_net = Pi(action_dim = action_dim)
        self.optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001)
        self.gamma = 0.99  # discount factor applied to future rewards

        self.onpolicy_reset()

    def onpolicy_reset(self):
        """Clear the stored trajectory (rewards, actions, states)."""
        self.rewards = []
        self.actions = []
        self.states = []

    def policy(self, state):
        """Sample an action for a single state.

        Args:
            state: array-like observation exposing `.reshape`; it is flattened
                into a batch of one.

        Returns:
            The tensor returned by `tf.random.categorical` with
            `num_samples = 1` for the single-row batch.
        """
        state = state.reshape(1, -1)
        state = tf.convert_to_tensor(state, dtype = tf.float32)
        # The network ends in a softmax, so its output is probabilities, not
        # logits; tf.random.categorical expects (unnormalized) log-probabilities.
        action_probabilities = self.policy_net(state)
        action = tf.random.categorical(
            tf.math.log(action_probabilities), num_samples = 1
        )
        return action

    def act(self, state):
        """Sample an action for `state` and return it as a squeezed NumPy value."""
        action = self.policy(state).numpy()
        return action.squeeze()

    def _discounted_returns(self):
        """Compute the discounted reward-to-go for every stored step.

        For step t this is G_t = r_t + gamma * r_{t+1} + gamma^2 * r_{t+2} + ...

        Returns:
            A list aligned with `self.rewards` (index t holds G_t); empty when
            no rewards are stored.
        """
        returns = []
        running_return = 0
        # Walk the episode backwards so each step accumulates the rewards that
        # come *after* it, then flip the list back into chronological order.
        for step_reward in reversed(self.rewards):
            running_return = step_reward + self.gamma * running_return
            returns.append(running_return)
        returns.reverse()
        return returns

    def learn(self):
        """Update the policy network from the stored trajectory.

        Performs one gradient step per stored (state, action) pair, weighting
        the negative log-probability of the taken action by that step's
        discounted return.
        """
        discounted_rewards = self._discounted_returns()

        for state, reward, action in zip(self.states, discounted_rewards, self.actions):
            with tf.GradientTape() as tape:
                action_probabilities = self.policy_net(np.array([state]), training = True)
                loss = self.loss(action_probabilities, action, reward)
            grads = tape.gradient(loss, self.policy_net.trainable_variables)
            self.optimizer.apply_gradients(zip(grads, self.policy_net.trainable_variables))

    def loss(self, action_probabilities, action, reward):
        """Return the policy-gradient loss `-log pi(action) * reward`.

        Args:
            action_probabilities: probabilities produced by the policy network.
            action: the action that was taken.
            reward: the weight for this step (the discounted return in `learn`).
        """
        dist = tfp.distributions.Categorical(
            probs = action_probabilities, dtype = tf.float32
        )
        log_prob = dist.log_prob(action)
        loss = -log_prob * reward
        return loss
        

In [None]:

env = gym.make('CartPole-v0')
#env = gym.make('MountainCar-v0')
action_dim = env.action_space.n # 2 for the cartpole
agent = Agent(action_dim = action_dim)
render = True

# try/finally so the environment is closed even if the cell is interrupted
# or an exception is raised mid-training.
try:
    for episode in range(300):
        state = env.reset()
        total_reward = 0

        for step in range(200):
            action = agent.act(state)
            # Append the state before applying the action
            agent.states.append(state)
            agent.actions.append(action)

            state, reward, done, _ = env.step(action)
            agent.rewards.append(reward)

            total_reward += reward

            if render:
                env.render()

            # Report before the termination check so the reward of the final
            # step is included in the displayed episode total.
            print(f"Episode#:{episode} ep_reward:{total_reward}", end="\r")

            if done:
                break

        # Learn from the collected trajectory and clear it whether the episode
        # terminated or simply reached the step cap; otherwise an unfinished
        # trajectory would leak into the next episode's buffers.
        agent.learn()
        agent.onpolicy_reset()
        print("\n")
finally:
    env.close()

Episode#:0 ep_reward:19.0

Episode#:1 ep_reward:16.0

Episode#:2 ep_reward:18.0

Episode#:3 ep_reward:28.0

Episode#:4 ep_reward:14.0

Episode#:5 ep_reward:26.0

Episode#:6 ep_reward:17.0

Episode#:7 ep_reward:20.0

Episode#:8 ep_reward:11.0

Episode#:9 ep_reward:11.0

Episode#:10 ep_reward:17.0

Episode#:11 ep_reward:9.0

Episode#:12 ep_reward:37.0

Episode#:13 ep_reward:33.0

Episode#:14 ep_reward:19.0

Episode#:15 ep_reward:34.0

Episode#:16 ep_reward:39.0

Episode#:17 ep_reward:38.0

Episode#:18 ep_reward:30.0

Episode#:19 ep_reward:15.0

Episode#:20 ep_reward:17.0

Episode#:21 ep_reward:19.0

Episode#:22 ep_reward:22.0

Episode#:23 ep_reward:14.0

Episode#:24 ep_reward:15.0

Episode#:25 ep_reward:10.0

Episode#:26 ep_reward:26.0

Episode#:27 ep_reward:42.0

Episode#:28 ep_reward:25.0

Episode#:29 ep_reward:63.0

Episode#:30 ep_reward:55.0

Episode#:31 ep_reward:48.0

Episode#:32 ep_reward:38.0

Episode#:33 ep_reward:14.0

Episode#:34 ep_reward:19.0

Episode#:35 ep_reward:15.0

Epi