In [136]:
import tensorflow as tf
import gym
import numpy as np
import time
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')


In [137]:
env = gym.make("CartPole-v1")

In [138]:
SEED = 42
tf.random.set_seed(SEED)

In [139]:
LAYERS = [
    tf.keras.layers.Dense(5,activation='relu'),
    tf.keras.layers.Dense(1,activation='sigmoid') # left prob # 1 < LEFT, O > RIGHT
]

model = tf.keras.Sequential(LAYERS)

In [140]:
def pg_policy(observation, model): # pg = policy gradient
    left_probability = model.predict(observation[np.newaxis]) # probability value which lies between 0 and 1
    action = int(np.random.rand() > left_probability) # Value will be 0 or 1 # explorations vs exploitation concept
    return action 


# Policy Gradients

Optimize learnable parameters of policy by following the gradients towards higher reward (maximizing reward)

## steps

    let the NN play the game multiple times and at every step just calculate the gradients (wrt reward) but dont apply it immidiately.
    Once you have completed several episodes then compute the actions using discounted method.
    result of previous step 2 can +ve or -ve



In [141]:
tf.random.uniform([1,1])

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.6645621]], dtype=float32)>

In [142]:
a = np.array([1,2])
a[np.newaxis]
a

array([1, 2])

In [143]:
def play_one_step(env, observation, model, loss_fn):
    with tf.GradientTape() as tape:
        left_prabability = model(observation[np.newaxis])
        action = (tf.random.uniform([1,1]) > left_prabability) # True and False
        y_target = tf.constant([[1.]]) - tf.cast(action, tf.float32) # 
        loss = tf.reduce_mean(loss_fn(y_target, left_prabability)) 

    grads = tape.gradient(loss, model.trainable_variables) # dc/dw
    print(int(action))
    new_observation, reward, done, info, e = env.step(int(action))
    print(new_observation)
    print(reward)
    print(done)
    print(info)
    print(e)
    return new_observation, reward, done, grads

In [144]:
def play_multiple_episodes(env, n_episodes, n_max_steps, model, loss_fn):
    all_rewards = list()
    all_grads = list()
    for episode in range(n_episodes):
        current_rewards = list()
        current_grads = list()
        observation = env.reset()
        observation = observation[0]
        for step in range(n_max_steps):
            observation, reward, done, grads = play_one_step(env, observation, model, loss_fn)
            current_rewards.append(reward)
            current_grads.append(grads)
            if done:
                break
        all_rewards.append(current_rewards)
        all_grads.append(current_grads)
    return all_rewards, all_grads


In [145]:
def discount_rewards(rewards, discount_factor):
    discounted = np.array(rewards)
    N = len(rewards)
    for step in range(N - 2, -1, -1):
        # a_n + a_n+1*gamma
        discounted[step] = discounted[step] + discounted[step + 1] * discount_factor
    return discounted

In [146]:

x = np.array([1,2])
np.concatenate([x,x])

array([1, 2, 1, 2])

In [147]:
def discount_and_normalize_rewards(all_rewards, discount_factor):
    all_discounted_rewards = list()
    for reward in all_rewards:
        # discounted rewards
        drs = discount_rewards(reward, discount_factor)
        all_discounted_rewards.append(drs)

    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()

    normalize_rewards = list()
    for discounted_rewards in all_discounted_rewards:
        nrs = (discounted_rewards - reward_mean) / reward_std
        normalize_rewards.append(nrs)
    return normalize_rewards

In [148]:


n_iterations = 150
n_episodes_per_update = 10
n_max_steps = 200
discount_factor = 0.95
learning_rate = 0.01



In [149]:
obs = env.reset(seed=SEED)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
loss_fn = tf.keras.losses.binary_crossentropy

In [150]:
r1 = [1,2,3]
r2 = [-1,-2,3]
all_rewards_1 = [r1, r2]
list(map(sum, all_rewards_1))

[6, 0]

In [151]:
sum(map(sum, all_rewards_1))

6

In [152]:
arr = [[1,2,3], [3,4,5]]
tf.reduce_mean(arr, axis=0)

<tf.Tensor: shape=(3,), dtype=int32, numpy=array([2, 3, 4], dtype=int32)>

In [153]:


for iteration in range(n_iterations):
    all_rewards, all_grads = play_multiple_episodes(
        env, n_episodes_per_update, n_max_steps, model, loss_fn
    )
    total_rewards = sum(map(sum, all_rewards))
    print(f"Iteration: {iteration + 1}/{n_iterations}",
    f"mean rewards: {total_rewards/n_episodes_per_update}"
    )
    all_final_rewards = discount_and_normalize_rewards(all_rewards, discount_factor)

    all_mean_grads = list()
    # Weight of 5 hidden nodes, bias for 5 nodes, w for output node, bias for output node
    N = len(model.trainable_variables)
    for var_index in range(N):
        temp_reduce_mean = list()
        for episode_index, final_rewards in enumerate(all_final_rewards): # rewards for every episode
            for step, final_reward in enumerate(final_rewards): # several steps
                result = final_reward * all_grads[episode_index][step][var_index]
                temp_reduce_mean.append(result)
        mean_grads = tf.reduce_mean(temp_reduce_mean, axis=0)
        all_mean_grads.append(mean_grads)
    optimizer.apply_gradients(zip(all_mean_grads, model.trainable_variables))



1


ValueError: too many values to unpack (expected 4)

In [None]:


import re
import time

unique_name = re.sub(r"[\s+:]", "_", time.asctime())
model_name = f"model_at_{unique_name}_.h5"
model.save(model_name)
print(f"model is saved as '{model_name}'")

