## Import Dependencies

In [1]:
import gym
import os
import numpy as np

import tensorflow as tf
import tensorflow.keras as ks

## Create Environment

In [2]:
# Instantiate the Gym "LunarLander-v2" environment; the same `env` object is
# reused by the later cells of this notebook.
env = gym.make("LunarLander-v2")

In [3]:
# Network dimensions come straight from the environment's spaces: the number
# of discrete actions and the shape of a single observation.
num_actions, num_inputs = env.action_space.n, env.observation_space.shape

## Define the REINFORCE Network

A simple implementation of this algorithm would involve creating a Policy: a model that takes a state as input and **generates the probability of taking an action** as output. 

A policy guides the agent by telling it which action to take in each state. The policy is then adjusted slightly after every episode until it solves the environment.

In [15]:
def get_reinforce(input_shape, num_actions):
    """Build the policy network used by REINFORCE.

    The model maps a state to a softmax distribution over actions through
    two fully connected ReLU layers of 128 units each.

    Args:
        input_shape: Shape of a single observation, passed to the Keras
            `Input` layer.
        num_actions: Number of units in the softmax output layer (one per
            action).

    Returns:
        An uncompiled `ks.Model` producing one probability per action.
    """
    state_in = ks.layers.Input(shape=input_shape)

    # Two identical hidden layers stacked on top of the input.
    hidden = state_in
    for _ in range(2):
        hidden = ks.layers.Dense(units=128, activation='relu')(hidden)

    action_probs = ks.layers.Dense(units=num_actions, activation='softmax')(hidden)

    return ks.Model(inputs=state_in, outputs=action_probs)

## Training

The steps involved in the implementation of REINFORCE would be as follows:

1. Initialize a **Random Policy** (a NN that takes the state as input and returns the probability of actions).
2. Use the policy to **play N steps of the game** — record action probabilities (from policy), reward (from environment), action (from agent).
3. **Calculate the discounted return** for each step by accumulating the rewards backwards from the end of the episode: $G_t = r_t + \gamma G_{t+1}$.
4. **Calculate expected reward** G.
5. **Adjust weights** of Policy (back-propagate error in NN) to increase G.
6. Repeat from 2.

In [16]:
# PARAMETERS

# --- Run length ---
max_steps_per_episode = 10_000
episodes = 1_000

# --- Learning ---
alpha = 0.01  # learning rate handed to the optimizer
gamma = 0.99  # discount factor applied when accumulating returns

# --- Checkpoints ---
# Both switches are off by default; the paths are only used when the
# corresponding switch is enabled.
load_model = save_model = False
save_path = os.path.join('Saved_Models', 'LunarLander_REINFORCE_2')
load_path = os.path.join('Saved_Models', 'LunarLander_REINFORCE_1')

In [19]:
# Create the model
model = get_reinforce(num_inputs, num_actions)

# Optimizer selection
optimizer = ks.optimizers.Adam(learning_rate=alpha)

if load_model is True:
    model.load_weights(load_path)

# Per-episode buffers: filled while acting, cleared after every policy update.
action_prob_hist = []  # log pi(a_t | s_t), kept as tensors so the tape can differentiate them
action_hist = []       # sampled actions (kept for inspection; not used by the update)
reward_hist = []       # per-step rewards, including the terminal one

# LunarLander counts an episode as a solution at 200 points
# (195 is the CartPole threshold).
solved_reward = 200

# Guards the return normalization against a zero standard deviation.
eps = np.finfo(np.float32).eps.item()

running_reward = 0

# Train for at most `episodes` episodes, stopping early once solved.
for episode in range(1, episodes + 1):

    state = env.reset()
    done = False

    episode_reward = 0

    with tf.GradientTape() as tape:
        # EPISODE GENERATION
        for step in range(max_steps_per_episode):

            #env.render()

            state = tf.convert_to_tensor(state)
            state = tf.expand_dims(state, 0)

            action_probs = model(state)

            # np.random.choice validates the probabilities in float64; float32
            # softmax output can miss its tolerance, so renormalize first.
            sampling_probs = np.squeeze(action_probs).astype(np.float64)
            sampling_probs /= sampling_probs.sum()
            action = np.random.choice(num_actions, p=sampling_probs)

            state, reward, done, _ = env.step(action)

            episode_reward += reward

            # Record the transition BEFORE checking `done`, otherwise the
            # terminal step (and its landing/crash reward) is never learned from.
            action_prob_hist.append(tf.math.log(action_probs[0, action]))
            action_hist.append(action)
            reward_hist.append(reward)

            if done:
                break

        # Discounted return G_t = r_t + gamma * G_{t+1}, filled back to front.
        returns = np.zeros(len(reward_hist), dtype=np.float64)
        G = 0.0
        for t in reversed(range(len(reward_hist))):
            G = reward_hist[t] + gamma * G
            returns[t] = G

        # Normalization (eps avoids dividing by zero when all returns are equal)
        returns = (returns - returns.mean()) / (returns.std() + eps)

        # UPDATE POLICY
        # REINFORCE loss: -sum_t log pi(a_t | s_t) * G_t. It must be built inside
        # the tape context so the operations are recorded, and it must weight
        # each log-probability by its normalized return, not by the raw reward.
        log_probs = tf.stack(action_prob_hist)
        loss_value = -tf.reduce_sum(log_probs * tf.cast(returns, log_probs.dtype))

    # Differentiate and step outside the tape so these ops are not recorded.
    grads = tape.gradient(loss_value, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # Exponential moving average of the episode reward.
    running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward

    # Clear the loss and reward history
    action_prob_hist.clear()
    action_hist.clear()
    reward_hist.clear()

    if episode % 10 == 0:
        print(f'Running reward: {running_reward} (epsiode {episode})')

        if save_model is True:
            model.save_weights(save_path)

    if running_reward > solved_reward:  # Condition to consider the task solved
        print(f"Solved at episode {episode}!")

        if save_model is True:
            model.save_weights(save_path)

        break
else:
    # Only reached when the episode budget ran out without hitting `break`.
    print(f"Stopped after {episodes} episodes without reaching the solved threshold.")

# The environment is deliberately left open: the evaluation cell reuses `env`.
#env.close()

Running reward: -76.33529553681956 (epsiode 10)
Running reward: -108.12131583540261 (epsiode 20)
Running reward: -202.22965426387276 (epsiode 30)
Running reward: -191.29892265866852 (epsiode 40)
Running reward: -139.1466327814471 (epsiode 50)
Running reward: -143.58864288225996 (epsiode 60)
Running reward: -102.92698041496942 (epsiode 70)
Running reward: -130.395686159895 (epsiode 80)
Running reward: -111.8149292348624 (epsiode 90)
Running reward: -91.78967488444806 (epsiode 100)
Running reward: -78.82487885655614 (epsiode 110)
Running reward: -62.65056542430481 (epsiode 120)
Running reward: -73.659972655785 (epsiode 130)
Running reward: -78.51299557056288 (epsiode 140)
Running reward: -50.01548222856215 (epsiode 150)
Running reward: -70.06813464236625 (epsiode 160)
Running reward: -60.9790433276316 (epsiode 170)
Running reward: -57.47701732203816 (epsiode 180)
Running reward: -47.02098232408225 (epsiode 190)
Running reward: -56.50535369990282 (epsiode 200)
Running reward: -76.06417932


KeyboardInterrupt



In [20]:
# Roll out the current policy for 10 rendered episodes, sampling actions from
# the predicted distribution. The try/finally ensures the render window is
# closed even when the cell is interrupted.
try:
    for episode in range(10):

        state = env.reset()
        done = False

        for step in range(max_steps_per_episode):

            env.render()

            state = tf.convert_to_tensor(state)
            state = tf.expand_dims(state, 0)

            action_probs = model(state)

            # np.random.choice validates the probabilities in float64; float32
            # softmax output can miss its tolerance, so renormalize first.
            sampling_probs = np.squeeze(action_probs).astype(np.float64)
            sampling_probs /= sampling_probs.sum()
            action = np.random.choice(num_actions, p=sampling_probs)

            state, reward, done, _ = env.step(action)

            if done:
                break
finally:
    env.close()


KeyboardInterrupt

