# Chapter 4 - Policy gradients methods
### REINFORCE method

This notebook implements a REINFORCE algorithm. The policy network accepts
state vectors as inputs and produces a (discrete) probability distribution over the possible actions.

The code and theory is based on Alexander Zai. “Deep Reinforcement Learning in Action MEAP V06”. Manning publications. 

In [6]:
import numpy as np
import torch
import gym
from matplotlib import pyplot as plt
import torch.nn as nn
from torch.nn import functional as F
from collections import OrderedDict

In [7]:
env = gym.make("CartPole-v0")



In [3]:
# test environment
def testEnv(env, steps = 200):
    env.reset()
    for _ in range(steps):
        env.render()
        env.step(env.action_space.sample()) # take a random action
    env.close()

In [4]:
# Unconmment to check everything is properly installed
# A window with a cartpole randomly moving should appear. 
# Some python3-related errors might occur after closing the environment (window with the cartpole)
# Check whether kernel needs to be restarted 
 testEnv(env, 100)



In [128]:
class REINFORCE():
    def __init__(self, n_in, n_out, n_hidden1, n_hidden2, env, gamma = 0.9):
        self.n_in = n_in
        self.n_out = n_out
        self.env = env
        self.gamma = gamma
        
       # self.one_hot_reward = np.ones(arms)

        # Neural network model definition 
        self.model = nn.Sequential(OrderedDict([
            ('fc1', nn.Linear(n_in, n_hidden1)),
            ('ReLu1', nn.LeakyReLU(inplace = True)),
            ('fc2', nn.Linear(n_hidden1, n_hidden2)),
            ('ReLu2', nn.LeakyReLU(inplace = True)),
            ('fc3', nn.Linear(n_hidden2, n_out)),
            ('softmax', nn.Softmax())
        ])
        )
       # self.TNetwork = copy.deepcopy(self.QNetwork) # Target Network
       # self.TNetwork.load_state_dict(self.QNetwork.state_dict())                
    def updateEnv (self, env):
        self.env = env
        
    def playEpisode(self, eps_max_dur, init_state):
        transitions_ = []
        preds_ = []
        state_ = init_state
        for st in range(eps_max_dur): 
            action_, pred_ = self.policy.step(self.model, state_, self.n_out)
            new_state_, reward_ = self.policy.execAction(self.env, action_)
            transitions_.append((state_, action_, reward_))
            preds_.append(pred_.detach().numpy())
            if done:
                break
            state_ = new_state_

        return [transitions_, preds_]
        
    def resetEnv(self):
        self.env.reset()
        
    class policy():
        
        def step(model, state, n_outs):
            pred = model(torch.from_numpy(state).float())
            action = np.random.choice(np.array(range(n_outs)), p = pred.data.numpy())
            return action, pred
            
        def execAction (env, action):
            new_state, reward, done, info = env.step(action)
            return new_state, reward
        
        def discount_rewards(rewards, disc_factor=0.99):
            # create the discount array using the disc factor and multiply it by the rewards
            disc_rewards = torch.pow(disc_factor,torch.arange(len(rewards)).float()) * rewards 
            # Normalize rewards so as to avoid drift/shift in training
            # Add small num to den so as to avoid 0 div
            norm_dreward = (disc_rewards - disc_rewards.mean()) / (disc_rewards.std() + 1e-09) 
            return norm_dreward
        
        # to train the network:
        # 1. Calculate prob of the action taken
        # 2. Apply discount factor
        # 3. backpropagate and minimize the loss

In [129]:
learning_rate = 0.0009
l1 = 4
l2 = 150
l3 = 100
l4 = 2

In [130]:
RLmodel = REINFORCE(l1,l4,l2,l3, env)
optimizer = torch.optim.Adam(RLmodel.model.parameters(), lr = learning_rate)

In [131]:
testR = torch.Tensor([4, 3, 2, 1])
RLmodel.policy.discount_rewards(testR)

tensor([ 1.1695,  0.3796, -0.3949, -1.1542])

In [132]:
def loss_fn(preds, r):
    return -1 * torch.sum(r * torch.log(preds))

In [133]:
MAX_DUR = 200
MAX_EPISODES = 500 
gamma_ = 0.99 # discount factor
RLmodel.updateEnv(gym.make("CartPole-v0")) 

for episode in range(MAX_EPISODES):
    RLmodel.resetEnv()
    done = False 
    t = 0
    obs = [] 
    actions = []
    score = []
    [history, pred_batch] = RLmodel.playEpisode(MAX_DUR, init_state)
    ep_len = len(history) 
    score.append(ep_len)
    
    # Get rewards of the episode and discount-normalize them 
    reward_batch = torch.Tensor([r for (s,a,r) in history]).flip(dims=(0,))
    disc_returns = RLmodel.policy.discount_rewards(reward_batch, gamma_)
    # Get states the system has gone through
    #pred_batch = torch.Tensor([pred for (s,a,r,pred) in history])
    # Get actions
    action_batch = torch.Tensor([a for (s,a,r) in history])
    # Store the prediction corresponding to the action taken in each step
    pred_batch = torch.from_numpy(pred_batch).float()
    prob_batch = pred_batch.gather(dim=1,index=action_batch.long().view(-1,1)).squeeze()
    
    # backpropagate and update
    loss = loss_fn(prob_batch, disc_returns)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

TypeError: expected np.ndarray (got list)