# Reinforce 
Also called Monte Carlo Policy Gradient, REINFORCE aims to find the best policy by performing gradient ascent on the 
expected return function, which is defined as the reward-weighted sum of the probabilities of the trajectories.

## Algorithm

### Reward Function 
$R(\tau) = r_1 + r_2 + r_3 + \dots + r_{H+1}$
### Trajectory
$\tau$ is defined as $\{s_0,a_0, s_1,a_1, s_2,a_2,\dots,s_H,a_H,s_{H+1}\}$

---
We need to maximize $U(\theta) = \sum\limits_{\tau} P(\tau;\theta)R(\tau)$

In [4]:
import gym
import tensorflow as tf
import numpy as np
from collections import deque
import math

In [3]:
# Create the CartPole environment and seed it.
env = gym.make('CartPole-v0')
env.seed(0)
# Report the observation and action spaces of the environment.
for label, space in (
    ('observation space:', env.observation_space),
    ('action space:', env.action_space),
):
    print(label, space)

# noinspection PyAbstractClass
class Policy(tf.keras.Model):
    """Two-layer softmax policy network over a discrete action space.

    A hidden ReLU layer followed by a softmax layer that outputs one
    probability per action.

    Args:
        s_size: Size of the state vector. Not used to build the layers;
            kept so the constructor signature stays unchanged.
        h_size: Number of units in the hidden ReLU layer.
        a_size: Number of discrete actions (width of the softmax output).
    """

    def __init__(self, s_size=4, h_size=16, a_size=2):
        super().__init__()
        self.a_size = a_size
        self.d1 = tf.keras.layers.Dense(h_size, activation='relu')
        self.d2 = tf.keras.layers.Dense(a_size, activation='softmax')

    # noinspection PyMethodOverriding
    def call(self, x):
        """Return the action probabilities for a batch of states."""
        x = self.d1(x)
        return self.d2(x)

    def act(self, state):
        """Sample an action index for a single (unbatched) state.

        Args:
            state: One state vector, without a batch dimension.

        Returns:
            The sampled action index, drawn from ``range(self.a_size)``
            according to the policy's output probabilities.
        """
        # tf.expand_dims requires an explicit axis; axis=0 adds the batch
        # dimension the dense layers expect.
        batch = tf.expand_dims(state, axis=0)
        probs = np.asarray(self(batch), dtype=np.float64).reshape(-1)
        # The softmax is computed in float32, so its sum can miss 1.0 by more
        # than np.random.choice tolerates; renormalise in float64.
        probs = probs / probs.sum()
        return np.random.choice(self.a_size, p=probs)
    

observation space: Box(4,)
action space: Discrete(2)


In [None]:
# Policy network and the optimizer used to update it during training.
policy = Policy()
# `learning_rate` is the supported keyword; `lr` is a deprecated alias that
# newer Keras releases no longer accept.
opt = tf.keras.optimizers.Adam(learning_rate=0.01)

def reinforce(n_episodes=1000, max_t=1000, gamma=1.0, print_every=100):
    """Train the module-level ``policy`` on ``env`` with REINFORCE.

    Each episode is rolled out with actions sampled from the policy. The
    discounted return ``R`` of the whole trajectory then weights the summed
    log-probabilities of the actions taken, and one optimizer step is applied
    to the loss ``-R * sum(log pi(a_t | s_t))``.

    Uses the module-level ``env``, ``policy`` and ``opt``. ``env`` is driven
    through the classic gym interface: ``reset()`` returns the observation and
    ``step()`` returns ``(state, reward, done, info)``.

    Args:
        n_episodes: Maximum number of training episodes.
        max_t: Maximum number of steps per episode.
        gamma: Discount factor applied to rewards.
        print_every: Print the running average score every this many episodes.

    Returns:
        List with the total (undiscounted) reward of every episode played.
    """
    scores_deque = deque(maxlen=100)
    # noinspection PyShadowingNames
    scores = []
    for i_episode in range(1, n_episodes + 1):
        states = []
        actions = []
        rewards = []
        state = env.reset()
        for t in range(max_t):
            action = policy.act(state)
            states.append(state)
            actions.append(action)
            state, reward, done, _ = env.step(action)
            rewards.append(reward)
            if done:
                break
        episode_score = sum(rewards)
        scores_deque.append(episode_score)
        scores.append(episode_score)

        # Discounted return of the whole trajectory.
        R = sum(gamma ** k * r for k, r in enumerate(rewards))

        if states:
            # Gradients only flow through operations recorded by the tape, so
            # the action probabilities are recomputed here instead of being
            # stored during the rollout.
            with tf.GradientTape() as tape:
                probs = policy(np.asarray(states, dtype=np.float32))
                # Pick out the probability of the action taken at each step.
                taken = tf.reduce_sum(
                    probs * tf.one_hot(actions, policy.a_size), axis=1)
                policy_loss = -tf.reduce_sum(tf.math.log(taken)) * R
            grads = tape.gradient(policy_loss, policy.trainable_variables)
            opt.apply_gradients(zip(grads, policy.trainable_variables))

        if i_episode % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
        # Only declare the task solved once the average covers a full window
        # of 100 episodes, so the reported episode count cannot be negative.
        window_full = len(scores_deque) == scores_deque.maxlen
        if window_full and np.mean(scores_deque) >= 195.0:
            print('Environment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_deque)))
            break

    return scores
    
# Run training with the default hyperparameters and keep the per-episode scores.
scores = reinforce()

In [11]:
# Scratch check: a single multinomial trial over two outcomes with
# probabilities 0.1 and 0.9; the result is a count vector with one entry set to 1.
np.random.multinomial(1, [0.1,0.9])

array([0, 1])