reference:
http://www.scholarpedia.org/article/Policy_gradient_methods#Likelihood_Ratio_Methods_and_REINFORCE

pseudo code for finite-difference policy gradient

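input: policy parameterization $\theta_h$
for $i = 1$ to $I$ do
    generate policy variation $\Delta\theta_i$
    estimate $\hat{J}_i \approx J(\theta_h + \Delta\theta_i) = \left\langle \sum_{k=0}^{H} a_k r_k \right\rangle$ from roll-out
    estimate $\hat{J}_{\text{ref}}$, e.g., $\hat{J}_{\text{ref}} = J(\theta_h - \Delta\theta_i)$ from roll-out
    compute $\Delta\hat{J}_i \approx J(\theta_h + \Delta\theta_i) - J_{\text{ref}}$
end for
return gradient estimate $g_{FD} = (\Delta\Theta^{T} \Delta\Theta)^{-1} \Delta\Theta^{T} \Delta\hat{J}$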


In [4]:
import random

import numpy as np
from numpy.linalg import inv

# Seed the `random` module's generator with a fixed value so that the
# `random.random()` draws made later in this file repeat from run to run.
random.seed(123456)

In [9]:
class State:
    """Plain record of the observations that the policy functions read.

    Every field is a class-level default, so a fresh ``State()`` starts with
    full ``hp`` and all other fields at zero; assign to an instance to
    override a value for that instance only.
    """

    # game time counted in iterations or steps
    game_time = 0

    # distance to the tower and to the nearest minion; the spelling of
    # `cloest_minion_dist` is kept because other code reads this exact name
    tower_dist = 0
    cloest_minion_dist = 0

    # current hit points (the policy divides this by its `c_max_hp`)
    hp = 1.0


# module-level state instance shared by the cells below
state = State()


class Policy:
    """Hand-written stochastic policy over four actions.

    Action indices are: 0 = attack minion, 1 = attack tower,
    2 = move to goal, 3 = retreat.  Each ``*PFunc`` method maps a state to an
    unnormalised preference clamped to [0, 1]; ``pRollout`` normalises the
    four preferences into probabilities and ``actRollout`` samples an action
    index from them.

    The state argument ``x`` must expose ``hp``, ``cloest_minion_dist``,
    ``tower_dist`` and ``game_time``.
    """

    c_max_visible_dist = 20.0
    c_close_dist_cutoff = 1.0 / 10.0
    c_tower_safe_dist_cutoff = c_max_visible_dist / 3
    c_max_hp = 1
    c_game_time_beginning_cutoff = 0.5 * 60 / 5

    logistic_func_x_scale = 6

    # theta for the action:
    # atk minion, atk tower, move to goal, retreat
    # respectively
    theta = np.array([
        [c_close_dist_cutoff],
        [1.0],
        [0.8 / (45 * 60 / 5)],
        [0.2],
    ])

    # probabilities produced by the most recent pRollout call, shape (4, 1)
    p_rollout_array = np.zeros((4, 1))

    @staticmethod
    def _clamp01(p):
        """Return ``p`` as a plain float limited to the range [0, 1]."""
        p = float(p)
        if p > 1:
            return 1.0
        if p < 0:
            return 0.0
        return p

    def _thetaAt(self, k):
        """Return policy parameter ``k`` as a plain float."""
        return float(np.ravel(self.theta)[k])

    def atkMinionPFunc(self, x):
        """Preference for attacking the closest minion.

        Computes ``theta[0] * hp_frac / dist_frac`` where both fractions are
        the state values divided by their class-level maxima.
        """
        hp_frac = x.hp / self.c_max_hp
        dist_frac = x.cloest_minion_dist / self.c_max_visible_dist
        numer = self._thetaAt(0) * hp_frac

        if dist_frac == 0:
            # The ratio is unbounded at zero distance; return its clamped
            # limit instead of dividing by zero.
            return 1.0 if numer > 0 else 0.0
        return self._clamp01(numer / dist_frac)

    def atkTowerPFunc(self, x):
        """Preference for attacking the tower.

        A window built from two logistic curves (rising around -1, falling
        around 3) is evaluated at ``x.tower_dist`` and scaled by
        ``theta[1] * hp_frac``.
        """
        hp_frac = x.hp / self.c_max_hp
        tower_dist = x.tower_dist

        # TODO: the range is incorrect.  The distance is used unscaled here;
        # an earlier draft scaled it by
        # `logistic_func_x_scale / c_max_visible_dist` and centred a single
        # logistic at `c_tower_safe_dist_cutoff`.
        rising = 1 / (1 + np.exp(-2 * (tower_dist + 1)))
        falling = 1 / (1 + np.exp(-10 * (tower_dist - 3)))
        return self._clamp01(self._thetaAt(1) * hp_frac * (rising - falling))

    def mvToGoalPFunc(self, x):
        """Preference for moving to the goal.

        Always 1 before ``c_game_time_beginning_cutoff``; afterwards it is
        ``theta[2] * game_time`` clamped to [0, 1].
        """
        if x.game_time < self.c_game_time_beginning_cutoff:
            return 1.0
        return self._clamp01(self._thetaAt(2) * x.game_time)

    def mvRetreatPFunc(self, x):
        """Preference for retreating: ``exp(theta[3] / hp_frac) - 1``, clamped."""
        hp_frac = x.hp / self.c_max_hp
        theta_retreat = self._thetaAt(3)

        if hp_frac == 0:
            # The exponent is unbounded at zero HP; return the clamped limit.
            return 1.0 if theta_retreat > 0 else 0.0

        # Any exponent above ln(2) already clamps to 1, so capping it at 1
        # leaves the result unchanged while avoiding overflow for tiny HP.
        exponent = min(theta_retreat / hp_frac, 1.0)
        return self._clamp01(np.expm1(exponent))

    def varPolicy(self, policy_gradient):
        """Return the parameter variation ``0.01 * policy_gradient * theta``.

        ``policy_gradient`` is reshaped to the shape of ``theta`` first so a
        flat gradient multiplies element-wise instead of broadcasting into a
        matrix.
        """
        grad = np.reshape(policy_gradient, np.shape(self.theta))
        return 0.01 * grad * np.asarray(self.theta, dtype=float)

    def _actionProbs(self, x):
        """Return the four normalised action probabilities as a 1-D array."""
        prefs = np.array([
            self.atkMinionPFunc(x),
            self.atkTowerPFunc(x),
            self.mvToGoalPFunc(x),
            self.mvRetreatPFunc(x),
        ], dtype=float)
        total = prefs.sum()
        if total <= 0:
            # No action has any preference: fall back to a uniform choice
            # rather than dividing by zero.
            return np.full(prefs.shape, 1.0 / prefs.size)
        return prefs / total

    def actRollout(self, x):
        """Sample an action index (0..3) according to the action probabilities."""
        probs = self._actionProbs(x)
        ran = random.random()

        # Cumulative cut-offs: the first one exceeding `ran` selects the action.
        cutoffs = np.cumsum(probs)
        for action, cutoff in enumerate(cutoffs[:-1]):
            if ran < cutoff:
                return action
        return probs.size - 1

    def pRollout(self, x):
        """Return the action probabilities as a fresh (4, 1) array.

        The result is also stored on the instance as ``p_rollout_array``.
        """
        probs = self._actionProbs(x).reshape(-1, 1)
        self.p_rollout_array = probs
        return probs


policy = Policy()

print("class template set.")

class template set.


In [None]:
def retrieveStepReward(state, policy):
    """Return a placeholder one-step reward for ``policy`` in ``state``.

    The real implementation is expected to:
      - sample an action based on the given policy
      - execute the action
      - retrieve the reward

    For now, for testing purposes, the reward is the expectation of the
    artificial per-action rewards in ``retrieveStepReward.std_reward`` under
    the action probabilities returned by ``policy.pRollout(state)``.

    Args:
        state: state object handed unchanged to ``policy.pRollout``.
        policy: object whose ``pRollout(state)`` returns one probability per
                action, in the same order as ``std_reward``.

    Returns:
        float, the probability-weighted artificial reward.
    """
    action_probs = np.asarray(policy.pRollout(state), dtype=float).ravel()
    return float(np.dot(action_probs, retrieveStepReward.std_reward))


# artificial reward for
# attack minion, attack tower, move to base, move to retreat
retrieveStepReward.std_reward = np.array([1, 10, 20, 20])

In [None]:
def updateR_i(policy_grad, delta_theta):
    """Advance the running mean stored in ``updateR_i.R_i`` and return it.

    The new sample is the dot product of ``policy_grad`` and ``delta_theta``.
    It is blended into the stored value with weight ``updateR_i.a``; the old
    value keeps weight ``1 - a``.

    Args:
        policy_grad: first operand of the dot product.
        delta_theta: second operand of the dot product.

    Returns:
        The updated running mean, which is also written back to
        ``updateR_i.R_i``.
    """
    weight = updateR_i.a
    sample = np.dot(policy_grad, delta_theta)
    updateR_i.R_i = (1 - weight) * updateR_i.R_i + weight * sample
    return updateR_i.R_i


# initial running mean and blending weight
updateR_i.R_i = 1
updateR_i.a = 0.1

In [None]:
def updateR_ref(reward):
    """Advance the running-mean reference reward and return it.

    The stored value ``updateR_ref.R_ref`` is blended with ``reward`` using
    weight ``updateR_ref.a`` for the new sample and ``1 - a`` for the old
    value.

    Args:
        reward: newly observed reward.

    Returns:
        The updated running mean, which is also written back to
        ``updateR_ref.R_ref``.
    """
    a = updateR_ref.a
    R_ref = (1 - a) * updateR_ref.R_ref + a * reward
    # Store on the function attribute; assigning to the bare name
    # `updateR_ref` would make it a local and break the lookups above.
    updateR_ref.R_ref = R_ref
    return R_ref


# initial reference reward and blending weight
updateR_ref.R_ref = 1
updateR_ref.a = 0.1

In [None]:
def policyGradByFD(policy, c_numSamples=10):
    """Return the estimate of the policy gradient calculated by finite difference.

    For each sample a parameter variation is requested from
    ``policy.varPolicy``, a step reward is obtained from
    ``retrieveStepReward`` and the running means kept by ``updateR_i`` and
    ``updateR_ref`` are advanced; their difference is that sample's reward
    change.  The gradient is the least-squares solution of ``A x = b`` with
    ``A`` the stacked variations and ``b`` the reward changes, i.e.
    ``x = (A^T A)^-1 A^T b`` whenever ``A^T A`` is invertible.

    Expect:
        state: module-level state object passed to ``retrieveStepReward``.
        policyGradByFD.policy_grad: previous gradient estimate; it is passed
            to ``policy.varPolicy`` and ``updateR_i`` and is overwritten with
            the new estimate on return.

    Args:
        policy: object with a ``theta`` array of n parameters and a
                ``varPolicy(policy_gradient)`` method.
        c_numSamples: int. Number of samples to take (or equivalently number
                      of policy parameter perturbations to make).

    Returns:
        policy_grad: policy gradient estimate, numpy.ndarray of size (n, 1).

    Raises:
        ValueError: if ``c_numSamples`` is smaller than 1.
    """
    if c_numSamples < 1:
        raise ValueError("c_numSamples must be at least 1")

    theta_shape = np.shape(policy.theta)
    c_numPolicyParm = int(np.size(policy.theta))

    # Shape the previous estimate like theta so that varPolicy's
    # element-wise product yields one variation per parameter.
    prev_grad = np.reshape(policyGradByFD.policy_grad, theta_shape)

    # preallocate vectors
    delta_theta = np.zeros((c_numSamples, c_numPolicyParm))
    delta_R = np.zeros((c_numSamples, 1))

    # NOTE(review): the reward is taken from the unperturbed `policy`, and
    # varPolicy as defined in this file returns the same variation for every
    # sample of one call -- confirm this is the intended sampling scheme.
    for i in range(c_numSamples):
        delta_theta_i = np.ravel(policy.varPolicy(prev_grad))
        r = retrieveStepReward(state, policy)
        R_i = updateR_i(np.ravel(prev_grad), delta_theta_i)  # currently by running mean
        R_ref = updateR_ref(r)  # currently by running mean

        delta_theta[i] = delta_theta_i
        delta_R[i] = R_i - R_ref

    # lstsq returns the minimum-norm least-squares solution, so it also
    # works when A^T A is singular (fewer samples than parameters, or
    # collinear variations), where the explicit inverse would fail.
    policy_grad = np.linalg.lstsq(delta_theta, delta_R, rcond=None)[0]
    policyGradByFD.policy_grad = policy_grad
    return policy_grad


# initial gradient estimate used by the first call
policyGradByFD.policy_grad = np.ones((4, 1))
    

The policy gradient needs an estimate of the expected return J in order to work.

J will be estimated using TD(0).