# Actor-Critic for CartPole-v1

### Tile Coding

First, we implement basic tile coding to map the continuous state space of the CartPole-v1 task to a feature vector. If the `lower_bounds` parameter consists of $n$ floats $a_0, \dots, a_{n-1}$, the `upper_bounds` parameter consists of $n$ floats $b_0, \dots, b_{n-1}$, and the `divisions` parameter consists of $n$ positive integers $d_0, \dots, d_{n-1}$, then we tile the space $[a_0, b_0) \times \cdots \times [a_{n-1}, b_{n-1})$ uniformly by dividing the $j^\mathrm{th}$ dimension $[a_j, b_j)$ into $d_j$ equal half-open intervals. Suppose $k_0, \dots, k_{n-1}$ are the zero-indexed indices of the intervals in which a point $p$ lies. That is,
$$p_j \in \bigg[a_j + k_j \frac{b_j - a_j}{d_j}, a_j + (k_j + 1) \frac{b_j - a_j}{d_j}\bigg)$$
for each $j$. Then the corresponding feature vector is the $d_0 \cdots d_{n-1}$-component vector consisting of all zeros except for a one at the row-major index
$$\sum_{j=0}^{n-1} (d_{j+1} \cdots d_{n-1}) k_j,$$
where the empty product for $j = n-1$ is taken to be $1$.

Note also that if $p_j < a_j$, then we set $k_j = 0$, and if $p_j \geq b_j$, then we set $k_j = d_j - 1$.

In [46]:
import numpy as np

class TileCoder:
    """Map a point of a bounded box to a one-hot feature vector.

    Dimension ``j`` of the box ``[lower_bounds[j], upper_bounds[j])`` is cut
    into ``divisions[j]`` equal half-open intervals. The feature vector has
    one component per tile and is zero everywhere except at the tile that
    contains the point.
    """

    def __init__(self, lower_bounds, upper_bounds, divisions):
        """Store the tiling description.

        Args:
            lower_bounds: Sequence of ``n`` lower bounds, one per dimension.
            upper_bounds: Sequence of ``n`` upper bounds, one per dimension.
            divisions: Sequence of ``n`` positive interval counts.
        """
        self.lower_bounds = lower_bounds
        self.upper_bounds = upper_bounds
        self.divisions = divisions

    def make_feature_vector(self, point):
        """Return the one-hot feature vector of the tile containing ``point``.

        Coordinates below the lower bound are assigned to the first interval
        and coordinates at or above the upper bound to the last one.

        Args:
            point: Sequence of ``n`` coordinates.

        Returns:
            A float array with ``prod(divisions)`` components, all zero except
            for a single one at the row-major index of the tile.
        """
        lower = np.asarray(self.lower_bounds, dtype=float)
        upper = np.asarray(self.upper_bounds, dtype=float)
        divisions = np.asarray(self.divisions, dtype=int)
        point = np.asarray(point, dtype=float)

        # Fraction of the way through each dimension, scaled to interval units.
        scaled = (point - lower) / (upper - lower) * divisions
        # Clamp before the integer cast so that out-of-range (even infinite)
        # coordinates land in the first or last interval.
        indices = np.clip(np.floor(scaled), 0, divisions - 1).astype(int)

        # Row-major flattening via Horner's scheme:
        # index = sum_j (d_{j+1} * ... * d_{n-1}) * k_j.
        flat_index = 0
        for count, index in zip(divisions, indices):
            flat_index = flat_index * int(count) + int(index)

        x = np.zeros(int(np.prod(divisions)))
        x[flat_index] = 1
        return x

In [47]:
# Shared tile coder for the four-component CartPole-v1 state: ten intervals
# per dimension, i.e. 10**4 tiles in total.
tc = TileCoder(
    lower_bounds=np.array([-4.8, -3.0, -0.418, -3.0]),
    upper_bounds=np.array([4.8, 3.0, 0.418, 3.0]),
    divisions=np.array([10] * 4),
)

### Actor-Critic with Eligibility Traces

In [53]:
import random

# The discrete actions available to the agent.
action_space = [0, 1]

def choose_action(policy, feature_vector, policy_parameters):
    """Sample an action from ``action_space`` according to the policy.

    Args:
        policy: Callable ``policy(feature_vector, policy_parameters)`` that
            returns one non-negative weight per entry of ``action_space``.
        feature_vector: Feature vector of the current state.
        policy_parameters: Parameters passed through to ``policy``.

    Returns:
        The sampled element of ``action_space``.
    """
    # random.choices returns a list of samples; take the single draw.
    return random.choices(
        action_space,
        weights=policy(feature_vector, policy_parameters)
    )[0]

def actor_critic_episode(
    env,
    policy, state_value,
    policy_parameters, state_value_parameters,
    log_policy_grad, state_value_grad,
    learning_rate_actor, learning_rate_critic,
    trace_decay_actor, trace_decay_critic
):
    """Run one undiscounted episode of actor-critic with eligibility traces.

    The environment is used through ``env.reset()`` returning the initial
    state and ``env.step(action)`` returning a 4-tuple
    ``(next_state, reward, done, info)``. States are converted to feature
    vectors with the module-level tile coder ``tc``.

    Args:
        env: Environment exposing ``reset`` and ``step`` as described above.
        policy: Callable ``policy(feature_vector, policy_parameters)`` giving
            the action weights used by ``choose_action``.
        state_value: Callable ``state_value(feature_vector, parameters)``.
        policy_parameters: Actor parameters; updated in place with ``+=``.
        state_value_parameters: Critic parameters; updated in place with ``+=``.
        log_policy_grad: Callable
            ``log_policy_grad(action, feature_vector, policy_parameters)``.
        state_value_grad: Callable
            ``state_value_grad(feature_vector, state_value_parameters)``.
        learning_rate_actor: Step size of the actor update.
        learning_rate_critic: Step size of the critic update.
        trace_decay_actor: Decay factor of the actor eligibility trace.
        trace_decay_critic: Decay factor of the critic eligibility trace.

    Returns:
        The tuple ``(policy_parameters, state_value_parameters)`` after the
        episode has finished.
    """
    z_actor = np.zeros_like(policy_parameters)
    z_critic = np.zeros_like(state_value_parameters)

    state = env.reset()
    feature_vector = tc.make_feature_vector(state)

    done = False
    while not done:
        action = choose_action(policy, feature_vector, policy_parameters)
        next_state, reward, done, _ = env.step(action)
        next_feature_vector = tc.make_feature_vector(next_state)

        # Each trace decays its own previous value before accumulating the
        # gradient evaluated at the current state.
        z_actor = trace_decay_actor * z_actor + log_policy_grad(action, feature_vector, policy_parameters)
        z_critic = trace_decay_critic * z_critic + state_value_grad(feature_vector, state_value_parameters)

        # A terminal state has value zero, so do not bootstrap from it.
        next_value = 0.0 if done else state_value(next_feature_vector, state_value_parameters)
        delta = reward + next_value - state_value(feature_vector, state_value_parameters)

        policy_parameters += learning_rate_actor * delta * z_actor
        state_value_parameters += learning_rate_critic * delta * z_critic

        feature_vector = next_feature_vector

    return policy_parameters, state_value_parameters

def actor_critic(
    env,
    policy, state_value,
    policy_parameters, state_value_parameters,
    log_policy_grad, state_value_grad,
    learning_rate_actor, learning_rate_critic,
    trace_decay_actor, trace_decay_critic,
    num_episodes=10
):
    """Train by running ``actor_critic_episode`` ``num_episodes`` times.

    All arguments other than ``num_episodes`` are forwarded unchanged to
    ``actor_critic_episode``; the parameters returned by one episode are the
    starting parameters of the next.

    Args:
        num_episodes: Number of episodes to run. With ``0`` the parameters
            are returned untouched.

    Returns:
        The tuple ``(policy_parameters, state_value_parameters)`` after the
        last episode.
    """
    for _ in range(num_episodes):
        policy_parameters, state_value_parameters = actor_critic_episode(
            env,
            policy, state_value,
            policy_parameters, state_value_parameters,
            log_policy_grad, state_value_grad,
            learning_rate_actor, learning_rate_critic,
            trace_decay_actor, trace_decay_critic
        )

    return policy_parameters, state_value_parameters

### Training

In [None]:
def policy(feature_vector, policy_parameters):
    """Softmax policy over linear action preferences.

    ``policy_parameters`` is viewed as a matrix with one row per entry of
    ``action_space``; each action preference is the dot product of its row
    with the feature vector.

    Args:
        feature_vector: Feature vector of the current state.
        policy_parameters: Array whose size is
            ``len(action_space) * len(feature_vector)``.

    Returns:
        Array of action probabilities, one per entry of ``action_space``.
    """
    pref_matrix = policy_parameters.reshape(len(action_space), -1)
    prefs = pref_matrix.dot(feature_vector)
    # Shifting by the maximum leaves the softmax unchanged but prevents
    # overflow in exp for large preferences.
    exps = np.exp(prefs - np.max(prefs))
    return exps / np.sum(exps)

def log_policy_grad(action, feature_vector, policy_parameters):
    """Gradient of ``log pi(action | state)`` for a softmax-linear policy.

    The parameters are viewed as a matrix with one row of preferences
    weights per action. For such a policy the gradient with respect to row
    ``b`` is ``(1[b == action] - pi(b | state)) * feature_vector``.

    NOTE(review): ``action`` is used directly as the row index, which assumes
    the actions are the integers ``0 .. num_actions - 1`` — confirm if the
    action set changes.

    Args:
        action: Integer index of the action that was taken.
        feature_vector: Feature vector of the state.
        policy_parameters: Array whose size is a multiple of the feature
            vector length.

    Returns:
        Float array with the same shape as ``policy_parameters``.
    """
    features = np.asarray(feature_vector, dtype=float)
    pref_matrix = np.asarray(policy_parameters, dtype=float).reshape(-1, features.size)

    prefs = pref_matrix.dot(features)
    # Max-shifted softmax: same probabilities, no overflow in exp.
    exps = np.exp(prefs - np.max(prefs))
    probabilities = exps / np.sum(exps)

    indicator = np.zeros_like(probabilities)
    indicator[action] = 1.0

    gradient = np.outer(indicator - probabilities, features)
    return gradient.reshape(np.shape(policy_parameters))

def state_value(feature_vector, state_value_parameters):
    """Linear state-value estimate: parameters dotted with the features.

    Args:
        feature_vector: Feature vector of the state.
        state_value_parameters: Weight vector of the same length.

    Returns:
        The estimated state value as a float.
    """
    return float(np.dot(state_value_parameters, feature_vector))

def state_value_grad(feature_vector, state_value_parameters):
    """Gradient of a linear state value with respect to its parameters.

    For a value that is the dot product of the parameters with the feature
    vector, the gradient is the feature vector itself and does not depend on
    the current parameter values.

    Args:
        feature_vector: Feature vector of the state.
        state_value_parameters: Weight vector (unused; kept for a uniform
            gradient signature).

    Returns:
        A new float array holding a copy of ``feature_vector``.
    """
    return np.array(feature_vector, dtype=float)