In [3]:
from abc import ABC, abstractmethod
import numpy as np
import gymnasium as gym
import matplotlib.pyplot as plt

from tqdm import tqdm

import lib  #helper functions

In [4]:
class Agent(ABC):
  """Abstract base for agents with a softmax policy over linear preferences.

  Each action ``a`` owns one weight row ``thetas[a]``; the preference of
  ``a`` in a state is the dot product of that row with the tile-coded
  feature vector of the state (see ``z``).  Subclasses supply the learning
  rule by implementing ``update``.
  """

  def __init__(self, env: "gym.Env", epsilon=0.1, alpha=0.1, gamma=1, num_tilings=8,
               tiles_per_tiling=10):
    """Store hyper-parameters, build the tile coder and draw initial weights.

    Args:
      env: environment; ``observation_space.low/high`` and
        ``action_space.n`` are read from it.
      epsilon: stored on the instance; not read by any method of this class.
      alpha: learning rate.
      gamma: discount factor.
      num_tilings: forwarded to ``lib.TileCoder``.
      tiles_per_tiling: forwarded to ``lib.TileCoder``.
    """
    self.env = env
    self.epsilon = epsilon  # epsilon greedy
    self.alpha = alpha  # learning rate
    self.gamma = gamma  # discount factor
    self.num_tilings = num_tilings
    self.tiles_per_tiling = tiles_per_tiling

    # One (low, high) pair per observation dimension.
    bounds = list(zip(env.observation_space.low, env.observation_space.high))
    self.tile_coder = lib.TileCoder(self.num_tilings, self.tiles_per_tiling, bounds)

    self.thetas = None
    # Items are (state_t, action_t, reward_t) tuples.  Nothing in this class
    # appends to the list; it is only initialised here.
    self.trajectory: list = []
    self.reset()

  def z(self, state, action) -> float:
    """Return the linear preference ``thetas[action] . features(state)``."""
    features = self.tile_coder.encode(state)  # binary vec

    return np.sum(self.thetas[action] * features)

  def select_action(self, state, temp=1) -> int:
    """Sample an action from the softmax of the preferences.

    Args:
      state: observation passed to the tile coder.
      temp: softmax temperature; preferences are divided by it.

    Returns:
      Index of the sampled action in ``range(env.action_space.n)``.
    """
    n_actions = self.env.action_space.n
    scaled = np.array([self.z(state, a) for a in range(n_actions)], dtype=float) / temp
    # Softmax is invariant to a constant shift; removing the maximum keeps
    # exp() from overflowing to inf (which would give NaN probabilities).
    scaled -= np.max(scaled)
    weights = np.exp(scaled)
    probabilities = weights / np.sum(weights)

    return int(np.random.choice(n_actions, p=probabilities))

  @abstractmethod
  def update(self, state, action, reward, next_state, next_action, done) -> None:
    """Apply the subclass' learning rule for one observed transition."""
    pass

  def reset(self):
    """Re-draw ``thetas`` uniformly from [-0.001, 0.001).

    The shape is ``(n_actions, total_tiles)``, using this agent's own
    environment rather than whatever ``env`` happens to exist globally.
    """
    # params setup
    self.thetas = np.random.uniform(-0.001, 0.001,
                                    (self.env.action_space.n, self.tile_coder.total_tiles))


class REINFORCE(Agent):
  """Monte-Carlo policy gradient for the softmax-linear policy of ``Agent``."""

  def update(self, state, action, reward, next_state, next_action, done) -> None:
    """Run one REINFORCE sweep over ``self.trajectory`` when an episode ends.

    Does nothing until ``done`` is true.  The transition arguments are not
    used; the sweep reads the ``(state, action, reward)`` tuples stored in
    ``self.trajectory`` and empties that list afterwards.

    The gradient assumes the policy is the temperature-1 softmax of the
    per-action preferences ``thetas[a] . features``.
    """
    if not done:
      return

    G = 0.0  # return accumulated from the end of the episode backwards
    for st, at, rt in reversed(self.trajectory):
      G = self.gamma * G + rt
      features = np.asarray(self.tile_coder.encode(st), dtype=float)

      # Action probabilities under the current weights (max-shifted softmax
      # so that exp() cannot overflow).
      preferences = self.thetas @ features
      preferences -= np.max(preferences)
      policy = np.exp(preferences)
      policy /= np.sum(policy)

      # d ln pi(at|st) / d thetas[b] = (1[b == at] - pi(b|st)) * features,
      # so every action row moves, not only the row of the action taken.
      score = -policy
      score[at] += 1.0
      self.thetas += self.alpha * G * np.outer(score, features)

    self.trajectory = []  # Clear trajectory


class ActorCritic(Agent):
  """One-step actor-critic on the shared ``thetas`` of ``Agent``.

  NOTE(review): the critic estimate and the policy preferences both come
  from ``self.z`` / ``self.thetas``, i.e. actor and critic share parameters.
  That design is kept as found; confirm it is intended, since a separate
  critic weight vector is the textbook formulation.
  """

  def update(self, state, action, reward, next_state, next_action, done) -> None:
    """Apply the critic (TD) step and the actor (policy-gradient) step.

    Args:
      state, action, reward: the transition just taken.
      next_state, next_action: successor pair; ignored when ``done``.
      done: whether the episode ended, in which case the bootstrap is 0.

    The actor gradient assumes the temperature-1 softmax policy over the
    per-action preferences ``thetas[a] . features``.
    """
    features = np.asarray(self.tile_coder.encode(state), dtype=float)

    # TD error, evaluated before any weight is touched.
    current_value = self.z(state, action)
    next_value = 0 if done else self.z(next_state, next_action)
    delta = reward + self.gamma * next_value - current_value

    # Action probabilities at the pre-update weights; one encoding serves
    # all actions and the max-shift keeps exp() from overflowing.
    preferences = self.thetas @ features
    preferences -= np.max(preferences)
    policy = np.exp(preferences)
    policy /= np.sum(policy)

    # Update critic
    self.thetas[action] += self.alpha * delta * features

    # Update actor
    # d ln pi(action|state) / d thetas[b] = (1[b == action] - pi(b|state)) * features
    score = -policy
    score[action] += 1.0
    self.thetas += self.alpha * delta * np.outer(score, features)
