In [None]:
import numpy as np

class GridWorld:
    """Deterministic rectangular grid environment with a single goal cell.

    States are ``(row, col)`` tuples. Moves that would leave the grid are
    clamped, so the agent stays on the edge cell instead.

    Args:
        grid_size: ``(rows, cols)`` of the grid. Defaults to ``(3, 3)``.
        start_state: ``(row, col)`` cell where episodes begin.
            Defaults to ``(0, 0)``.
        goal_state: ``(row, col)`` cell that yields reward 1 when entered.
            Defaults to the bottom-right cell ``(rows - 1, cols - 1)``.

    Raises:
        ValueError: If either grid dimension is smaller than 1.
    """

    # Action codes understood by ``step``.
    UP, DOWN, LEFT, RIGHT = range(4)

    def __init__(self, grid_size=(3, 3), start_state=(0, 0), goal_state=None):
        rows, cols = grid_size
        if rows < 1 or cols < 1:
            raise ValueError(f"grid_size must be positive, got {grid_size!r}")
        self.grid_size = (rows, cols)
        self.num_actions = 4  # Up, Down, Left, Right
        # Stored as tuples so that ``next_state == goal_state`` in ``step``
        # compares like with like even if a list was passed in.
        self.start_state = tuple(start_state)
        if goal_state is None:
            goal_state = (rows - 1, cols - 1)
        self.goal_state = tuple(goal_state)

    def step(self, state, action):
        """Apply ``action`` in ``state`` and return ``(next_state, reward)``.

        Args:
            state: Current ``(row, col)`` position.
            action: 0 = Up, 1 = Down, 2 = Left, 3 = Right. Any other value
                leaves the position unchanged.

        Returns:
            A tuple ``(next_state, reward)`` where ``reward`` is 1 if
            ``next_state`` equals the goal cell and 0 otherwise.
        """
        row, col = state
        if action == self.UP:
            row = max(0, row - 1)
        elif action == self.DOWN:
            row = min(self.grid_size[0] - 1, row + 1)
        elif action == self.LEFT:
            col = max(0, col - 1)
        elif action == self.RIGHT:
            col = min(self.grid_size[1] - 1, col + 1)
        next_state = (row, col)
        reward = 1 if next_state == self.goal_state else 0
        return next_state, reward

class ActorCritic:
    """Tabular one-step actor-critic agent with a softmax policy.

    The actor keeps one preference per (row, col, action); the critic keeps
    one value estimate per (row, col). Both tables start at zero, so the
    initial policy is uniform over actions.

    Args:
        num_actions: Number of discrete actions available in every state.
        alpha_actor: Step size for the actor (preference) update.
        alpha_critic: Step size for the critic (value) update.
        gamma: Discount factor applied to the next state's value.
        grid_size: ``(rows, cols)`` of the state grid the tables must cover.
            Defaults to ``(3, 3)``.
    """

    def __init__(self, num_actions, alpha_actor, alpha_critic, gamma,
                 grid_size=(3, 3)):
        self.num_actions = num_actions
        self.alpha_actor = alpha_actor
        self.alpha_critic = alpha_critic
        self.gamma = gamma
        rows, cols = grid_size
        # Tabular actor parameters: action preferences per grid cell.
        self.actor_params = np.zeros((rows, cols, num_actions))
        # Tabular critic values: state-value estimate per grid cell.
        self.critic_values = np.zeros((rows, cols))

    def select_action(self, state):
        """Sample an action for ``state`` from the softmax policy.

        Args:
            state: ``(row, col)`` index into the actor table.

        Returns:
            The sampled action index in ``range(num_actions)``.
        """
        row, col = state
        probabilities = self.softmax(self.actor_params[row, col])
        return np.random.choice(self.num_actions, p=probabilities)

    def update(self, state, action, reward, next_state):
        """Perform one TD(0) actor-critic update for a single transition.

        Args:
            state: ``(row, col)`` the action was taken in.
            action: Index of the action that was taken.
            reward: Reward received for the transition.
            next_state: ``(row, col)`` reached after the action.
        """
        row, col = state
        next_row, next_col = next_state

        # TD error doubles as the advantage estimate for the actor.
        td_target = reward + self.gamma * self.critic_values[next_row, next_col]
        td_error = td_target - self.critic_values[row, col]

        # Critic: move the value estimate toward the TD target.
        self.critic_values[row, col] += self.alpha_critic * td_error

        # Actor: full softmax policy-gradient step. The gradient of
        # log pi(action | state) w.r.t. the preferences is
        # one_hot(action) - pi(. | state), so the taken action moves by
        # (1 - pi) and every other action by -pi, keeping the update
        # consistent across all preferences of this state.
        grad_log_pi = -self.softmax(self.actor_params[row, col])
        grad_log_pi[action] += 1.0
        self.actor_params[row, col] += self.alpha_actor * td_error * grad_log_pi

    def softmax(self, x):
        """Return the softmax of the 1-D array ``x``.

        The maximum is subtracted before exponentiating so large
        preferences do not overflow ``np.exp``.
        """
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum()

# --- Environment ------------------------------------------------------------
grid_world = GridWorld()

# --- Agent ------------------------------------------------------------------
# Four moves (Up, Down, Left, Right), identical step sizes for actor and
# critic, and a discount factor of 0.9.
num_actions = 4
alpha_actor = 0.1
alpha_critic = 0.1
gamma = 0.9
actor_critic_agent = ActorCritic(
    num_actions=num_actions,
    alpha_actor=alpha_actor,
    alpha_critic=alpha_critic,
    gamma=gamma,
)

# --- Training ---------------------------------------------------------------
# Each episode starts at the environment's start cell and runs until the
# goal cell is reached; the agent is updated after every single transition.
num_episodes = 1000
for _ in range(num_episodes):
    state = grid_world.start_state
    done = False
    while not done:
        action = actor_critic_agent.select_action(state)
        next_state, reward = grid_world.step(state, action)
        actor_critic_agent.update(state, action, reward, next_state)
        done = next_state == grid_world.goal_state
        state = next_state

# --- Evaluation -------------------------------------------------------------
# Roll out one episode by sampling from the trained policy (no further
# updates), printing each visited state with the action chosen there.
total_reward = 0
state = grid_world.start_state
while state != grid_world.goal_state:
    action = actor_critic_agent.select_action(state)
    print(f" State = {state}, Action = {action}")
    state, reward = grid_world.step(state, action)
    total_reward += reward

print("Total reward obtained by learned policy:", total_reward)


 State = (0, 0), Action = 3
 State = (0, 1), Action = 1
 State = (1, 1), Action = 1
 State = (2, 1), Action = 3
Total reward obtained by learned policy: 1
