In [4]:
import numpy as np

In [7]:
class GridWorld:
    """A small deterministic 3x4 grid environment.

    States are ``(row, col)`` tuples. Four actions are available:
    0 = Up, 1 = Down, 2 = Left, 3 = Right. A move that would leave the
    grid is clamped to the edge, so the agent stays where it is.
    """

    def __init__(self):
        """Set up the grid dimensions, reward table and start state."""
        self.grid_size = (3, 4)
        self.num_actions = 4  # Up, Down, Left, Right

        # Every cell yields 0 except the bottom-right one, which yields +1.
        reward_table = np.zeros(self.grid_size, dtype=int)
        reward_table[-1, -1] = 1
        self.rewards = reward_table

        # The agent begins in the bottom-left cell.
        self.start_state = (2, 0)

    def step(self, state, action):
        """Apply ``action`` in ``state`` and return the outcome.

        Args:
            state: Current ``(row, col)`` position.
            action: 0 (Up), 1 (Down), 2 (Left) or 3 (Right). Any other
                value leaves the position unchanged.

        Returns:
            ``(next_state, reward)`` where ``reward`` is the reward-table
            entry of the cell that was entered.
        """
        r, c = state
        n_rows, n_cols = self.grid_size

        if action == 0:  # Up: towards row 0
            r = max(0, r - 1)
        elif action == 1:  # Down: towards the last row
            r = min(n_rows - 1, r + 1)
        elif action == 2:  # Left: towards column 0
            c = max(0, c - 1)
        elif action == 3:  # Right: towards the last column
            c = min(n_cols - 1, c + 1)

        return (r, c), self.rewards[r, c]


In [8]:
def generate_episode(grid_world, terminal_state=(2, 3), max_steps=None):
    """Generate a single episode using a uniformly random policy.

    Args:
        grid_world: Environment exposing ``start_state``, ``num_actions``
            and ``step(state, action) -> (next_state, reward)``.
        terminal_state: State that ends the episode. Defaults to
            ``(2, 3)``, the bottom-right cell of the 3x4 grid.
        max_steps: Optional cap on the number of transitions. ``None``
            (the default) keeps sampling until ``terminal_state`` is
            reached.

    Returns:
        A list of ``(state, action, reward)`` tuples, one per transition,
        where ``reward`` is the reward received for leaving ``state`` via
        ``action``. The terminal state itself is never recorded as a
        ``state`` entry. The list is empty if the start state is already
        terminal.
    """
    episode = []
    state = grid_world.start_state

    while state != terminal_state:
        # Guard against environments where the terminal state cannot be
        # reached: stop once the caller-supplied step budget is used up.
        if max_steps is not None and len(episode) >= max_steps:
            break

        action = np.random.choice(grid_world.num_actions)  # Random action
        next_state, reward = grid_world.step(state, action)
        episode.append((state, action, reward))
        state = next_state

    return episode

def monte_carlo(grid_world, num_episodes, gamma=1.0, episode_fn=None):
    """First-visit Monte Carlo policy evaluation of the state-value function.

    For every sampled episode the discounted return following the first
    visit to each state is computed, and the value of that state is the
    average of those returns over all episodes seen so far.

    Args:
        grid_world: Environment; ``grid_world.grid_size`` gives the shape
            of the returned value table.
        num_episodes: Number of episodes to sample.
        gamma: Discount factor applied to future rewards.
        episode_fn: Optional callable ``episode_fn(grid_world)`` returning
            a list of ``(state, action, reward)`` tuples. Defaults to
            ``generate_episode`` (the random policy).

    Returns:
        A NumPy array of shape ``grid_world.grid_size`` holding the
        estimated value of each state. States that were never visited
        keep the initial value 0.
    """
    if episode_fn is None:
        episode_fn = generate_episode

    V = np.zeros(grid_world.grid_size)  # Initialize state-value function
    return_sums = {}  # Sum of first-visit returns per state
    visit_counts = {}  # Number of first-visit returns per state

    for _ in range(num_episodes):
        episode_data = episode_fn(grid_world)

        # Record the first time step at which each state occurs. One
        # forward pass replaces a scan of the episode prefix at every step.
        first_visit = {}
        for t, (state, _action, _reward) in enumerate(episode_data):
            first_visit.setdefault(state, t)

        G = 0.0  # Return accumulated from the end of the episode

        # Traverse the episode in reverse so G can be built incrementally.
        for t in reversed(range(len(episode_data))):
            state, _, reward = episode_data[t]
            G = gamma * G + reward  # Discounted return from step t

            # Only the return following the first visit is used.
            if first_visit[state] == t:
                return_sums[state] = return_sums.get(state, 0.0) + G
                visit_counts[state] = visit_counts.get(state, 0) + 1
                V[state] = return_sums[state] / visit_counts[state]

    return V

# Build the environment and evaluate the random policy on it.
grid_world = GridWorld()
num_episodes = 1000  # Number of sampled episodes used for the estimate
V = monte_carlo(grid_world, num_episodes=num_episodes)

# Show the resulting value table under its heading.
print("Estimated State-Value Function:", V, sep="\n")

Estimated State-Value Function:
[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 0.]]
