Suhanee Kandalkar D16AD 30

# Experiment 9
Implement and analyze the TD(0) Temporal Difference (TD) learning algorithm for policy evaluation in a grid-world environment.

In [None]:
import numpy as np

# --- Environment: 4x4 grid world ---
rows = 4
cols = 4
terminal = (3, 3)  # (row, col) of the terminal cell

# Action encoding: 0 = up, 1 = right, 2 = down, 3 = left
actions = [0, 1, 2, 3]

# --- TD(0) hyperparameters ---
alpha = 0.1  # Learning rate (step size of each value update)
gamma = 1.0  # Discount factor (1.0 means undiscounted returns)
num_episodes = 10000  # Number of episodes to sample

# State-value table with one entry per grid cell, all starting at zero.
V = np.zeros((rows, cols))
V[terminal] = 0  # Stated explicitly; np.zeros has already set it to 0

# Uniform random policy: policy[row, col, a] is the probability of action a.
# The probability is derived from the number of actions (rather than a
# hard-coded 0.25) so the distribution still sums to 1 if the action set
# changes; np.random.choice rejects a `p` that does not sum to 1.
policy = np.full((rows, cols, len(actions)), 1.0 / len(actions))

def take_action(s, action):
    """Return the state reached from ``s`` by taking ``action``.

    The agent moves one cell in the chosen direction. A move that would
    leave the grid is clamped, so the agent stays on the edge cell.

    Args:
        s: Current state as a ``(row, col)`` pair.
        action: Action code: 0 = up, 1 = right, 2 = down, 3 = left.

    Returns:
        The next state as a ``(row, col)`` tuple.

    Raises:
        ValueError: If ``action`` is not one of the four action codes.
    """
    row, col = s
    if action == 0:  # Up
        return (max(row - 1, 0), col)
    if action == 1:  # Right
        return (row, min(col + 1, cols - 1))
    if action == 2:  # Down
        return (min(row + 1, rows - 1), col)
    if action == 3:  # Left
        return (row, max(col - 1, 0))
    # Fail loudly here; otherwise an unknown code would surface as an
    # opaque UnboundLocalError when building the return value.
    raise ValueError(f"Unknown action: {action!r}")

# TD(0) policy evaluation under the fixed policy
for _ in range(num_episodes):
    # Rejection-sample a start cell until it is not the terminal cell
    while True:
        state = (np.random.randint(0, rows), np.random.randint(0, cols))
        if state != terminal:
            break

    # Step through the episode until the terminal cell is reached
    while state != terminal:
        # Draw an action from the policy's distribution for this cell
        move = np.random.choice(actions, p=policy[state])
        successor = take_action(state, move)

        # Every step costs -1 except the one that enters the terminal cell
        step_reward = -1 if successor != terminal else 0

        # TD(0): move V(s) toward the bootstrapped target r + gamma * V(s')
        V[state] += alpha * (step_reward + gamma * V[successor] - V[state])

        state = successor

# Show the learned state values, rounded to one decimal place
print("Learned Value Function (V):")
print(np.round(V, decimals=1))

Learned Value Function (V):
[[-57.3 -54.3 -52.1 -50. ]
 [-55.9 -51.  -51.2 -44.3]
 [-51.2 -47.9 -37.2 -26.8]
 [-50.3 -43.1 -22.    0. ]]
