In [1]:
import numpy as np

class GridWorld:
    """Deterministic rectangular grid world with a constant step cost.

    States are ``(row, col)`` tuples with ``(0, 0)`` in the top-left corner.
    A move that would leave the grid keeps the agent on the boundary cell.
    Every step taken from a non-terminal state yields a reward of -1; once a
    terminal state has been reached, further steps leave the state unchanged
    and yield a reward of 0.
    """

    def __init__(self, rows, cols, terminal_states, start_state=(0, 0)):
        """Create the grid and place the agent on ``start_state``.

        Args:
            rows: Number of grid rows (must be >= 1).
            cols: Number of grid columns (must be >= 1).
            terminal_states: Iterable of ``(row, col)`` pairs that end an episode.
            start_state: ``(row, col)`` cell the agent occupies initially and
                after every ``reset()``. Defaults to the top-left corner.

        Raises:
            ValueError: If the grid is smaller than 1x1 or ``start_state`` lies
                outside the grid.
        """
        if rows < 1 or cols < 1:
            raise ValueError(f"Grid must be at least 1x1, got {rows}x{cols}")
        start_row, start_col = start_state
        if not (0 <= start_row < rows and 0 <= start_col < cols):
            raise ValueError(
                f"start_state {start_state!r} is outside the {rows}x{cols} grid"
            )

        self.rows = rows
        self.cols = cols
        # States are tuples, and (3, 3) != [3, 3]; normalising here ensures a
        # terminal state passed as a list is still recognised by `in` checks.
        self.terminal_states = [tuple(cell) for cell in terminal_states]
        self.actions = ['up', 'down', 'left', 'right']
        self.start_state = (start_row, start_col)
        self.state = self.start_state

    def step(self, action):
        """Apply ``action`` and return ``(new_state, reward)``.

        Args:
            action: One of the strings in ``self.actions``.

        Returns:
            A ``(state, reward)`` tuple. The reward is -1 for a step taken from
            a non-terminal state and 0 when the agent is already terminal (in
            which case the state does not change).

        Raises:
            ValueError: If ``action`` is not one of ``self.actions``.
        """
        # Reject typos explicitly instead of treating them as a costly no-op.
        if action not in self.actions:
            raise ValueError(
                f"Unknown action {action!r}; expected one of {self.actions}"
            )

        if self.state in self.terminal_states:
            return self.state, 0  # No reward after reaching terminal state

        row, col = self.state
        # Clamp each move so the agent stays inside the grid.
        if action == 'up':
            row = max(row - 1, 0)
        elif action == 'down':
            row = min(row + 1, self.rows - 1)
        elif action == 'left':
            col = max(col - 1, 0)
        elif action == 'right':
            col = min(col + 1, self.cols - 1)

        self.state = (row, col)
        reward = -1  # Each step incurs a reward of -1
        return self.state, reward

    def reset(self):
        """Move the agent back to the start state and return that state."""
        self.state = self.start_state
        return self.state


In [2]:
def initialize_value_function(rows, cols):
    """Return a ``rows`` x ``cols`` array of zeros, one value estimate per grid cell."""
    grid_shape = (rows, cols)
    return np.zeros(grid_shape)


In [3]:
def td_prediction(env, value_function, policy, alpha=0.1, gamma=1.0, episodes=1000):
    """Estimate state values for ``policy`` with the one-step TD update.

    Args:
        env: Environment exposing ``reset()`` (returns the initial state),
            ``step(action)`` (returns ``(next_state, reward)``) and a
            ``terminal_states`` container.
        value_function: Table of value estimates indexable by state; it is
            updated in place.
        policy: Callable mapping a state to an action.
        alpha: Step size applied to the TD error.
        gamma: Discount factor applied to the next state's value.
        episodes: Number of episodes to run.

    Returns:
        The same ``value_function`` object that was passed in, after updating.
    """
    for _episode in range(episodes):
        current = env.reset()
        while True:
            if current in env.terminal_states:
                break
            chosen_action = policy(current)
            successor, reward = env.step(chosen_action)
            # TD update: move V(s) towards the bootstrapped target r + gamma * V(s').
            td_target = reward + gamma * value_function[successor]
            td_error = td_target - value_function[current]
            value_function[current] += alpha * td_error
            current = successor
    return value_function


In [4]:
import random

def random_policy(state):
    """Pick one of the four moves uniformly at random; ``state`` is not consulted."""
    moves = ('up', 'down', 'left', 'right')
    return random.choice(moves)


In [5]:
# Define grid world dimensions and terminal states
rows, cols = 4, 4
terminal_states = [(3, 3)]  # Bottom-right corner as terminal state

# Seed the `random` module (the source of randomness in random_policy) so that
# re-running this cell reproduces the same estimates; the exact numbers depend
# on the chosen seed.
SEED = 0
random.seed(SEED)

# Initialize environment and value function
env = GridWorld(rows, cols, terminal_states)
value_function = initialize_value_function(rows, cols)

# Perform TD prediction (value_function is updated in place and also returned)
estimated_values = td_prediction(env, value_function, random_policy, alpha=0.1, episodes=1000)

# Display the estimated value function
print(np.round(estimated_values, 2))


[[-56.43 -53.55 -51.47 -50.15]
 [-54.36 -51.79 -42.28 -39.91]
 [-51.34 -47.43 -36.91 -20.38]
 [-48.9  -37.08 -20.44   0.  ]]
