#Prasad Jawale RL Experiment 6 - Value Iteration


In [None]:
import numpy as np

class GridWorldMDP:
    def __init__(self, num_rows, num_cols, terminal_states, rewards, gamma=0.9):
        self.num_rows = num_rows
        self.num_cols = num_cols
        self.terminal_states = terminal_states
        self.rewards = rewards
        self.gamma = gamma
        self.num_states = num_rows * num_cols
        self.actions = [(0, 1), (0, -1), (1, 0), (-1, 0)]  # right, left, down, up
        self.transition_prob = self._initialize_transition_prob()

    def _initialize_transition_prob(self):
        transition_prob = np.zeros((self.num_states, len(self.actions), self.num_states))
        for state in range(self.num_states):
            for action_index, action in enumerate(self.actions):
                next_state, _ = self._get_next_state(state, action)
                transition_prob[state, action_index, next_state] = 1
        return transition_prob

    def _get_next_state(self, state, action):
        row, col = divmod(state, self.num_cols)
        next_row = max(0, min(row + action[0], self.num_rows - 1))
        next_col = max(0, min(col + action[1], self.num_cols - 1))
        next_state = next_row * self.num_cols + next_col
        if (next_row, next_col) in self.terminal_states:
            return next_state, self.rewards[(next_row, next_col)]
        return next_state, 0

    def value_iteration(self, theta=0.0001):
        V = np.zeros(self.num_states)
        while True:
            delta = 0
            for state in range(self.num_states):
                v = V[state]
                action_values = []
                for action_index, action in enumerate(self.actions):
                    next_state, reward = self._get_next_state(state, action)
                    action_values.append(reward + self.gamma * V[next_state])
                V[state] = max(action_values)
                delta = max(delta, abs(v - V[state]))
            if delta < theta:
                break

        policy = np.zeros((self.num_states, len(self.actions)))
        for state in range(self.num_states):
            action_values = []
            for action_index, action in enumerate(self.actions):
                next_state, reward = self._get_next_state(state, action)
                action_values.append(reward + self.gamma * V[next_state])
            best_action = np.argmax(action_values)
            policy[state, best_action] = 1

        return policy, V



num_rows = 5
num_cols = 3
terminal_states = {(0, 0): 1, (4, 2): 1}
rewards = {(0, 0): 1, (4, 2): 1}
grid_world = GridWorldMDP(num_rows, num_cols, terminal_states, rewards)

# Value iteration
optimal_policy, optimal_values = grid_world.value_iteration()

# Displaying the optimal policy
actions_str = ['R', 'L', 'D', 'U']
for state in range(grid_world.num_states):
    row = state // num_cols
    col = state % num_cols
    print(f"State ({row}, {col}): Action {actions_str[np.argmax(optimal_policy[state])]} Optimal Value: {optimal_values[state]}")


State (0, 0): Action L Optimal Value: 9.999153585021714
State (0, 1): Action L Optimal Value: 9.999238226519543
State (0, 2): Action L Optimal Value: 8.999314403867588
State (1, 0): Action U Optimal Value: 9.999238226519543
State (1, 1): Action L Optimal Value: 8.999314403867588
State (1, 2): Action L Optimal Value: 8.09938296348083
State (2, 0): Action U Optimal Value: 8.999314403867588
State (2, 1): Action L Optimal Value: 8.09938296348083
State (2, 2): Action D Optimal Value: 8.999165052157794
State (3, 0): Action U Optimal Value: 8.09938296348083
State (3, 1): Action R Optimal Value: 8.999165052157794
State (3, 2): Action D Optimal Value: 9.999165052157794
State (4, 0): Action R Optimal Value: 8.999165052157794
State (4, 1): Action R Optimal Value: 9.999165052157794
State (4, 2): Action R Optimal Value: 9.999165052157794
