In [1]:
import numpy as np
import operator
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
import numpy as np
import random

class GridWorld:
    """Small grid environment with two teleporting special states.

    States are ``(row, col)`` tuples. Row 0 is the top of the grid, so
    ``'north'`` decreases the row and ``'south'`` increases it; ``'east'``
    increases the column and ``'west'`` decreases it.

    Rewards:
        * Any action taken from a special state teleports the agent to that
          state's ``next_state`` and yields its ``reward``.
        * A move that would leave the grid keeps the agent in place and
          yields -1.
        * Every other move yields 0.
    """

    # (row, col) offset applied by each action.
    _MOVES = {
        'north': (-1, 0),
        'south': (1, 0),
        'east': (0, 1),
        'west': (0, -1),
    }

    def __init__(self):
        self.height = 5
        self.width = 5
        self.actions = ['north', 'south', 'east', 'west']

        self.special_states = {
            (0, 1): {'next_state': (4, 1), 'reward': 10},  # A to A′
            (0, 3): {'next_state': (2, 3), 'reward': 5}    # B to B′
        }

        self.reset()

    def reset(self):
        """Place the agent on a uniformly random cell and return that state."""
        # Bounds come from the grid dimensions so that the start state is
        # always on the grid, whatever height/width are set to.
        self.current_state = (
            random.randint(0, self.height - 1),
            random.randint(0, self.width - 1),
        )
        return self.current_state

    def get_available_actions(self):
        """Return the list of action names accepted by ``step``."""
        return self.actions

    def step(self, action):
        """Apply ``action`` from the current state.

        Args:
            action: One of 'north', 'south', 'east', 'west'.

        Returns:
            A ``(new_state, reward)`` tuple; ``self.current_state`` is
            updated to ``new_state``.

        Raises:
            ValueError: If ``action`` is not a recognised action. The check
                is done up front so that it also applies when the agent is
                on a special state.
        """
        if not isinstance(action, str) or action not in self._MOVES:
            raise ValueError("Invalid action")

        state = self.current_state

        # Special states (A or B) ignore the direction and teleport.
        special = self.special_states.get(state)
        if special is not None:
            self.current_state = special['next_state']
            return self.current_state, special['reward']

        row, col = state
        d_row, d_col = self._MOVES[action]
        target = (row + d_row, col + d_col)

        if 0 <= target[0] < self.height and 0 <= target[1] < self.width:
            new_state, reward = target, 0
        else:
            # Hitting the wall: the agent does not move and is penalised.
            new_state, reward = state, -1

        self.current_state = new_state
        return new_state, reward

    def render(self):
        """Print the grid with '.' for every cell and 'A' at the agent."""
        grid = np.full((self.height, self.width), '.')
        grid[self.current_state] = 'A'
        print(grid)


In [10]:
from typing import Dict, Tuple, List

class QTable:
    """Tabular action-value store keyed by state, then by action name.

    ``q_table`` maps each state to a dict of ``{action: value}``; every
    entry starts at 0.0.
    """

    def __init__(self, states: List[Tuple[int, int]], actions: List[str]):
        # One independent {action: 0.0} row per state.
        self.q_table = {
            cell: {name: 0.0 for name in actions}
            for cell in states
        }

    def get_q_value(self, state: Tuple[int, int], action: str) -> float:
        """Return the stored value for taking ``action`` in ``state``."""
        row = self.q_table[state]
        return row[action]

    def get_max_q_value(self, state: Tuple[int, int]) -> float:
        """Return the largest stored value over all actions in ``state``."""
        row = self.q_table[state]
        return max(row.values())

    def get_best_action(self, state: Tuple[int, int]) -> str:
        """Return an action with the highest value in ``state``.

        When several actions share the highest value, one of them is picked
        with ``random.choice``.
        """
        row = self.q_table[state]
        top = max(row.values())
        candidates = [name for name in row if row[name] == top]
        return random.choice(candidates)

    def update(self, state: Tuple[int, int], action: str, reward: float, 
               next_state: Tuple[int, int], alpha: float, gamma: float):
        """Move the (state, action) value towards the one-step target.

        The target is ``reward + gamma * max_a Q(next_state, a)``; the stored
        value is shifted towards it by a fraction ``alpha``.
        """
        row = self.q_table[state]
        old_value = row[action]
        target = reward + gamma * max(self.q_table[next_state].values())
        row[action] = old_value + alpha * (target - old_value)