In [None]:
import numpy as np

# =============================
# (i)  Gridworld Wrapper
# =============================
class GridworldEnv:
    """
    Minimal gridworld with a "slip" transition model.

    States are (row, col) tuples on a grid of shape `grid_size`. Every action
    either moves the agent one cell in the chosen direction (clamped to the
    grid) with probability ``1 - slip_prob``, or leaves it where it is with
    probability ``slip_prob``. Landing on (or remaining on) `goal` yields
    reward 0 and ``done=True``; every other outcome yields reward -1.

    Note: the goal is not treated as absorbing by the transition model; asking
    for the distribution from the goal state still computes a regular move.

    Parameters
    ----------
    grid_size : sequence of two ints
        (n_rows, n_cols) of the grid.
    start, goal : sequence of two ints
        (row, col) of the start and goal cells. Lists are accepted and are
        stored as tuples so equality checks and array indexing behave.
    slip_prob : float
        Probability of staying in place instead of moving; must be in [0, 1].
    seed : int or None, optional
        When given, sampling in `step_sample` uses a private
        ``np.random.default_rng(seed)`` so runs are reproducible. When None
        (default), the global ``np.random`` state is used.

    Raises
    ------
    ValueError
        If `slip_prob` is outside [0, 1].
    """

    def __init__(self, grid_size=(4, 4), start=(0, 0), goal=(3, 3), slip_prob=0.05, seed=None):
        if not 0 <= slip_prob <= 1:
            raise ValueError("slip_prob must be in the range [0, 1]")

        # Coerce to tuples: a list never compares equal to a tuple, and a list
        # used as a numpy index selects whole rows instead of a single cell.
        self.grid_size = tuple(grid_size)
        self.start = tuple(start)
        self.goal = tuple(goal)
        self.state = self.start
        self.actions = ['up', 'down', 'left', 'right']
        # (row delta, col delta) for each action; row 0 is the top of the grid.
        self.action_dict = {
            'up': (-1, 0),
            'down': (1, 0),
            'left': (0, -1),
            'right': (0, 1)
        }
        self.slip_prob = slip_prob
        # Private generator only when a seed is supplied; otherwise fall back
        # to the global numpy RNG so existing np.random.seed() usage still works.
        self._rng = None if seed is None else np.random.default_rng(seed)

    def reset(self):
        """Move the agent back to the start cell and return that state."""
        self.state = self.start
        return self.state

    def _make_outcome(self, next_state, probability):
        """Build one outcome dict; reward/done depend only on whether next_state is the goal."""
        at_goal = next_state == tuple(self.goal)
        return {
            'next_state': next_state,
            'reward': 0 if at_goal else -1,
            'done': at_goal,
            'probability': probability
        }

    def get_transition_distribution(self, state, action):
        """
        Return the probability distribution over outcomes of taking `action` in `state`.

        The result is a two-element list of dicts, each with keys:
          - 'next_state': the (row, col) tuple after the transition
          - 'reward': 0 if next_state is the goal, otherwise -1
          - 'done': True if next_state is the goal
          - 'probability': the probability of this outcome

        Order is fixed: first the intended move (probability 1 - slip_prob),
        then the slip outcome of staying in `state` (probability slip_prob).
        A move that would leave the grid is clamped to the boundary, so both
        outcomes can share the same next_state.

        Raises ValueError if `action` is not one of `self.actions`.
        This method does not modify the environment's current state.
        """
        if action not in self.actions:
            raise ValueError("Invalid action!")

        # Accept list-like states while keeping comparisons with the goal valid.
        state = tuple(state)

        d_row, d_col = self.action_dict[action]
        # Clamp each coordinate to the grid boundaries.
        intended_state = (
            max(0, min(state[0] + d_row, self.grid_size[0] - 1)),
            max(0, min(state[1] + d_col, self.grid_size[1] - 1)),
        )

        return [
            self._make_outcome(intended_state, 1 - self.slip_prob),
            self._make_outcome(state, self.slip_prob),
        ]

    def step_sample(self, action):
        """
        Sample one outcome for `action` from the current state and apply it.

        Updates `self.state` to the sampled next state and returns the tuple
        (next_state, reward, done).
        """
        outcomes = self.get_transition_distribution(self.state, action)
        probs = [outcome['probability'] for outcome in outcomes]
        sampler = np.random if self._rng is None else self._rng
        chosen_outcome = outcomes[sampler.choice(len(outcomes), p=probs)]
        self.state = chosen_outcome['next_state']
        return chosen_outcome['next_state'], chosen_outcome['reward'], chosen_outcome['done']

    def render(self):
        """Print the grid: '.' for empty cells, 'G' for the goal, 'S' for the agent (drawn last)."""
        grid = np.full(self.grid_size, '.')
        # tuple(...) guarantees single-cell indexing even if an attribute was
        # reassigned to a list after construction.
        grid[tuple(self.goal)] = 'G'
        grid[tuple(self.state)] = 'S'
        for row in grid:
            print(" ".join(row))



In [None]:


def main():
    """Demo of the GridworldEnv API: reset, inspect a transition distribution, sample a step, render."""
    env = GridworldEnv()
    action = 'right'

    # Reset and report where the agent begins.
    print("Resetting environment:")
    initial_state = env.reset()
    print("Initial state:", initial_state)

    # Show every possible outcome of the chosen action from the initial state.
    print(f"\nTransition distribution for action '{action}' from state {initial_state}:")
    for entry in env.get_transition_distribution(initial_state, action):
        print(entry)

    # Draw a single outcome; this also advances the environment's state.
    print(f"\nUsing step_sample() for action '{action}':")
    sampled = env.step_sample(action)
    print("Sampled outcome -> Next state:", sampled[0], "Reward:", sampled[1], "Done:", sampled[2])

    # Display the grid after the sampled step.
    print("\nRendering the gridworld:")
    env.render()

# Run the demo only when this code executes as the main module (e.g. a notebook
# cell or a script run directly), not when it is imported from elsewhere.
if __name__ == "__main__":
    main()


Resetting environment:
Initial state: (0, 0)

Transition distribution for action 'right' from state (0, 0):
{'next_state': (0, 1), 'reward': -1, 'done': False, 'probability': 0.95}
{'next_state': (0, 0), 'reward': -1, 'done': False, 'probability': 0.05}

Using step_sample() for action 'right':
Sampled outcome -> Next state: (0, 1) Reward: -1 Done: False

Rendering the gridworld:
. S . .
. . . .
. . . .
. . . G
