# In [None]:
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, Markdown

# Task 1: Gridworld Environment


# Board layout: a square GRID_SIZE x GRID_SIZE grid addressed as (row, col).
GRID_SIZE = 5
START = (0, 0)
GOAL = (4, 4)

# Cells the agent can never enter: rows 1-3 of the first column.
OBSTACLES = {(row, 0) for row in (1, 2, 3)}

# (row, col) offset applied by each action; DOWN increases the row index
# and RIGHT increases the column index.
ACTION_DELTA = dict(
    UP=(-1, 0),
    DOWN=(1, 0),
    LEFT=(0, -1),
    RIGHT=(0, 1),
)

# Action names in a fixed order; an action's position in this list is the
# integer index used for the action axis of the Q-table.
ACTIONS = list(ACTION_DELTA)

def is_valid(state):
    """Return True if ``state`` is an on-grid cell that is not an obstacle.

    ``state`` is a ``(row, col)`` pair. Both coordinates must lie in
    ``[0, GRID_SIZE)`` and the cell must not be a member of ``OBSTACLES``.
    """
    row, col = state
    row_off_grid = row < 0 or row >= GRID_SIZE
    col_off_grid = col < 0 or col >= GRID_SIZE
    if row_off_grid or col_off_grid:
        return False
    return (row, col) not in OBSTACLES

def step(state, action):
    """Advance the environment by one action.

    Args:
        state: Current ``(row, col)`` cell of the agent.
        action: A key of ``ACTION_DELTA`` naming the move to attempt.

    Returns:
        A ``(next_state, reward, done)`` tuple:

        * move blocked by an obstacle: ``(state, -10, False)``
        * move blocked by the grid boundary: ``(state, -1, False)``
        * move that lands on ``GOAL``: ``(GOAL, 100, True)``
        * any other move: ``(new_cell, -1, False)``

    Raises:
        KeyError: If ``action`` is not a key of ``ACTION_DELTA``.
    """
    row, col = state
    d_row, d_col = ACTION_DELTA[action]
    target = (row + d_row, col + d_col)

    if is_valid(target):
        if target == GOAL:
            return target, 100, True
        # Ordinary move: each step costs -1.
        return target, -1, False

    # Blocked move: the agent stays where it is. Bumping into an obstacle
    # is penalised more heavily than bumping into the grid boundary.
    if target in OBSTACLES:
        return state, -10, False
    return state, -1, False


# Task 2: Q-Learning

# Hyperparameters read by epsilon_greedy() and the training loop:
#   alpha    - step size of each Q-value update
#   gamma    - discount applied to the best next-state Q-value
#   epsilon  - probability of choosing a random action
#   episodes - number of training episodes to run
alpha, gamma, epsilon = 0.1, 0.99, 0.1
episodes = 1000

# Q-table indexed as Q[row, col, action_index]; every entry starts at zero.
Q = np.zeros(shape=(GRID_SIZE, GRID_SIZE, len(ACTIONS)), dtype=float)

def epsilon_greedy(state):
    """Choose an action index for ``state`` with the epsilon-greedy rule.

    With probability ``epsilon`` a random index into ``ACTIONS`` is drawn;
    otherwise the index of the largest Q-value stored for ``state`` is
    returned (``np.argmax`` returns the lowest index when values tie).
    """
    explore = np.random.rand() < epsilon
    if explore:
        return np.random.choice(len(ACTIONS))

    row, col = state
    return np.argmax(Q[row, col])

# Total (undiscounted) reward collected in each episode, in training order.
episode_rewards = []

for ep in range(episodes):
    # Each episode restarts from START and runs until step() reports done.
    # NOTE: there is no per-episode step cap, so the loop ends only when
    # the agent reaches the terminal state.
    state = START
    total_reward = 0
    done = False

    while not done:
        a = epsilon_greedy(state)
        next_state, reward, done = step(state, ACTIONS[a])

        row, col = state
        next_row, next_col = next_state

        # Q-learning update: move Q(s, a) a fraction alpha toward
        # reward + gamma * max over a' of Q(s', a').
        td_target = reward + gamma * np.max(Q[next_row, next_col])
        Q[row, col, a] += alpha * (td_target - Q[row, col, a])

        total_reward += reward
        state = next_state

    episode_rewards.append(total_reward)


# Task 3: Extract Optimal Policy

# Render the greedy policy (argmax over the learned Q-values) as a grid of
# one-character labels. np.full infers a one-character string dtype from
# the " " fill value, so every label written below must be exactly one
# character long.
policy = np.full((GRID_SIZE, GRID_SIZE), " ")

# Display glyph for each action name.
arrow_map = {
    "UP": "↑",
    "DOWN": "↓",
    "LEFT": "←",
    "RIGHT": "→"
}

for r in range(GRID_SIZE):
    for c in range(GRID_SIZE):

        if (r, c) in OBSTACLES:
            policy[r, c] = "X"
        elif (r, c) == GOAL:
            policy[r, c] = "G"
        elif (r, c) == START:
            # START is shown as a marker rather than an arrow, so no greedy
            # action needs to be looked up for it.
            policy[r, c] = "S"
        else:
            # Every other cell shows the arrow of its highest-valued action.
            best_a = np.argmax(Q[r, c])
            policy[r, c] = arrow_map[ACTIONS[best_a]]

display(Markdown("### **Optimal Policy**"))
# Print one grid row per line, cells separated by single spaces.
for row in policy:
    print(" ".join(row))


# Task 4: Plot Episode Rewards

# Learning curve: total reward of each episode, plotted in training order.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(episode_rewards)
ax.set_xlabel("Episode")
ax.set_ylabel("Total Reward")
ax.set_title("Q-Learning: Episode Rewards Over Time")
ax.grid(True)
plt.show()
