# **Suryakanta Karan (M22AIE207) m22aie207@iitj.ac.in**


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Define the GridWorld environment
class GridWorld:
    """Deterministic square grid world with one goal cell and one penalty cell.

    States are ``(row, col)`` tuples. Row indices grow towards the south and
    column indices grow towards the east. The cell ``(4, 4)`` holds a reward
    of +5 and the cell ``(1, 3)`` holds a reward of -5; both are terminal.
    Every other cell holds a reward of 0. Trying to leave the grid yields a
    reward of -1 and leaves the agent where it was.
    """

    def __init__(self, size=5):
        """Build a ``size`` x ``size`` grid with the two terminal cells set.

        Args:
            size: Side length of the grid. The terminal cells are hard-coded
                at (4, 4) and (1, 3), so ``size`` must be at least 5.
        """
        self.size = size
        self.grid = np.zeros((size, size))
        self.grid[4, 4] = 5  # Goal cell (terminal, positive reward)
        self.grid[1, 3] = -5  # Penalty cell (terminal, negative reward)
        self.actions = ['N', 'S', 'E', 'W']
        self.q_values = np.zeros((size, size, len(self.actions)))

    def _in_bounds(self, state):
        """Return True when ``state`` lies inside the grid."""
        return 0 <= state[0] < self.size and 0 <= state[1] < self.size

    def get_reward(self, state):
        """Return the reward for arriving at ``state``.

        Off-grid positions yield -1; otherwise the value stored in the grid
        for that cell is returned.
        """
        if not self._in_bounds(state):
            return -1
        return self.grid[state[0], state[1]]

    def is_terminal(self, state):
        """Return True when ``state`` is the goal or the penalty cell."""
        return self.get_reward(state) in [5, -5]

    def step(self, state, action):
        """Apply ``action`` in ``state`` and return ``(next_state, reward)``.

        Args:
            state: Current ``(row, col)`` position.
            action: One of ``'N'``, ``'S'``, ``'E'``, ``'W'``.

        Returns:
            A tuple ``(next_state, reward)``. A move that would leave the
            grid keeps the agent in ``state`` and yields -1. A move onto a
            terminal cell *does* enter that cell, so callers that loop until
            ``is_terminal(state)`` can observe the end of the episode.

        Raises:
            ValueError: If ``action`` is not one of the four known actions.
        """
        if action == 'N':
            next_state = (state[0] - 1, state[1])
        elif action == 'S':
            next_state = (state[0] + 1, state[1])
        elif action == 'E':
            next_state = (state[0], state[1] + 1)
        elif action == 'W':
            next_state = (state[0], state[1] - 1)
        else:
            raise ValueError(f"Unknown action: {action!r}")
        reward = self.get_reward(next_state)
        # Only off-grid moves bounce back. Previously terminal cells bounced
        # back as well, which made them unreachable and episodes endless.
        if not self._in_bounds(next_state):
            next_state = state
        return next_state, reward

# Initialize GridWorld
# Module-level environment instance (default 5x5 grid). choose_action and
# update_q_values below read its `actions` list through this global name.
env = GridWorld()

# Helper functions
def choose_action(state, q_values, epsilon):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    With probability ``epsilon`` a uniformly random action from the global
    ``env.actions`` is returned. Otherwise the action with the highest
    Q-value at ``state`` is returned (``np.argmax`` picks the first on ties).
    """
    explore = np.random.rand() < epsilon
    if explore:
        return np.random.choice(env.actions)
    # Greedy branch: index of the largest Q-value for this cell.
    greedy_index = np.argmax(q_values[state[0], state[1], :])
    return env.actions[greedy_index]

def update_q_values(state, action, reward, next_state, q_values, alpha, gamma):
    """Apply one Q-learning update to ``q_values`` in place.

    The entry for ``(state, action)`` is moved towards
    ``reward + gamma * max_a Q(next_state, a)`` by a fraction ``alpha``.
    The action's index is looked up in the global ``env.actions``.
    """
    a = env.actions.index(action)
    # Value of the best action available from the successor state.
    bootstrap = np.max(q_values[next_state[0], next_state[1], :])
    row, col = state[0], state[1]
    td_error = reward + gamma * bootstrap - q_values[row, col, a]
    q_values[row, col, a] += alpha * td_error

# Training the Q-Learning Agent
def train_q_learning(env, alpha=0.1, gamma=0.9, epsilon=0.1, episodes=100000):
    """Train a tabular Q-learning agent on ``env``.

    Each episode starts at ``(0, 0)`` and runs until ``env.is_terminal``
    reports the current state as terminal.

    Args:
        env: Environment providing ``size``, ``actions``, ``step`` and
            ``is_terminal``.
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Exploration probability passed to ``choose_action``.
        episodes: Number of episodes to run.

    Returns:
        ``(q_values, steps_to_goal)``: the learned Q-table of shape
        ``(size, size, n_actions)`` and a list holding the number of steps
        taken in each episode.
    """
    n_actions = len(env.actions)
    q_values = np.zeros((env.size, env.size, n_actions))
    steps_to_goal = []

    for _ in range(episodes):
        current, n_steps = (0, 0), 0  # every episode starts in the top-left cell
        while not env.is_terminal(current):
            chosen = choose_action(current, q_values, epsilon)
            successor, reward = env.step(current, chosen)
            update_q_values(current, chosen, reward, successor, q_values, alpha, gamma)
            current, n_steps = successor, n_steps + 1
        steps_to_goal.append(n_steps)

    return q_values, steps_to_goal

# Plotting Functions
def plot_policy(q_values, env):
    """Show a heatmap of the best Q-value per cell, annotated with the greedy action.

    Args:
        q_values: Q-table indexed as ``[row, col, action]``.
        env: Environment providing ``size`` and the ``actions`` labels.
    """
    n = env.size
    policy = np.zeros((n, n), dtype=str)
    # Visit cells in row-major order and record the greedy action label.
    for row, col in np.ndindex(n, n):
        policy[row, col] = env.actions[np.argmax(q_values[row, col, :])]

    state_values = np.max(q_values, axis=2)
    sns.heatmap(state_values, annot=policy, fmt='', cmap='coolwarm')
    plt.title('Policy')
    plt.show()

def plot_steps(steps, gamma, epsilon):
    """Plot steps per episode on a log-scaled episode axis.

    Args:
        steps: Sequence with the number of steps taken in each episode.
        gamma: Discount factor, shown in the legend.
        epsilon: Exploration probability, shown in the legend.
    """
    _, ax = plt.subplots()  # fresh figure with a single axes
    ax.plot(steps, label=f'Gamma: {gamma}, Epsilon: {epsilon}')
    ax.set_xscale('log')
    ax.set_xlabel('Episodes')
    ax.set_ylabel('Steps to Goal')
    ax.legend()
    plt.show()

# Experiment sweep: for every discount factor, show the policy learned with
# epsilon = 0.1, then the learning curve for each exploration rate.
gammas = [0.1, 0.5, 0.9]
epsilons = [0.1, 0.3, 0.5]

for gamma in gammas:
    # Policy heatmap comes from a dedicated run with epsilon fixed at 0.1.
    q_values, steps = train_q_learning(env, epsilon=0.1, gamma=gamma)
    plot_policy(q_values, env)

    # One fresh training run and one steps-per-episode plot per epsilon.
    for epsilon in epsilons:
        q_values, steps = train_q_learning(env, epsilon=epsilon, gamma=gamma)
        plot_steps(steps, gamma, epsilon)
