**Aim:**
*To implement a Python program that builds a simple grid-world environment and trains an agent in it using basic Q-learning.*

In [None]:
import numpy as np

class GridWorld:
    """A rows x cols grid with a start cell, a goal cell and blocked cells.

    States are ``(row, col)`` tuples and actions are ``(d_row, d_col)``
    offsets that are added to the current state.
    """

    def __init__(self, rows, cols, start, goal, obstacles):
        """Store the grid layout and place the agent on ``start``.

        Args:
            rows: Number of rows; valid row indices are ``0 .. rows - 1``.
            cols: Number of columns; valid column indices are ``0 .. cols - 1``.
            start: ``(row, col)`` cell the agent occupies after a reset.
            goal: ``(row, col)`` cell that ends the episode with reward 1.
            obstacles: Collection of ``(row, col)`` cells that cannot be entered.
        """
        self.rows = rows
        self.cols = cols
        self.start = start
        self.goal = goal
        self.obstacles = obstacles
        self.state = start  # cell the agent currently occupies
        self.is_terminal = False  # True once the goal has been reached

    def reset(self):
        """Put the agent back on the start cell and clear the terminal flag.

        Returns:
            The start state, for callers that want it directly.
        """
        self.state = self.start
        self.is_terminal = False
        return self.state

    def _is_blocked(self, cell):
        """Return True when ``cell`` is an obstacle or lies outside the grid."""
        row, col = cell[0], cell[1]
        inside = 0 <= row < self.rows and 0 <= col < self.cols
        return cell in self.obstacles or not inside

    def step(self, action):
        """Apply ``action`` and report the resulting transition.

        Args:
            action: ``(d_row, d_col)`` offset added to the current state.

        Returns:
            A ``(state, reward, is_terminal)`` tuple where ``state`` is the
            cell the agent occupies *after* the step. Reaching the goal gives
            reward 1 and ends the episode; hitting an obstacle or leaving the
            grid gives reward -1 and leaves the agent where it was; any other
            move gives reward 0.
        """
        target = (self.state[0] + action[0], self.state[1] + action[1])

        # The goal is tested first so it wins over the blocked-cell check.
        if target == self.goal:
            reward = 1
            self.state = target
            self.is_terminal = True
        elif self._is_blocked(target):
            # Rejected move: the agent does not leave its current cell, so
            # the reported state must be that cell, not the attempted one.
            reward = -1
            self.is_terminal = False
        else:
            reward = 0
            self.state = target
            self.is_terminal = False

        return self.state, reward, self.is_terminal

In [None]:
class QLearningAgent:
    """Epsilon-greedy agent backed by a dictionary Q-table.

    The table maps ``(state, action)`` pairs to values; pairs that were
    never written read back as 0.
    """

    def __init__(self, actions, alpha=0.1, gamma=0.9, epsilon=0.1):
        """Remember the action set and the hyper-parameters.

        Args:
            actions: Sequence of actions the agent can pick from.
            alpha: Stored as ``self.alpha`` (read by the training loop).
            gamma: Stored as ``self.gamma`` (read by the training loop).
            epsilon: Probability threshold for taking a random action.
        """
        self.actions, self.alpha = actions, alpha
        self.gamma, self.epsilon = gamma, epsilon
        self.q_values = {}

    def choose_action(self, state):
        """Return a random action with probability epsilon, else a greedy one.

        Ties between equally valued actions resolve to the earliest entry of
        ``self.actions`` because ``np.argmax`` returns the first maximum.
        """
        exploit = not (np.random.uniform(0, 1) < self.epsilon)
        if exploit:
            scores = [self.get_q_value((state, candidate)) for candidate in self.actions]
            return self.actions[np.argmax(scores)]

        pick = np.random.choice(len(self.actions))
        return self.actions[pick]

    def get_q_value(self, sa_pair):
        """Look up the value of a ``(state, action)`` pair, 0 when unseen."""
        try:
            return self.q_values[sa_pair]
        except KeyError:
            return 0

    def update_q_value(self, sa_pair, new_q_value):
        """Overwrite the stored value of a ``(state, action)`` pair."""
        self.q_values[sa_pair] = new_q_value

    def print_q_values(self):
        """Print every stored table entry, one per line, under a header."""
        print("Q-values Table:")
        for (state, action), value in self.q_values.items():
            print(f"State: {state}, Action: {action}, Q-value: {value}")

In [None]:
def train_q_learning(agent, environment, episodes):
    """Run tabular Q-learning for a fixed number of episodes.

    Each episode resets the environment and repeats choose-action / step /
    update until the environment reports a terminal state.

    Args:
        agent: Object providing ``actions``, ``alpha``, ``gamma``,
            ``choose_action(state)``, ``get_q_value((state, action))`` and
            ``update_q_value((state, action), value)``.
        environment: Object providing ``reset()``, ``state``, ``is_terminal``
            and ``step(action)`` returning ``(next_state, reward, is_terminal)``.
        episodes: Number of episodes to run.
    """
    for _ in range(episodes):
        environment.reset()
        state = environment.state

        while not environment.is_terminal:
            action = agent.choose_action(state)
            next_state, reward, is_terminal = environment.step(action)

            if is_terminal:
                # Nothing follows a terminal transition, so the target is the
                # reward alone regardless of what the table holds for it.
                future_value = 0
            else:
                # Take the max over values only; the actions themselves never
                # need to be comparable.
                future_value = max(
                    agent.get_q_value((next_state, a)) for a in agent.actions
                )

            current = agent.get_q_value((state, action))
            new_q_value = (1 - agent.alpha) * current + agent.alpha * (
                reward + agent.gamma * future_value
            )
            agent.update_q_value((state, action), new_q_value)

            state = next_state


In [None]:
if __name__ == "__main__":
    # Grid layout: 4x4 board, start in the top-left corner, goal in the
    # bottom-right corner, three blocked cells.
    rows, cols = 4, 4
    start = (0, 0)
    goal = (3, 3)
    obstacles = [(1, 1), (2, 1), (2, 2)]

    environment = GridWorld(rows, cols, start, goal, obstacles)
    actions = [(0, 1), (0, -1), (1, 0), (-1, 0)]  # right, left, down, up

    agent = QLearningAgent(actions)

    episodes = 1000
    train_q_learning(agent, environment, episodes)

    # Print the learned Q-values
    agent.print_q_values()

    # Test the trained agent. Exploration is switched off for the rollout so
    # the reported path reflects the learned policy rather than random moves;
    # the training value is restored afterwards.
    max_steps = 20
    training_epsilon = agent.epsilon
    agent.epsilon = 0

    environment.reset()
    state = environment.state
    steps = 0

    while not environment.is_terminal and steps < max_steps:
        action = agent.choose_action(state)
        state, _, _ = environment.step(action)
        steps += 1

    agent.epsilon = training_epsilon

    # Only claim success when the episode actually ended on the goal; hitting
    # the step cap is reported as a failure.
    if environment.is_terminal:
        print(f"Agent reached the goal in {steps} steps.")
    else:
        print(f"Agent did not reach the goal within {max_steps} steps.")

Q-values Table:
State: (0, 0), Action: (0, 1), Q-value: 0.5904899999999987
State: (0, 1), Action: (0, 1), Q-value: 0.6560999999999991
State: (0, 2), Action: (0, 1), Q-value: 0.7289999999999993
State: (0, 3), Action: (-1, 0), Q-value: -0.5239723697585628
State: (-1, 3), Action: (0, 1), Q-value: -0.1
State: (0, 4), Action: (0, 1), Q-value: -0.1
State: (0, 4), Action: (0, -1), Q-value: 0.5788282442128364
State: (0, 3), Action: (0, 1), Q-value: -0.6004052720784738
State: (0, 3), Action: (0, -1), Q-value: 0.6002912982290319
State: (-1, 3), Action: (0, -1), Q-value: 0.6085684006093857
State: (0, 2), Action: (1, 0), Q-value: 0.557294410676785
State: (1, 2), Action: (0, 1), Q-value: 0.1538999999999999
State: (1, 3), Action: (0, 1), Q-value: -0.6712545367591679
State: (1, 4), Action: (0, 1), Q-value: -0.18061532878833764
State: (1, 4), Action: (0, -1), Q-value: 0.48213537995833816
State: (1, 3), Action: (0, -1), Q-value: 0.5472295540679285
State: (1, 2), Action: (0, -1), Q-value: -0.68439489287