**NAME** : Pratik Vijay Bhagyawant\
**CLASS** : BE-AIDS\
**ROLL No** : 2547008\
**Batch** : A\
**SUB** : Computer Laboratory I (Machine Learning)

**Problem Statement**: Implement Reinforcement Learning using an example of a maze environment that the
agent needs to explore.



In [None]:
import numpy as np
import random

# Test the agent
# NOTE: this cell uses max_steps, actions, q_table and get_next_state, which
# are defined in the cells below; run it only after those cells (including
# the training loop) have been executed.
state = (0, 0)
steps = 0
print("\nOptimal Path:")
while state != (4, 4) and steps < max_steps:
    # Evaluate the learned policy greedily: always take the action with the
    # highest Q-value. Using the epsilon-greedy selector here would mix
    # random exploratory moves into the path reported as "optimal".
    action = actions[np.argmax(q_table[state[0], state[1]])]
    next_state = get_next_state(state, action)
    print(f"State: {state}, Action: {action}")
    state = next_state
    steps += 1

# The loop ends either at the goal cell or when the step budget is exhausted.
if state == (4, 4):
    print(f"Reached the goal in {steps} steps!")
else:
    print("Failed to reach the goal.")

In [None]:
# Maze Environment
# Each entry is the reward handed out for entering that cell:
# 0 for a plain cell, -10 for a penalty cell, 100 for the goal.
maze = [
    [0, 0, 0, -10, 0],
    [0, -10, 0, -10, 0],
    [0, -10, 0, 0, 0],
    [0, 0, 0, -10, 0],
    [0, 0, 0, 0, 100]  # Goal state with reward +100
]

# Parameters
alpha = 0.1      # Learning rate
gamma = 0.9      # Discount factor
epsilon = 1.0    # Exploration rate
epsilon_decay = 0.99
min_epsilon = 0.01
episodes = 500   # Number of training episodes
max_steps = 100  # Max steps per episode

# Actions mapping: action name -> (row delta, column delta)
action_dict = {
    'up': (-1, 0),
    'down': (1, 0),
    'left': (0, -1),
    'right': (0, 1)
}
# Derive the ordered action list from the mapping so the two cannot drift
# apart (dicts preserve insertion order: up, down, left, right).
actions = list(action_dict)

# Q-table initialization: one value per (row, column, action). The shape is
# taken from the maze and the action list instead of being hard-coded, so
# changing either keeps the table consistent.
q_table = np.zeros((len(maze), len(maze[0]), len(actions)))


In [None]:
# Helper functions
def get_next_state(state, action):
    """Return the cell reached by applying `action` from `state`.

    `state` is a (row, col) pair and `action` is a key of `action_dict`.
    If the move would leave the maze grid, the agent stays put and the
    original `state` is returned unchanged.
    """
    row, col = state
    d_row, d_col = action_dict[action]
    target_row = row + d_row
    target_col = col + d_col

    # Accept the move only when the target lies inside the grid.
    if 0 <= target_row < len(maze) and 0 <= target_col < len(maze[0]):
        return (target_row, target_col)
    return state

def get_reward(state):
    """Look up the reward stored in the maze cell at `state` (row, col)."""
    r, c = state
    cell_reward = maze[r][c]
    return cell_reward

def select_action(state):
    """Pick an action name for `state` using the epsilon-greedy rule.

    With probability given by the module-level `epsilon` a uniformly random
    action is returned; otherwise the action whose Q-value is largest for
    this cell is returned.
    """
    exploring = random.uniform(0, 1) < epsilon
    if exploring:
        return random.choice(actions)

    # Exploit: index of the highest Q-value among this cell's actions.
    best_index = np.argmax(q_table[state[0], state[1]])
    return actions[best_index]

In [None]:
# Training
# Tabular Q-learning: every episode starts in the top-left cell and runs
# until the goal reward is collected or max_steps moves have been made.
for episode in range(episodes):
    state = (0, 0)  # Start position
    total_reward = 0

    for step in range(max_steps):
        # Act, observe where we land and what reward that cell gives.
        action = select_action(state)
        next_state = get_next_state(state, action)
        reward = get_reward(next_state)
        total_reward += reward

        # Q-learning update: move the old estimate toward
        # reward + discounted best value of the next cell.
        action_index = actions.index(action)
        cell = (state[0], state[1], action_index)
        old_value = q_table[cell]
        best_next = np.max(q_table[next_state[0], next_state[1]])
        q_table[cell] = old_value + alpha * (
            reward + gamma * best_next - old_value)

        state = next_state

        # Collecting the goal reward ends the episode.
        if reward == 100:
            break

    # Decay epsilon, never letting it fall below the floor.
    epsilon = epsilon * epsilon_decay
    if epsilon < min_epsilon:
        epsilon = min_epsilon

    # Optional: print progress every 100 episodes.
    if not episode % 100:
        print(f"Episode {episode}, Total Reward: {total_reward}")