##  *Develop an agent that can achieve a specific goal and sub-goal in a maze 10X10 based on RL algorithm (Q-learning).*

In [234]:
# Import the necessary libraries to create the maze and give our agent the ability to randomly making choices

import numpy as np
import random

####  Setting up the environment

In [235]:
# Implemetation of the maze 10X10 as indicated in the assignment. 

# 1 represents the walls (unpassable)
# 0 represents an open path (passable)
# S marks the starting point of the agent
# G marks the sub-goal the agent should reach before continuing
# E marks the end goal

maze = np.array([[1,1,1,1,1,1,1,1,1,1],
                 [1,'S',1,0,0,0,1,0,0,1],
                 [1,0,1,0,1,0,1,0,1,1],
                 [1,0,0,0,1,0,0,0,0,1],
                 [1,1,1,0,1,1,1,1,0,1],
                 [1,0,1,'G',0,0,0,1,0,1],
                 [1,0,0,0,1,1,0,0,0,1],
                 [1,1,1,0,1,0,1,1,0,1],
                 [1,0,0,0,0,0,1,'E',0,1],
                 [1,1,1,1,1,1,1,1,1,1]], dtype=object)



# Implementatio of the Q-Table
# Considering the maze above, and the Q-learning algorithm, we need to initiate
# the Q-table. The Q-table include all possible states-actions pairs and their corresponing Q-values.

q_table = np.zeros((10,10,4))   # each states have 4 possible actions (up, down, left, right)
actions = ['up', 'down', 'left', 'right']

# Implementation of the agent's starting position in the maze, the sub-goal and the end-goal

start_position = (1,1)
sub_goal = (5,3)
end_goal = (8,7)


# Implementation of the reward system

reward_sub_goal = 10
reward_end_goal = 100
penalty = -1    

# Implementation of the agent's interactions with the environment, the way the agent moves in the maze
# and the way penalties and rewards are given to the agent.

def moves(state, actions):
    row, col = state

    if actions == 'up':
        next_state = (row - 1, col)
    elif actions == 'down':
        next_state = (row + 1, col)
    elif actions == 'left':
        next_state = (row, col - 1)
    elif actions == 'right':
        next_state = (row, col + 1)
    else:
        next_state = state

# Check if the next state is a wall or out of the maze
    if 0 <= next_state[0] < 10 and 0 <= next_state[1] < 10:
        if maze[next_state] != 1:
            if next_state == sub_goal:
                return next_state, reward_sub_goal, False   #Sub-goal reached 
            elif next_state == end_goal:
                return next_state, reward_end_goal, True    #End-goal reached
            else:
                return next_state, penalty, False           #No goal reached, keep searching
        else:
            return state, penalty, False                    #Hit a wall, stay in the same state
    else:
        return state, penalty, False                        #Out of the maze, stay in the same state



# Implementation of the starting funtion, in order to restart the agent's position in the maze
# at the end of each episode.

def restarting():
    return start_position


# Implementation of the selection of the agent's action.
# The agent will randomly choose the actions to take in the Q-table, based on the epsilon-greedy policy.
# The action with the highest Q-value will be chosen 

def select_action(state, epsilon):
    if random.uniform(0, 1) < epsilon:
        return np.random.randint(len(actions))
    else:
        return np.argmax(q_table[state[0], state[1]])

#### Hyperparameters

In [236]:
alpha = 0.1  # Learning rate
gamma = 0.9  # Discount factor
epsilon = 1.0  # Initial exploration rate. 
epsilon_variation = 0.99  # This parameter will decrease the epsilon value over time, in order to make the agent more greedy
min_epsilon = 0.01  # Minimum exploration rate reached by the agent
episodes = 1000 # Number of episodes the agent will run
steps = 100  # Number of steps the agent will take in each episode

#### Implementation of the Q-learning algorithm

In [237]:
# Implementation of the Q-learning algorithm based on the mathematical formula
# Q(s,a) = Q(s,a) + alpha * (reward + gamma * max(Q(s',a')) - Q(s,a))

def q_value(state, action_index, reward, next_state):
    
    max_q_value = np.max(q_table[next_state[0], next_state[1]])
    actual_q_value = q_table[state[0], state[1], action_index]
    new_q_value = actual_q_value + alpha * (reward + gamma * max_q_value - actual_q_value)

    q_table[state[0], state[1], action_index] = new_q_value

#### Implementation of the training process

In [238]:
for episode in range(episodes):
    state = restarting()
    done = False
    total_reward = 0

    #if episode < 5:  # Print debug information for the first few episodes
    #    print(f"\nStarting Episode {episode + 1}")

    for step in range(steps):
        action_index = select_action(state, epsilon)
        action = actions[action_index]

        # Debug: Print selected action and state
    #    if episode < 5:
    #        print(f"Step {step + 1}: State: {state}, Action: {action}")

        next_state, reward, done = moves(state, action)

        # Debug: Print next state, reward, and done flag
    #    if episode < 5:
    #        print(f"Next State: {next_state}, Reward: {reward}, Done: {done}")

        # Update Q-value
        max_future_q = np.max(q_table[next_state[0], next_state[1]])
        current_q = q_table[state[0], state[1], action_index]
        q_table[state[0], state[1], action_index] = current_q + alpha * (reward + gamma * max_future_q - current_q)

        state = next_state
        total_reward += reward

        if done:
    #        if episode < 5:
    #            print(f"Episode {episode + 1} completed after {step + 1} steps with total reward {total_reward}")
           break

    # Decay epsilon
    epsilon = max(min_epsilon, epsilon * epsilon_variation)

    # Print progress every 100 episodes
    if (episode + 1) % 100 == 0:
        print(f'Episode {episode + 1}/{episodes}, Total reward: {total_reward}')


Episode 100/1000, Total reward: 307
Episode 200/1000, Total reward: 373
Episode 300/1000, Total reward: 417
Episode 400/1000, Total reward: 417
Episode 500/1000, Total reward: 428
Episode 600/1000, Total reward: 417
Episode 700/1000, Total reward: 428
Episode 800/1000, Total reward: 428
Episode 900/1000, Total reward: 428
Episode 1000/1000, Total reward: 428
