<a href="https://colab.research.google.com/github/OneFineStarstuff/OneFineStardust/blob/main/Implementing_Reinforcement_Learning_with_Q_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np

# Tabular Q-learning on a small grid world.
#
# The agent moves on a grid_size x grid_size board, earns a reward when it
# steps onto the bottom-right cell, and the episode ends there.  After
# training, q_table[row, col, action] holds the learned action values.

# Define the environment
grid_size = 5
num_actions = 4  # Up, Down, Left, Right
q_table = np.zeros((grid_size, grid_size, num_actions))

# Parameters
learning_rate = 0.1
discount_factor = 0.9
epsilon = 0.1  # Exploration rate
num_episodes = 1000
# Safety cap so a single episode can never loop without bound.
max_steps_per_episode = 100 * grid_size * grid_size
# Dedicated seeded generator: runs are reproducible and the global NumPy
# random state is left untouched.
random_seed = 0
rng = np.random.default_rng(random_seed)

# Define actions as (row delta, column delta)
actions = {
    0: (-1, 0),  # Up
    1: (1, 0),   # Down
    2: (0, -1),  # Left
    3: (0, 1),   # Right
}

# Define rewards and terminal state
goal_state = (grid_size - 1, grid_size - 1)  # Bottom-right corner
reward_table = np.zeros((grid_size, grid_size))
reward_table[goal_state] = 10  # Reward for reaching the goal

# Episodes may start on any cell except the terminal one.  Starting on the
# goal would make the agent act from (and write values into) the terminal
# state, which then corrupts the bootstrap term of every transition into it.
start_states = [
    (row, col)
    for row in range(grid_size)
    for col in range(grid_size)
    if (row, col) != goal_state
]

# Q-learning algorithm
for episode in range(num_episodes):
    state = start_states[int(rng.integers(len(start_states)))]

    for step in range(max_steps_per_episode):
        if rng.random() < epsilon:
            action = int(rng.integers(num_actions))  # Explore action space
        else:
            # Exploit learned values, breaking ties uniformly at random.
            # A plain argmax always returns action 0 (Up) on an untrained
            # all-zero row, which pushes the agent away from the goal.
            action_values = q_table[state]
            best_actions = np.flatnonzero(action_values == action_values.max())
            action = int(best_actions[rng.integers(best_actions.size)])

        # Take action; moves that would leave the grid are clamped, so the
        # agent stays on the border cell.
        row_delta, col_delta = actions[action]
        next_state = (
            max(0, min(state[0] + row_delta, grid_size - 1)),
            max(0, min(state[1] + col_delta, grid_size - 1)),
        )

        reward = reward_table[next_state]
        reached_goal = next_state == goal_state

        # No future return exists beyond the terminal state, so do not
        # bootstrap from it.
        future_value = 0.0 if reached_goal else q_table[next_state].max()

        # Update Q-value using the Q-learning formula
        td_target = reward + discount_factor * future_value
        q_table[state[0], state[1], action] += learning_rate * (
            td_target - q_table[state[0], state[1], action]
        )

        state = next_state

        if reached_goal:  # Episode terminates at the goal
            break

# Print the final Q-table
print("Final Q-table:")
print(q_table)