<a href="https://colab.research.google.com/github/OneFineStarstuff/State-of-the-Art/blob/main/Reinforcement_Learning_(RL).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# GridWorld Environment
class GridWorld:
    """Deterministic grid environment addressed by ``(x, y)`` cells.

    Actions are integer codes: ``0`` moves up (``y - 1``), ``1`` moves right
    (``x + 1``), ``2`` moves down (``y + 1``) and any other value moves left
    (``x - 1``).  A move into an obstacle or outside the grid leaves the agent
    where it is.
    """

    def __init__(self, size, start, goal, obstacles):
        """Store the grid layout and place the agent on ``start``.

        Args:
            size: ``(width, height)`` extent of the grid.
            start: Cell the agent occupies after construction and ``reset()``.
            goal: Cell whose arrival ends the episode.
            obstacles: Container of cells that cannot be entered.
        """
        self.size, self.start, self.goal = size, start, goal
        self.obstacles = obstacles
        self.state = start

    def reset(self):
        """Move the agent back to the start cell and return that cell."""
        self.state = self.start
        return self.state

    def _proposed_cell(self, action):
        """Return the cell ``action`` points at, ignoring walls and obstacles."""
        col, row = self.state
        if action == 0:  # up
            return (col, row - 1)
        if action == 1:  # right
            return (col + 1, row)
        if action == 2:  # down
            return (col, row + 1)
        return (col - 1, row)  # left (also the fallback for unknown codes)

    def _is_blocked(self, cell):
        """Return True when ``cell`` is an obstacle or lies outside the grid."""
        if cell in self.obstacles:
            return True
        col, row = cell[0], cell[1]
        inside = (
            col >= 0
            and row >= 0
            and col < self.size[0]
            and row < self.size[1]
        )
        return not inside

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward, done)``.

        The reward is ``10`` for the move that lands on the goal and ``-1``
        for every other move.  Stepping while already on the goal changes
        nothing and yields reward ``0`` with ``done`` set.
        """
        current = self.state
        if current == self.goal:
            return current, 0, True

        candidate = self._proposed_cell(action)
        if self._is_blocked(candidate):
            # Illegal move: the agent stays put but still pays the step cost.
            candidate = current

        self.state = candidate
        arrived = candidate == self.goal
        if arrived:
            reward = 10
        else:
            reward = -1
        return candidate, reward, arrived

# Q-learning Algorithm
class QLearning:
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    The Q-table has one axis per entry of ``env.size`` followed by a final
    axis holding one value per action, so a state tuple indexes directly
    into the table.
    """

    def __init__(
        self,
        env,
        alpha=0.1,
        gamma=0.99,
        epsilon=1.0,
        epsilon_min=0.1,
        epsilon_decay=0.995,
        n_actions=4,
    ):
        """Create an agent with an all-zero Q-table.

        Args:
            env: Environment exposing ``size``, ``reset()`` and
                ``step(action) -> (next_state, reward, done)``.
            alpha: Learning rate applied to the TD error.
            gamma: Discount factor for the bootstrapped next-state value.
            epsilon: Initial probability of picking a random action.
            epsilon_min: Lower bound that ``epsilon`` decays towards.
            epsilon_decay: Factor ``epsilon`` is multiplied by after each
                training episode.
            n_actions: Number of discrete actions (length of the last
                Q-table axis).
        """
        self.env = env
        # tuple() lets ``env.size`` be a list as well as a tuple.
        self.q_table = np.zeros(tuple(env.size) + (n_actions,))
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay

    def choose_action(self, state):
        """Return an action for ``state`` using the epsilon-greedy rule."""
        if np.random.rand() < self.epsilon:
            # Explore: uniform over however many actions the table holds.
            return int(np.random.randint(self.q_table.shape[-1]))
        # Exploit: first action with the highest Q-value.
        return int(np.argmax(self.q_table[state]))

    def update(self, state, action, reward, next_state, done=False):
        """Apply one Q-learning update for an observed transition.

        Args:
            state: State the action was taken in.
            action: Index of the action taken.
            reward: Reward received for the transition.
            next_state: State reached after the action.
            done: True when ``next_state`` is terminal; the target then
                excludes the bootstrapped next-state value.
        """
        future_value = 0.0 if done else np.max(self.q_table[next_state])
        td_target = reward + self.gamma * future_value
        td_error = td_target - self.q_table[state][action]
        # q_table[state] is a view, so this writes into the table itself.
        self.q_table[state][action] += self.alpha * td_error

    def train(self, episodes, max_steps=None, verbose=True):
        """Run ``episodes`` training episodes, decaying epsilon after each.

        Args:
            episodes: Number of episodes to run.
            max_steps: Optional cap on steps per episode; ``None`` runs each
                episode until the environment reports ``done``.
            verbose: When True, print steps and total reward per episode.
        """
        for episode in range(episodes):
            state = self.env.reset()
            done = False
            steps = 0
            total_reward = 0
            while not done and (max_steps is None or steps < max_steps):
                action = self.choose_action(state)
                next_state, reward, done = self.env.step(action)
                self.update(state, action, reward, next_state, done)
                state = next_state
                steps += 1
                total_reward += reward
            # Decay epsilon
            self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)
            # Log progress
            if verbose:
                print(f"Episode {episode+1}: Steps = {steps}, Total Reward = {total_reward}")

    def test_agent(self, max_steps=None):
        """Follow the greedy policy from a reset and return the visited states.

        Args:
            max_steps: Maximum number of steps to take.  Defaults to the
                number of states in the Q-table: a fixed greedy policy in a
                deterministic environment that has not finished by then has
                revisited a state and would cycle forever.

        Returns:
            List of states starting with the reset state.  The last entry is
            the terminal state only if the episode finished within the cap.
        """
        if max_steps is None:
            max_steps = self.q_table.size // self.q_table.shape[-1]
        state = self.env.reset()
        done = False
        path = [state]
        steps = 0
        while not done and steps < max_steps:
            action = int(np.argmax(self.q_table[state]))
            state, _, done = self.env.step(action)
            path.append(state)
            steps += 1
        return path

    def visualize_q_table(self):
        """Show a heat map of the highest Q-value in every cell.

        The table is indexed ``[x, y]`` while ``imshow`` draws the first array
        axis vertically, so the values are transposed to put x on the
        horizontal axis and y on the vertical axis (y grows downwards).
        """
        max_q_values = np.max(self.q_table, axis=-1)
        plt.imshow(max_q_values.T, cmap='viridis')
        plt.colorbar()
        plt.xlabel("x")
        plt.ylabel("y")
        plt.title("State-Action Values (Max Q)")
        plt.show()

# Example usage: a 5x5 grid with the start and goal in opposite corners and
# three obstacles on the diagonal between them.
env = GridWorld((5, 5), (0, 0), (4, 4), [(i, i) for i in range(1, 4)])
agent = QLearning(env)
agent.train(500)

# Roll out the greedy policy of the trained agent and report the cells visited.
path = agent.test_agent()
print(f"Path to goal: {path}")

# Plot the highest Q-value of every cell as a heat map.
agent.visualize_q_table()