<a href="https://colab.research.google.com/github/OneFineStarstuff/State-of-the-Art/blob/main/Reinforcement_Learning_(RL).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np

# --- Environment size ---
num_states = 5   # states are indexed 0 .. num_states - 1
num_actions = 2  # action 0 moves left, action 1 moves right

# --- Learning hyperparameters ---
learning_rate = 0.1    # weight given to the new estimate in each Q-value update
discount_factor = 0.9  # weight of future rewards relative to the immediate reward
epsilon = 0.1          # starting probability of choosing a random action

# One row per state and one column per action; every estimate starts at zero.
q_table = np.zeros(shape=(num_states, num_actions))

# Dummy environment function
def step(state, action, n_states=None):
    """Apply one move in the 1-D chain environment.

    Args:
        state: Index of the current state.
        action: 0 moves one state to the left; any other value moves one
            state to the right.
        n_states: Number of states in the chain. When omitted, the
            module-level ``num_states`` is used, so existing two-argument
            calls behave exactly as before.

    Returns:
        A ``(next_state, reward)`` tuple. ``next_state`` is clamped to the
        range ``0 .. n_states - 1``; ``reward`` is +1 when ``next_state`` is
        the last state and -1 otherwise.
    """
    if n_states is None:
        n_states = num_states  # fall back to the global environment size
    last_state = n_states - 1

    if action == 0:  # Move left, stopping at the first state
        next_state = max(0, state - 1)
    else:  # Move right, stopping at the last state
        next_state = min(last_state, state + 1)
    # Reward is +1 for reaching the final state, -1 otherwise
    reward = 1 if next_state == last_state else -1
    return next_state, reward

# Q-learning training: 1000 episodes, each starting from a uniformly random state
for episode in range(1000):
    state = np.random.randint(0, num_states)

    # An episode ends as soon as the agent stands on the last state
    while state != num_states - 1:
        # Epsilon-greedy selection: a random action with probability epsilon,
        # otherwise the action with the highest current Q-value
        explore = np.random.rand() < epsilon
        action = np.random.randint(0, num_actions) if explore else np.argmax(q_table[state])

        # Apply the action to the environment
        next_state, reward = step(state, action)

        # Blend the old estimate with the one-step target
        #   reward + discount_factor * max over actions of Q(next_state, action)
        old_value = q_table[state, action]
        target = reward + discount_factor * np.max(q_table[next_state])
        q_table[state, action] = (1 - learning_rate) * old_value + learning_rate * target

        state = next_state

    # Shrink the exploration rate by 1% per episode, never going below 0.01
    epsilon = max(0.01, 0.99 * epsilon)

    # Print a snapshot of the table every 100th episode (0, 100, 200, ...)
    if not episode % 100:
        print(f"Episode {episode}, Q-table:")
        print(q_table)

# Output the final Q-table
print("Trained Q-table:")
print(q_table)

# Test the greedy policy after training, starting from the first state
print("\nTesting optimal policy:")
state = 0
# Both step() and the argmax policy are deterministic, so returning to a state
# that was already visited means the walk would repeat forever (for example,
# an all-zero Q-table row makes argmax pick action 0, which keeps the agent at
# state 0). Remember visited states and stop instead of hanging.
visited_states = set()
while state != num_states - 1:
    if state in visited_states:
        print(f"Greedy policy returned to state {state} without reaching the final state; stopping.")
        break
    visited_states.add(state)
    action = np.argmax(q_table[state])  # Choose the best action for the current state
    next_state, reward = step(state, action)
    print(f"State: {state}, Action: {action}, Next State: {next_state}, Reward: {reward}")
    state = next_state