<a href="https://colab.research.google.com/github/OneFineStarstuff/State-of-the-Art/blob/main/Reinforcement_Learning_(RL).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade numpy
!pip install --upgrade gym

In [None]:
import gym
import numpy as np

# Initialize the environment
# Create the 'CartPole-v1' environment through gym.make. The rest of the
# script uses `env` for its observation/action spaces and for the
# reset / step / close calls.
env = gym.make('CartPole-v1')

# Discretization parameters
num_bins = 10  # Number of discrete bins for each state dimension
# One array of `num_bins` evenly spaced edges per observation dimension.
state_bins = [
    np.linspace(-4.8, 4.8, num_bins),       # Cart position
    np.linspace(-3.0, 3.0, num_bins),       # Cart velocity
    np.linspace(-0.418, 0.418, num_bins),   # Pole angle
    np.linspace(-2.0, 2.0, num_bins)        # Pole angular velocity
]


# Function to discretize continuous states
def discretize_state(state):
    """Map a continuous observation to a tuple of bin indices.

    Args:
        state: Sequence of floats, one per entry of ``state_bins``.

    Returns:
        A tuple of Python ints, one per element of ``state``. Every index is
        in ``range(len(state_bins[i]))``, so the tuple can index an array
        whose axes each have ``num_bins`` entries.
    """
    indices = []
    for i, value in enumerate(state):
        edges = state_bins[i]
        # np.digitize yields 0..len(edges): 0 below the first edge and
        # len(edges) at or above the last one. The latter is one past the
        # final valid slot, so fold that overflow into the last index.
        indices.append(min(int(np.digitize(value, edges)), len(edges) - 1))
    return tuple(indices)

# Initialize Q-table
# One axis of `num_bins` entries per observation dimension, followed by a
# final axis with one entry per discrete action; all values start at zero.
action_space = env.action_space.n
state_space = (num_bins,) * env.observation_space.shape[0]
q_table = np.zeros((*state_space, action_space))

# Hyperparameters
learning_rate, discount_factor = 0.1, 0.99  # update step size, weight on future value
epsilon, min_epsilon = 1.0, 0.01            # starting and minimum exploration probability
epsilon_decay = 0.995                       # factor applied to epsilon after each episode

# Q-learning training loop
# Runs 1000 episodes of tabular Q-learning with epsilon-greedy exploration,
# printing the undiscounted return of every episode. The environment is
# closed in a `finally` so it is released even if training is interrupted.
try:
    for episode in range(1000):
        # Newer Gym returns (observation, info) from reset(); older versions
        # return the bare observation. Accept both.
        obs = env.reset()
        if isinstance(obs, tuple):
            obs = obs[0]

        state = discretize_state(obs)
        done = False
        total_reward = 0

        while not done:
            # Epsilon-greedy action selection
            if np.random.rand() < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(q_table[state])

            # Perform action and observe result
            result = env.step(action)
            if len(result) == 5:
                # Current API: separate terminated / truncated flags.
                next_state_raw, reward, terminated, truncated, _ = result
            else:
                # Legacy API: a single `done` flag; a time-limit cut-off is
                # reported through info["TimeLimit.truncated"].
                next_state_raw, reward, legacy_done, info = result
                truncated = bool(info.get("TimeLimit.truncated", False))
                terminated = bool(legacy_done) and not truncated
            next_state = discretize_state(next_state_raw)
            done = terminated or truncated
            total_reward += reward

            # Q-value update
            # A terminated episode has no future return, so do not bootstrap
            # from the next state's values. A truncated episode was merely
            # cut short, so bootstrapping is still appropriate there.
            if terminated:
                future_value = 0.0
            else:
                future_value = np.max(q_table[next_state])
            td_target = reward + discount_factor * future_value
            q_index = state + (action,)
            q_table[q_index] = (1 - learning_rate) * q_table[q_index] + \
                               learning_rate * td_target

            state = next_state

        # Update epsilon
        epsilon = max(min_epsilon, epsilon * epsilon_decay)
        print(f"Episode {episode + 1}, Total Reward: {total_reward}")

    print("Training finished.")
finally:
    env.close()