# Implementation of tabular Q-learning in Python (CartPole with a discretized state space)

In [None]:
import gym 
import numpy as np

In [None]:
class QLearningAgent:
    """Tabular Q-learning agent over a discretized continuous state space.

    The Q-table has one axis per state variable (sized by ``num_state_bins``)
    followed by a final axis of length ``num_actions``.
    """

    def __init__(self, num_state_bins, num_actions, state_bins, alpha=0.1, gamma=0.99, epsilon=0.1):
        """
        Store the hyper-parameters and allocate a zero-initialized Q-table.

        Args:
            num_state_bins: Number of bins for each state variable; defines
                the leading dimensions of the Q-table.
            num_actions: Number of discrete actions (last Q-table dimension).
            state_bins: One sequence of bin edges per state variable, passed
                to ``np.digitize`` when discretizing.
            alpha: Learning rate.
            gamma: Discount factor.
            epsilon: Exploration rate for the epsilon-greedy policy.
        """
        self.num_state_bins = num_state_bins
        self.num_actions = num_actions
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.state_bins = state_bins
        self.q_table = np.zeros(tuple(self.num_state_bins) + (num_actions,))

    def discretize_state(self, state):
        """
        Discretize the continuous state into a tuple of bin indices.

        Values outside the range covered by the bin edges are clamped into
        the first or last bin, so the result is always a valid Q-table index.

        Args:
            state: Sequence of continuous values, one per state variable.

        Returns:
            Tuple of ints, each in ``[0, num_state_bins[i] - 1]``.
        """
        indices = []
        for i, value in enumerate(state):
            # np.digitize yields 0 below the first edge and len(edges) at or
            # above the last one; after the -1 shift those would become -1
            # (silently wrapping to the last bin) or an out-of-bounds index.
            raw_idx = int(np.digitize(value, self.state_bins[i])) - 1
            last_idx = self.num_state_bins[i] - 1
            indices.append(min(max(raw_idx, 0), last_idx))
        return tuple(indices)

    def choose_action(self, state):
        """
        Choose an action using an epsilon-greedy policy.

        Args:
            state: Discretized state tuple as returned by ``discretize_state``.

        Returns:
            A random action with probability ``epsilon``; otherwise the
            action with the highest Q-value for ``state``.
        """
        if np.random.uniform(0, 1) < self.epsilon:  # Explore
            return np.random.randint(0, self.num_actions)
        # Exploit
        return int(np.argmax(self.q_table[state]))

    def update_q_table(self, state, action, reward, next_state, done=False):
        """
        Update the Q-table using the Q-learning update rule.

        Args:
            state: Discretized state tuple in which ``action`` was taken.
            action: Index of the action taken.
            reward: Reward received for the transition.
            next_state: Discretized state tuple reached after the action.
            done: When True, ``next_state`` is treated as terminal and no
                future value is bootstrapped from it. Defaults to False,
                which always bootstraps from ``next_state``.
        """
        # A terminal state has no future return to bootstrap from.
        future_value = 0.0 if done else np.max(self.q_table[next_state])
        td_target = reward + self.gamma * future_value
        td_error = td_target - self.q_table[state + (action,)]
        self.q_table[state + (action,)] += self.alpha * td_error



In [None]:
# Create the CartPole environment.
# `env` supplies the action count for the agent and is stepped by the training loop below.
env = gym.make('CartPole-v1')

In [None]:
# State-space discretization: bin counts for the four state variables
# (position, velocity, angle, angular velocity).
num_state_bins = [6, 6, 12, 12]

# Each variable gets `count + 1` evenly spaced bin edges spanning its (low, high) range.
state_bins = [
    np.linspace(low, high, count + 1)
    for (low, high), count in zip(
        [
            (-2.4, 2.4),  # Position
            (-3.0, 3.0),  # Velocity
            (-0.5, 0.5),  # Angle
            (-2.0, 2.0),  # Angular velocity
        ],
        num_state_bins,
    )
]

In [None]:
# Build the Q-learning agent; the action count is read from the environment's action space.
num_actions = env.action_space.n
q_learning_agent = QLearningAgent(
    num_state_bins=num_state_bins,
    num_actions=num_actions,
    state_bins=state_bins,
)

In [None]:
# Train the Q-learning agent: run `num_episodes` episodes, updating the
# Q-table after every environment step and reporting every 50th episode.
num_episodes = 1000
try:
    for episode in range(num_episodes):
        reset_result = env.reset()
        # Newer Gym releases return (observation, info) from reset();
        # older ones return the observation alone.
        observation = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        state = q_learning_agent.discretize_state(observation)  # Discretize the initial state
        done = False
        total_reward = 0

        while not done:
            action = q_learning_agent.choose_action(state)
            step_result = env.step(action)
            if len(step_result) == 5:
                # Newer API: (obs, reward, terminated, truncated, info)
                next_observation, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                # Legacy API: (obs, reward, done, info)
                next_observation, reward, done, _ = step_result
            next_state = q_learning_agent.discretize_state(next_observation)  # Discretize next state
            q_learning_agent.update_q_table(state, action, reward, next_state)
            total_reward += reward
            state = next_state

        if (episode + 1) % 50 == 0:
            print(f"Episode {episode + 1}/{num_episodes}, Total Reward: {total_reward}")
finally:
    # Release the environment even if training is interrupted or raises.
    env.close()