In [None]:
import numpy as np
import gym #Open AI library
import time
import math

In [None]:
# Create the CartPole-v1 environment with an on-screen ("human") render mode.
# NOTE(review): with a human render mode Gym is expected to draw every step,
# which would slow a long training run considerably - confirm against the
# installed Gym version.
env = gym.make(
    "CartPole-v1",
    render_mode="human",
)

# Report the size of the discrete action space.
print(env.action_space.n)

In [None]:
# --- Q-learning update parameters ---
lr = 0.1      # learning rate: weight of each temporal-difference correction
gamma = 0.95  # discount factor applied to the best next-state Q-value

# --- Training schedule and running statistics ---
# The training loop iterates over range(epochs + 1), i.e. one episode per value.
epochs = 60000
# Accumulators / bookkeeping updated by the training loop.
total_time = total_reward = prev_reward = 0

# --- State discretization ---
# Number of bins per observation dimension (also the leading Q-table axes).
Observation = [30, 30, 50, 50]
# Bin width per observation dimension; observations are divided by this.
step_size = np.array([0.25, 0.25, 0.01, 0.01])

# --- Epsilon-greedy exploration ---
epsilon = 1                   # initial probability of taking a random action
epsilon_decay_value = 0.9995  # base of the exponential decay applied to epsilon

In [None]:
# Q-table: one axis per discretized observation dimension followed by one axis
# over the actions, initialised with uniform random values drawn from [0, 1).
q_table = np.random.uniform(
    low=0,
    high=1,
    size=(*Observation, env.action_space.n),
)

In [None]:
# Discretizes the continuous state returned by the OpenAI Gym environment.
def discrete_state(state):
    """Convert a continuous observation into a tuple of Q-table indices.

    Each component is divided by the matching entry of ``step_size``, shifted
    by a fixed per-dimension offset and truncated toward zero by the integer
    cast. Before the cast the values are saturated to the index range NumPy
    accepts for the corresponding ``Observation`` axis (``-bins`` to
    ``bins - 1``), so an extreme reading selects an edge bin instead of
    raising ``IndexError`` when the tuple is used to index ``q_table``.

    Negative indices are deliberately left in place: the offsets do not lift
    every value above zero, and NumPy resolves such indices by counting from
    the end of the axis. Every index that was valid before clipping is
    returned unchanged.

    Args:
        state: array-like with one value per entry of ``step_size``
            (presumably the 4-component CartPole observation - confirm
            against the environment).

    Returns:
        tuple of NumPy integers, one per observation dimension.
    """
    bins = np.asarray(Observation)
    scaled = np.asarray(state) / step_size + np.array([15, 12, 1, 10])
    # Clip while the values are still floats so very large readings cannot
    # overflow during the integer cast.
    bounded = np.clip(scaled, -bins, bins - 1)
    return tuple(bounded.astype(int))

In [None]:
# Tabular Q-learning training loop.
#
# Each iteration of the outer loop is one episode: the environment is reset,
# actions are chosen epsilon-greedily from q_table until the episode ends, and
# the Q-table is updated after every non-final step. Epsilon decay and the
# time/reward statistics are handled once per episode, after the inner loop.
for epoch in range(epochs + 1):
    t_initial = time.time()

    # env.reset() returns a tuple; its first element is the initial observation.
    discrete_state_var = discrete_state(env.reset()[0])

    done = False  # becomes True once the episode is terminated or truncated
    epoch_reward = 0

    if epoch % 1000 == 0:  # progress report every 1000 episodes
        print("Episode: " + str(epoch))

    while not done:
        if np.random.random() > epsilon:
            # Exploitation: take the action with the highest Q-value for this state.
            action = np.argmax(q_table[discrete_state_var])
        else:
            # Exploration: take a uniformly random action.
            action = np.random.randint(0, env.action_space.n)

        # The fifth value returned by step() is the info dict, not a "done"
        # flag; the episode is over when it is either terminated or truncated.
        new_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        epoch_reward += reward
        new_discrete_state = discrete_state(new_state)

        if epoch % 1000 == 0:
            # NOTE(review): the environment is created with a "human" render
            # mode, which presumably already draws every step; confirm whether
            # this explicit call is still needed.
            env.render()

        if not done:
            # Q-learning update: move the current estimate toward
            # reward + gamma * (best Q-value of the next state).
            # The final transition of an episode is skipped.
            max_new_q = np.max(q_table[new_discrete_state])
            current_q = q_table[discrete_state_var + (action,)]
            new_q = current_q + lr * (reward + (gamma * max_new_q) - current_q)
            q_table[discrete_state_var + (action,)] = new_q

        discrete_state_var = new_discrete_state

    # ---- Per-episode bookkeeping (runs once, after the episode has ended) ----

    # Decay epsilon only after the first 10000 episodes, only while it is above
    # the 0.05 floor, and only when this episode beat the previous one.
    if epsilon > 0.05:
        if epoch_reward > prev_reward and epoch > 10000:
            epsilon = math.pow(epsilon_decay_value, epoch - 10000)
        if epoch % 500 == 0:
            print("Epsilon: " + str(epsilon))

    # Accumulate wall-clock time spent on this episode.
    tfinal = time.time()
    episode_total_time = tfinal - t_initial
    total_time += episode_total_time

    # Accumulate the episode return and remember it for the next comparison.
    total_reward += epoch_reward
    prev_reward = epoch_reward

    if epoch % 1000 == 0:
        # Averages over the reporting window. The divisor is fixed at 1000, so
        # the very first report (epoch 0) covers a single episode only.
        mean_time = total_time / 1000
        print("Average Time: " + str(mean_time))
        total_time = 0
        mean_reward = total_reward / 1000
        print("Average Reward: " + str(mean_reward))
        total_reward = 0

env.close()