In [None]:
import gym
import time
import math
import numpy as np
import random
import time


In [None]:
# Build the CartPole-v1 environment used by every cell below.
# render_mode="human" requests on-screen rendering of the simulation.
env = gym.make("CartPole-v1", render_mode="human")

In [None]:
# Show the installed gym version. The training loop unpacks env.reset() as a
# sequence whose first item is the observation and env.step() as a 5-tuple,
# so the version matters when reproducing this notebook.
gym.version.VERSION

In [None]:
# --- State discretisation ---------------------------------------------------
# Number of bins for the two observation components that discretizer() keeps:
# (angle, pole_velocity).
bucketno = (6, 3)

# Range covered by the bins. The first entry comes from component 2 of the
# environment's observation space; the second is fixed at +/- 50 degrees
# expressed in radians.
lower_bound = [env.observation_space.low[2], -math.radians(50)]
upper_bound = [env.observation_space.high[2], math.radians(50)]

# --- Q-table ----------------------------------------------------------------
totalAction = env.action_space.n
# One row of action values per discretised state: shape (6, 3, totalAction).
q_table = np.zeros((*bucketno, totalAction))

# --- Training hyper-parameters ----------------------------------------------
# NOTE(review): the training loop does not pass this value to error(); it
# runs the update with a discount of 1.
gamma = 0.9
max_episode = 10_000   # upper limit on the number of training episodes

# --- Stopping criterion -----------------------------------------------------
solved_time = 199      # an episode counts toward the streak if it lasts >= this many steps
streak_to_end = 120    # training stops once the streak exceeds this value
no_streak = 0          # current number of consecutive qualifying episodes

In [None]:
from typing import Tuple
from sklearn.preprocessing import KBinsDiscretizer

# Fitted discretizers, keyed by the bucket/bound configuration they were
# built from, so a changed configuration gets a freshly fitted instance.
_DISCRETIZER_CACHE = {}


def _fitted_discretizer():
    """Return a KBinsDiscretizer fitted on [lower_bound, upper_bound], fitting at most once per configuration."""
    key = (
        tuple(bucketno),
        tuple(float(bound) for bound in lower_bound),
        tuple(float(bound) for bound in upper_bound),
    )
    kbin = _DISCRETIZER_CACHE.get(key)
    if kbin is None:
        kbin = KBinsDiscretizer(n_bins=bucketno, encode="ordinal", strategy="uniform")
        kbin.fit([lower_bound, upper_bound])
        _DISCRETIZER_CACHE[key] = kbin
    return kbin


def discretizer(_, __, angle, pole_velocity) -> Tuple[int, ...]:
    """Map a continuous observation to a discrete state usable as a q_table index.

    The signature matches an unpacked 4-component observation
    (``discretizer(*obs)``); the first two components are ignored.

    Parameters
    ----------
    _, __ : ignored
        First two observation components; not used for the state.
    angle : float
        Third observation component.
    pole_velocity : float
        Fourth observation component.

    Returns
    -------
    tuple of int
        One bin index per retained component, produced by a uniform-width
        KBinsDiscretizer with ``bucketno`` bins fitted on the two points
        ``lower_bound`` and ``upper_bound``.

    Notes
    -----
    The discretizer used to be constructed and fitted on every call, i.e.
    once per environment step. The bins depend only on the configuration, so
    the fitted instance is now reused; the transform itself is unchanged.
    """
    binned = _fitted_discretizer().transform([[angle, pole_velocity]])[0]
    return tuple(map(int, binned))

In [None]:
def action_selection(state, epsilon):
    """Choose an action epsilon-greedily.

    With probability ``epsilon`` a random action is sampled from the
    environment's action space; otherwise the action with the highest
    value in ``q_table[state]`` is returned.
    """
    explore = random.random() < epsilon
    if explore:
        return env.action_space.sample()
    return np.argmax(q_table[state])

In [None]:
def error(new_state, reward, gamma):
    """Compute the Q-learning target for a transition into ``new_state``.

    Despite the name, the first returned value is the target
    ``reward + gamma * max(q_table[new_state])``, not a difference from the
    current estimate.

    Returns
    -------
    tuple
        ``(target, best_next_value)`` where ``best_next_value`` is the
        largest action value stored for ``new_state``.
    """
    best_next_value = q_table[new_state].max()
    target = reward + gamma * best_next_value
    return target, best_next_value

In [None]:
def exploration_rate(n, min_rate = 0.01):
    """Return the exploration probability (epsilon) for episode ``n``.

    The schedule is ``1 - log10((n + 1) / 25)``, capped at 1.0 from above
    and at ``min_rate`` from below.
    """
    decayed = 1.0 - math.log10((n + 1) / 25)
    upper_clamped = min(1.0, decayed)
    return max(min_rate, upper_clamped)

In [None]:
def learning_rate(n, min_rate = 0.1):
    """Return the learning rate (alpha) for episode ``n``.

    The schedule is ``1 - log10((n + 1) / 25)``, capped at 1.0 from above
    and at ``min_rate`` from below.
    """
    schedule = 1.0 - math.log10((n + 1) / 25)
    # Upper cap first, then the floor, in the same order as min()/max().
    capped = schedule if schedule < 1.0 else 1.0
    return capped if capped > min_rate else min_rate

In [None]:

# Train the Q-table: run episodes until the streak criterion is met or
# max_episode episodes have been played.

# The update has always been run with a discount of 1 (the literal that was
# passed to error()). Keep that behaviour, but name the value so the log
# below reports the discount actually applied.
discount = 1

# Restart the streak so re-running this cell does not inherit a stale count.
no_streak = 0

try:
    for v in range(max_episode):
        obs = env.reset()[0]
        time_step = 0
        initial_state = discretizer(*obs)
        current_state = initial_state

        # Both schedules depend only on the episode index, so compute them
        # once per episode rather than once per step.
        epsilon = exploration_rate(v)
        alpha = learning_rate(v)

        done = False
        while not done:
            action = action_selection(current_state, epsilon)
            obs, reward, terminated, truncated, info = env.step(action)
            # An episode is over when the environment reports failure
            # (terminated) or a time limit (truncated); stepping must stop
            # in either case.
            done = terminated or truncated

            new_state = discretizer(*obs)
            # NOTE(review): the target still bootstraps from new_state when
            # the episode has ended; left as-is to keep the learning dynamics
            # unchanged.
            error_value, best_q_value = error(new_state, reward, gamma=discount)
            old_q_value = q_table[current_state][action]
            q_table[current_state][action] = (1 - alpha) * old_q_value + alpha * error_value
            current_state = new_state
            time_step += 1
            env.render()

            # Per-step debug log. "Current state" is the state just entered.
            print("The episode is {}.".format(v))
            print("Current state is {}".format(current_state))
            print("Action is {}".format(action))
            print("Error value is {}".format(error_value))
            print("Old q value is {}".format(old_q_value))
            print("Best q value is {}".format(best_q_value))
            print("Gamma value is {}".format(discount))
            print("Truncated is {}".format(truncated))

        # Count consecutive episodes that lasted at least solved_time steps.
        if time_step >= solved_time:
            no_streak += 1
        else:
            no_streak = 0
        if no_streak > streak_to_end:
            print('CartPole problem is solved after {} episodes'.format(v))
            break
finally:
    # Release the environment even if training is interrupted or fails.
    env.close()
