# Q Learning
- It assigns Q-values for every action you could possibly take given a state. 
- Overtime, we will update these Q-values in such a way that running through a chain of action will produce a good result by rewarding the agents for the long term goal rather than intermediate actions.
- Model-free learning as it is applicable to any environment as long as the environment is simple enough.

In [2]:
%pip install numpy
%pip install gymnasium
%pip install matplotlib
%pip install pygame

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
import gymnasium as gym
import numpy as np

env = gym.make("MountainCar-v0", render_mode="human")
new_state, info = env.reset()
done = False


In [4]:
DISCRETE_OBS_SPACE_SIZE = [20] * len(env.observation_space.low)
DISCRETE_OBS_SPACE_WIN_SIZE = (env.observation_space.high - env.observation_space.low)/DISCRETE_OBS_SPACE_SIZE
LEARNING_RATE = 0.1
DISCOUNT = 0.95 # how much we value future rewards over current reward
EPISODES = 25000 # episode occurs when our process ends; reaches some end state
SHOW_EVERY = 2000
EPSILON = 0.5 # for randomness
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = EPISODES//2
EPSILON_DECAY_VALUE = EPSILON/(END_EPSILON_DECAYING-START_EPSILON_DECAYING)
q_table = np.random.uniform(low=-2,high=0,size=(DISCRETE_OBS_SPACE_SIZE + [env.action_space.n]))


In [5]:
print(DISCRETE_OBS_SPACE_SIZE)
print(DISCRETE_OBS_SPACE_WIN_SIZE)
print(env.observation_space.low)
print(env.observation_space.high)
print(env.action_space.n)
print(q_table)

[20, 20]
[0.09  0.007]
[-1.2  -0.07]
[0.6  0.07]
3
[[[-0.5100479  -0.20218611 -1.04084102]
  [-1.46933546 -0.14706098 -0.53830649]
  [-0.30963205 -0.00860614 -0.37297258]
  ...
  [-0.52902549 -0.93555592 -0.64082182]
  [-1.5638738  -1.75937503 -0.38843841]
  [-0.06763673 -1.99696178 -1.47865918]]

 [[-1.73323355 -1.37501487 -0.69316305]
  [-0.88193641 -1.30892738 -0.61739989]
  [-0.21601129 -0.51780371 -0.03710019]
  ...
  [-1.68498274 -1.05502746 -0.20513743]
  [-0.55150682 -1.13624278 -0.63302268]
  [-0.44509731 -0.35932756 -0.39179561]]

 [[-1.14987182 -0.62941582 -1.13278414]
  [-1.9829411  -0.83609049 -0.9172671 ]
  [-0.42478045 -0.00425793 -0.56000496]
  ...
  [-0.96813083 -1.93297776 -1.34490131]
  [-0.4495244  -1.20580283 -0.83607005]
  [-0.47501927 -1.95327807 -0.21810359]]

 ...

 [[-1.92649321 -1.10194413 -1.90667181]
  [-0.86869694 -0.31975025 -1.43219085]
  [-1.43009684 -0.75130575 -1.3419274 ]
  ...
  [-0.08112828 -1.7590338  -0.55867778]
  [-0.39563798 -1.83809184 -0.682

In [6]:
def get_discrete_state(state):
    discrete_state = (state - env.observation_space.low) / DISCRETE_OBS_SPACE_WIN_SIZE
    return tuple(discrete_state.astype(np.int32))

discrete_state = get_discrete_state(env.reset()[0])
print(discrete_state)
print(q_table[discrete_state])
print(np.argmax(q_table[discrete_state])) # returns index of value with highest score

(np.int32(7), np.int32(10))
[-0.29109562 -0.6808874  -1.49232351]
0


## Without Q-Learning

In [None]:
done = False
while not done:
    action = 2 # for my info, 0 pushes car left, 1 does nothing, 2 goes right. This enviro is not known by my algo initially
    new_state, reward, done, truncated, info = env.step(action)
    if done or truncated:
        new_state, info = env.reset()
env.close()

KeyboardInterrupt: 

: 

## With Q-Learning

In [1]:
done = False
for episode in range(EPISODES):
    print('here')
    discrete_state = get_discrete_state(env.reset()[0])
    while not done:
        action = np.argmax(q_table[discrete_state])
        new_state, reward, done, truncated, info = env.step(action)
        new_discrete_state = get_discrete_state(new_state)

        if not done:
            max_future_q = np.max(q_table[new_discrete_state])
            current_q = q_table[discrete_state (action,)]
            new_q = (1-LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT*max_future_q) 
            # We only get reward when we win otherwise it is none hence we depend entirely on end results
            q_table[discrete_state + (action,)] = new_q
            
        else:
            q_table[discrete_state + (action,)] = 0
        
        discrete_state = new_discrete_state
    if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
        EPSILON -= EPSILON_DECAY_VALUE

env.close()

NameError: name 'EPISODES' is not defined