[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nihalgeorge01/DLSS_RL/blob/main/DLSS_RL_Q_Learning_Code.ipynb)

In [None]:
import gym
import numpy as np  
# Create the Taxi-v3 environment. `.env` takes the environment held inside the
# object returned by gym.make -- presumably to drop the step-limit wrapper so
# that episodes run until the task is actually finished (TODO confirm against
# the installed gym version).
env = gym.make('Taxi-v3').env 
# Draw the current state of the taxi grid in the cell output.
env.render()

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m:[43m [0m|
+---------+

491
Discrete(500)


In [None]:
# Baseline: play one episode with uniformly random actions and record every
# step so it can be replayed afterwards.
state = env.reset()
frames = []
# penalty counts the steps that left the state unchanged.
epochs = penalty = reward = 0
done = False  # becomes True once env.step reports the episode is over
while True:
    action = env.action_space.sample()
    new_state, reward, done, info = env.step(action)
    penalty += 1 if new_state == state else 0
    state = new_state

    # Snapshot of the step that was just taken (rendered after the move).
    frames.append(dict(frame=env.render(mode='ansi'), state=state, action=action, reward=reward))
    epochs += 1
    if done:
        break
print("Timesteps taken: {}".format(epochs))
print("Penalties incurred: {}".format(penalty))

Timesteps taken: 4304
Penalties incurred: 2381


In [None]:
from IPython.display import clear_output
from time import sleep
def print_frames(frames, delay=1):
    """Replay recorded episode frames in the cell output, one at a time.

    Each frame replaces the previous one in the output area, followed by the
    1-based timestep and the state, action and reward stored with it.

    Args:
        frames: iterable of dicts with the keys 'frame', 'state', 'action'
            and 'reward'.
        delay: seconds to pause after each frame. The default of 1 keeps the
            original pacing; pass a smaller value to get through long
            episodes quickly. Zero or a negative value disables the pause.
    """
    for timestep, frame in enumerate(frames, start=1):
        # wait=True defers clearing until new output arrives, avoiding flicker.
        clear_output(wait=True)
        print(frame['frame'])
        print(f"Timestep: {timestep}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        if delay > 0:
            sleep(delay)
# Replay the frames recorded in the random-agent cell above.
print_frames(frames)

+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | :[43m [0m| : |
|[34;1mY[0m| : |B: |
+---------+
  (East)

Timestep: 29
State: 349
Action: 2
Reward: -1


KeyboardInterrupt: ignored

# Q-Learning

In [None]:
import numpy as np
import random
# Tabular Q-learning: one row per environment state, one column per action.
q_table = np.zeros([env.observation_space.n, env.action_space.n])
# alpha: learning rate, gamma: discount factor, episodes: number of training
# episodes, epsilon: probability of taking a random (exploratory) action.
(alpha, gamma, episodes, epsilon) = (0.6, 0.9, 1000, 0.4)
total_epochs = 0
total_penalty = 0
illegal_episode = []  # per-episode count of steps that returned reward -10
for i in range(episodes):
  state = env.reset()
  epochs = 0
  penalty = 0   # steps that left the state unchanged
  illegal = 0   # steps rewarded with -10

  done = False
  while not done:
    # Epsilon-greedy action selection: explore with probability epsilon,
    # otherwise exploit the best action currently known for this state.
    if random.uniform(0, 1) < epsilon:
      action = env.action_space.sample()
    else:
      action = np.argmax(q_table[state])

    new_state, reward, done, info = env.step(action)
    if new_state == state:
      penalty += 1

    # -10 is the reward Taxi-v3 gives for an illegal pickup/drop-off
    # (per the Gym documentation).
    if reward == -10:
      illegal += 1

    # Q-learning update:
    #   target = reward + gamma * max_a Q(s', a)
    #   Q(s, a) <- Q(s, a) + alpha * (target - Q(s, a))
    #            = (1 - alpha) * Q(s, a) + alpha * target
    oldq = q_table[state, action]
    new_state_max = np.max(q_table[new_state])
    newq = (1 - alpha) * oldq + alpha * (reward + gamma * new_state_max)
    q_table[state, action] = newq

    state = new_state
    # Count each environment step exactly once (it used to be counted twice,
    # which doubled the reported average episode length).
    epochs += 1
  illegal_episode.append(illegal)
  total_penalty += penalty
  total_epochs += epochs

# Turn the accumulated totals into per-episode averages.
total_penalty /= episodes
total_epochs /= episodes

In [None]:
# Evaluate the learned policy: play one episode choosing, in every state, the
# action with the highest Q-value, and record each step for replay.
state = env.reset()
epochs = 0
penalty, reward = 0, 0  # penalty counts steps that left the state unchanged
frames = []
done = False
# The greedy policy is deterministic, so if the Q-table has not converged for
# the states visited it can cycle forever. Cap the rollout so the cell always
# terminates and `frames` cannot grow without bound.
max_eval_steps = 1000
while not done and epochs < max_eval_steps:
    action = np.argmax(q_table[state, :])
    new_state, reward, done, info = env.step(action)
    if new_state == state:
      penalty+=1
    state = new_state

    frames.append({'frame': env.render(mode='ansi'), 'state': state, 'action': action, 'reward': reward})
    epochs += 1
if not done:
    print("Stopped after {} steps without finishing the episode; the greedy policy may be looping.".format(epochs))
print("Timesteps taken: {}".format(epochs))
print("Penalties incurred: {}".format(penalty))

Timesteps taken: 12
Penalties incurred: 0


In [None]:
# Replay the frames recorded during the greedy-policy episode above.
print_frames(frames)

+---------+
|R: | : :[35m[34;1m[43mG[0m[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Timestep: 12
State: 85
Action: 5
Reward: 20
