<a href="https://colab.research.google.com/github/ayush-09/Autonomous-Taxi-Agent/blob/master/AutonomousTaxiRL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import gym
import random
from IPython.display import clear_output

In [None]:
# Create the Taxi-v3 environment; this single `env` object is reused by the
# random rollout, the Q-learning training loop, and the visualization below.
env = gym.make("Taxi-v3")

In [None]:
# Baseline: let a purely random agent play `episodes` episodes and report the
# total reward collected in each one.
episodes = 10

# Episodes are numbered 1..episodes inclusive; the upper bound must be
# `episodes + 1`, otherwise only `episodes - 1` episodes are played.
for episode in range(1, episodes + 1):
    state = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        # Sample a uniformly random action and advance the environment one step.
        state, reward, done, info = env.step(env.action_space.sample())
        score += reward
        # Replace the previous frame instead of stacking frames in the output.
        clear_output(wait=True)
    print('Episode: {}\nScore: {}'.format(episode, score))

env.close()

Episode: 9
Score: -839


In [None]:
# Build the Q-table: one row per discrete state, one column per discrete
# action, every entry initialised to zero.
state, actions = env.observation_space.n, env.action_space.n
q_table = np.zeros(shape=(state, actions))

In [None]:
# Show the table dimensions as (number of states, number of actions).
np.shape(q_table)

(500, 6)

In [None]:
# Hyperparameters for tabular Q-learning

# Training budget: number of episodes and the step cap inside each episode.
num_episodes, max_steps_per_episode = 100000, 1000

# Step size of the Q-value update and discount applied to future reward.
learning_rate, discount_rate = 0.01, 0.99

# Epsilon-greedy schedule: epsilon starts at the maximum and is decayed
# exponentially per episode towards the minimum by the training loop.
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001
exploration_rate = max_exploration_rate

# Total reward of each training episode, appended to by the training loop.
rewards_all_episodes = list()


In [None]:
#Q-Learning Algorithm
# Tabular Q-learning with an epsilon-greedy behaviour policy. Each episode
# runs until the environment reports `done` or the step cap is reached; the
# episode's total reward is logged in `rewards_all_episodes`.

# Start every execution of this cell from an empty reward log and the initial
# exploration rate. Without this, re-running the cell appends a second run to
# the old log and skews the per-thousand-episode averages computed later.
# `q_table` is deliberately left untouched, so a re-run continues training.
rewards_all_episodes = []
exploration_rate = max_exploration_rate

for episode in range(num_episodes):
    state = env.reset()
    done = False
    rewards_current_episode = 0

    for step in range(max_steps_per_episode):

        #Exploration vs Exploitation trade-off
        # Draw u ~ U(0, 1): exploit the greedy action when u exceeds epsilon,
        # otherwise explore with a random action.
        exploration_threshold = random.uniform(0, 1)
        if exploration_threshold > exploration_rate:
            action = np.argmax(q_table[state, :])
        else:
            action = env.action_space.sample()

        new_state, reward, done, info = env.step(action)

        #Update Q-Table
        # Blend the old estimate with the one-step target
        # reward + discount_rate * max_a' Q(new_state, a').
        q_table[state, action] = q_table[state, action] * (1 - learning_rate) + \
            learning_rate * (reward + discount_rate * np.max(q_table[new_state, :]))

        state = new_state

        rewards_current_episode += reward

        if done:
            break

    # Exponentially decay epsilon from the max towards the min as a function
    # of the episode index.
    exploration_rate = min_exploration_rate + \
        (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)
    rewards_all_episodes.append(rewards_current_episode)

print("******* Training Finished *******")

******* Training Finished *******


In [None]:
# Display the Q-table as the cell output (rows index states, columns index actions).
q_table

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-2.13756027e+00, -1.77793802e+00, -2.35126344e+00,
        -1.89595480e+00,  9.62206970e+00, -6.34076025e+00],
       [ 5.65572060e-02,  4.08953681e-02, -7.78018420e-01,
         7.49836447e-01,  1.41188060e+01, -4.17070255e+00],
       ...,
       [-7.75673131e-01,  1.16089660e-02, -7.84419637e-01,
        -8.02865261e-01, -2.20794936e+00, -2.89553932e+00],
       [-2.14333214e+00, -2.14113662e+00, -2.14140305e+00,
        -4.46281925e-01, -4.81628309e+00, -4.32283739e+00],
       [-5.87597209e-02, -6.01628985e-02, -4.65099500e-02,
         8.74218689e+00, -2.90487248e-01, -1.00000000e-01]])

In [None]:
#Calculate and print average reward per thousand episodes
# Number of consecutive episodes averaged together in each printed row.
chunk_size = 1000

rewards_array = np.asarray(rewards_all_episodes)
# Derive the chunk count with integer division from the rewards actually
# recorded, rather than passing a float section count based on num_episodes.
# A trailing partial chunk (fewer than chunk_size episodes) is left out.
num_chunks = len(rewards_array) // chunk_size
rewards_per_thousand_episodes = list(
    rewards_array[:num_chunks * chunk_size].reshape(num_chunks, chunk_size)
)
count = chunk_size

print("Average per thousand episodes")
for r in rewards_per_thousand_episodes:
    # ndarray.mean() avoids the element-by-element Python sum of r/1000 and
    # the rounding noise it accumulates.
    print(count,  " : ", str(r.mean()))
    count += chunk_size

Average per thousand episodes
1000  :  -538.0270000000003
2000  :  -260.48699999999957
3000  :  -145.1780000000005
4000  :  -86.264
5000  :  -50.63699999999998
6000  :  -29.22999999999994
7000  :  -16.054000000000013
8000  :  -7.851000000000003
9000  :  -2.7350000000000145
10000  :  0.2650000000000015
11000  :  2.883999999999986
12000  :  4.447999999999978
13000  :  5.579999999999978
14000  :  5.973999999999975
15000  :  6.920999999999967
16000  :  6.874999999999976
17000  :  7.1059999999999715
18000  :  7.135999999999974
19000  :  7.275999999999974
20000  :  7.160999999999962
21000  :  7.197999999999971
22000  :  7.237999999999969
23000  :  7.339999999999967
24000  :  7.334999999999965
25000  :  7.414999999999972
26000  :  7.5369999999999635
27000  :  7.521999999999955
28000  :  7.2849999999999655
29000  :  7.509999999999965
30000  :  7.406999999999966
31000  :  7.353999999999957
32000  :  7.467999999999964
33000  :  7.411999999999964
34000  :  7.1969999999999645
35000  :  7.466999999

In [None]:
#Visualize Agent
# Replay three episodes with the greedy policy read from `q_table`, rendering
# every step and reporting whether the episode ended in a successful drop-off.
import time

# Taxi-v3 pays +20 on the step that delivers the passenger; every other step
# yields a negative reward. Testing for a reward of 1 never matches here, so
# successful episodes would be reported as failures.
SUCCESS_REWARD = 20

for episode in range(3):
    state = env.reset()
    done = False
    print("Episode is: " + str(episode))
    time.sleep(1)

    for step in range(max_steps_per_episode):
        clear_output(wait=True)
        env.render()
        time.sleep(0.4)

        # Always take the action with the highest learned Q-value.
        action = np.argmax(q_table[state,:])

        new_state, reward, done, info = env.step(action)

        if done:
            # Show the final frame before announcing the outcome.
            clear_output(wait=True)
            env.render()
            if reward == SUCCESS_REWARD:
                print("*****Reached Goal*****")
            else:
                print("*****Failed*****")
            time.sleep(2)
            clear_output(wait=True)
            break

        state = new_state

env.close()

+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)
*****Failed*****
