In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output
import time

In [2]:
# Create the Taxi-v3 environment; the later cells all reset and step this same object.
env = gym.make('Taxi-v3')

In [3]:
episodes = 10

# Baseline: play a few episodes with randomly sampled actions and print each score.
for episode in range(episodes):
    state = env.reset()
    done = False
    score = 0
    # Keep stepping until env.step reports that the episode is done.
    while not done:
        env.render()
        state, reward, done, info = env.step(env.action_space.sample())
        score += reward
        # Replace the previous frame instead of stacking output below it.
        clear_output(wait=True)
    print('Episode: {} \nScore: {}'.format(episode + 1, score))
# NOTE: env is deliberately not closed here. The training and evaluation cells
# below keep calling env.reset()/env.step() on this same object; the final
# evaluation cell closes it once it is no longer needed.

Episode: 10 
Score: -767


In [4]:
# Sizes of the action and observation spaces (their `.n`); used as the Q-table dimensions.
num_actions = env.action_space.n
num_states = env.observation_space.n

In [5]:
# Q-table initialised to zeros: one row per state, one column per action.
q_table = np.zeros((num_states,num_actions))

In [6]:
# Sanity check: the shape should be (num_states, num_actions).
q_table.shape

(500, 6)

In [7]:
"""Hyperparameters"""
# --- Training length ---
num_episodes = 20000             # number of training episodes
max_steps_per_episode = 200      # upper bound on steps taken in one episode

# --- Q-value update ---
alpha = 0.5                      # step size applied to the update error
gamma = 0.999                    # weight on the best value of the next state

# --- Exploration schedule (epsilon-greedy) ---
max_eps = 1                      # exploration rate at episode 0
min_eps = 0.01                   # floor the exploration rate decays towards
eps_decay_rate = 0.001           # exponential decay rate per episode
epsilon = max_eps                # current exploration rate; starts at the maximum

# Total reward of each training episode, appended to by the training loop.
rewards_all_episodes = []

In [8]:
"""Epsilon Greedy Action Selection"""
def policy(state,q_table,epsilon):
    """Epsilon-greedy action selection.

    Args:
        state: Row index into ``q_table`` for the current state.
        q_table: 2-D array of action values, indexed ``[state, action]``.
        epsilon: Probability of taking a random action instead of the
            greedy one.

    Returns:
        An action index. With probability ``epsilon`` it is drawn by
        ``np.random.randint`` over the table's columns; otherwise it is the
        ``argmax`` of row ``state`` (first index on ties).
    """
    if np.random.random() < epsilon:
        # Take the action count from the table itself rather than a notebook
        # global, so the function depends only on its arguments.
        return np.random.randint(q_table.shape[1])
    return np.argmax(q_table[state, :])

In [9]:
def decrement_epsilon(epsilon,max_eps,min_eps,eps_decay_rate,episode):
    """Return the exploration rate for a given episode index.

    The rate decays exponentially from ``max_eps`` (at ``episode == 0``)
    towards ``min_eps``:

        min_eps + (max_eps - min_eps) * exp(-eps_decay_rate * episode)

    Note: ``epsilon`` is accepted but not used; the result depends only on
    the schedule parameters and ``episode``.
    """
    span = max_eps - min_eps
    decay = np.exp(-eps_decay_rate * episode)
    return min_eps + span * decay

In [10]:
# Start each run of this cell with an empty reward log and the exploration rate
# at its maximum. Without this, re-running the cell appends another
# num_episodes entries to the old log and the per-thousand report below no
# longer lines up. q_table is NOT re-initialised here, so a re-run continues
# learning from the current table.
epsilon = max_eps
rewards_all_episodes = []

for episode in range(num_episodes):
    state = env.reset()
    done = False
    rewards_current_episode = 0

    for step in range(max_steps_per_episode):
        action = policy(state, q_table, epsilon)
        new_state, reward, done, info = env.step(action)

        # Q-learning update: move Q(s, a) towards
        # reward + gamma * max_a' Q(s', a') by a fraction alpha.
        qsa = q_table[state, action]
        next_qsa = np.max(q_table[new_state, :])
        q_table[state, action] = qsa + alpha * (reward + gamma * next_qsa - qsa)

        state = new_state
        rewards_current_episode += reward
        if done:
            break

    # The exploration rate is a function of the episode index only.
    epsilon = decrement_epsilon(epsilon, max_eps, min_eps, eps_decay_rate, episode)
    rewards_all_episodes.append(rewards_current_episode)
print("Training finished")
     

Training finished


In [11]:
# Inspect the learned action values (rows index states, columns index actions).
q_table

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 8.78615276,  9.84577592,  8.83345556,  9.84577693, 10.85663445,
         0.84577733],
       [12.87767232, 13.89523652, 12.88137326, 13.89527757, 14.91018981,
         4.89526765],
       ...,
       [13.71145349, 15.92611592, 12.42125307, 13.68895017,  5.48297641,
         5.72298373],
       [ 2.50284197, 11.063568  , 10.2712063 , 11.86836647, -1.7264825 ,
        -6.71467865],
       [17.93834601, 16.93637876, 17.95572818, 18.98      ,  8.96023859,
         8.96078818]])

In [12]:
# Mean episode reward for each consecutive block of 1000 training episodes.
episodes_per_block = 1000
all_rewards = np.array(rewards_all_episodes)

# Slice into fixed-size blocks. Unlike np.split with a section count, this
# does not raise when the number of episodes is not a multiple of the block
# size; the final block is simply shorter.
rewards_per_thousand_episodes = [
    all_rewards[start:start + episodes_per_block]
    for start in range(0, len(all_rewards), episodes_per_block)
]

print("Average per thousand episodes")
count = 0
for r in rewards_per_thousand_episodes:
    # Label each block with the number of episodes seen so far.
    count += len(r)
    # r.mean() divides by the actual block length and avoids the float noise
    # of summing per-element quotients in Python.
    print(count, ":", r.mean())


Average per thousand episodes
1000 : -221.26699999999968
2000 : -8.821999999999994
3000 : 2.8609999999999913
4000 : 5.861999999999972
5000 : 6.735999999999974
6000 : 7.073999999999968
7000 : 7.451999999999958
8000 : 7.325999999999959
9000 : 7.3759999999999675
10000 : 7.501999999999967
11000 : 7.54199999999996
12000 : 7.2239999999999585
13000 : 7.371999999999964
14000 : 7.530999999999979
15000 : 7.391999999999966
16000 : 7.442999999999971
17000 : 7.382999999999967
18000 : 7.323999999999966
19000 : 7.574999999999962
20000 : 7.390999999999972


In [None]:
# Watch the learned table play five episodes, always taking the highest-valued action.
for episode in range(5):
    state = env.reset()
    done = False
    print("Episode is:" + str(episode))
    time.sleep(1)

    for step in range(max_steps_per_episode):
        # Redraw the current frame in place and pause so it can be seen.
        clear_output(wait=True)
        env.render()
        time.sleep(0.4)

        action = np.argmax(q_table[state])
        new_state, reward, done, info = env.step(action)

        if not done:
            state = new_state
            continue

        # Episode over: show the final frame and report the outcome.
        # A final reward of 20 is treated as success; anything else as failure.
        clear_output(wait=True)
        env.render()
        print("Reached the Goal State" if reward == 20 else "Failed")
        time.sleep(2)
        clear_output(wait=True)
        break
env.close()

Episode is:4
