In [1]:
!pip install numpy
!pip install gym



In [2]:
import numpy as np
import gym
import random

In [3]:
# Create the Taxi-v3 environment and render its current state once as a visual check
env = gym.make("Taxi-v3")
env.render()

+---------+
|R: | : :[34;1mG[0m|
| : |[43m [0m: : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+



In [4]:
# Read the number of discrete states and actions from the environment ...
state_space = env.observation_space.n
action_space = env.action_space.n

# ... and report both sizes
print("There are ", state_space, " possible states")
print("There are ", action_space, " possible actions")

There are  500  possible states
There are  6  possible actions


In [5]:
# Q-table: one row per state and one column per action, all entries starting at zero
Q = np.zeros(shape=(state_space, action_space))

# Show the table followed by its shape
print(Q, Q.shape, sep="\n")

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]
(500, 6)


In [6]:
# Hyperparameters for training and evaluation

# Episode budget
total_episodes = 25_000       # number of training episodes
total_test_episodes = 100     # number of evaluation episodes
max_steps = 100               # step cap for a single episode

# Q-learning update
learning_rate = 0.01          # step size applied to each Q update
gamma = 0.96                  # discount factor for future rewards

# Epsilon-greedy exploration schedule
max_epsilon = 1.0             # exploration probability at the start of training
min_epsilon = 0.01            # floor for the exploration probability
decay_rate = 0.01             # rate of the exponential decay applied per episode
epsilon = max_epsilon         # current exploration rate; the training loop recomputes it each episode

In [7]:
# Defining the epsilon greedy policy

def epsilon_greedy_policy(Q, state):
  """Choose an action for `state` using an epsilon-greedy rule.

  A single uniform draw from [0, 1] is compared with the module-level
  `epsilon`: when the draw is greater, the action with the largest value in
  `Q[state]` is returned (exploitation); otherwise an action is sampled from
  `env.action_space` (exploration).

  Args:
    Q: table indexed as Q[state], giving one value per action.
    state: index of the current state.

  Returns:
    The index of the selected action.
  """
  # Exploit: the draw exceeded epsilon, so take the best-known action
  if random.uniform(0,1) > epsilon:
    return np.argmax(Q[state])

  # Explore: fall back to a randomly sampled action
  return env.action_space.sample()

In [8]:
for episode in range(total_episodes):
    # Begin each episode from a freshly reset environment
    state = env.reset()
    step, done = 0, False

    # Decay exploration exponentially from max_epsilon toward min_epsilon
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)

    for step in range(max_steps):
        # Select an action with the epsilon-greedy policy
        action = epsilon_greedy_policy(Q, state)

        # Apply the action and observe the next state (s') and reward (r)
        new_state, reward, done, info = env.step(action)

        # Q(s,a) := Q(s,a) + lr * [r + gamma * max_a' Q(s',a') - Q(s,a)]
        Q[state, action] += learning_rate * (
            reward + gamma * np.max(Q[new_state]) - Q[state, action]
        )

        # Stop stepping once the environment reports the episode is over
        if done:
            break

        # Continue from the state we just reached
        state = new_state

In [9]:
# Show the Q-table as it stands after training
print("New Q-values:", Q, sep="\n")

New Q-values:
[[ 0.          0.          0.          0.          0.          0.        ]
 [-3.15018243 -3.05554835 -3.27905442 -3.15342186  6.15923344 -3.36818209]
 [-1.63158704 -1.59321463 -1.83944014 -1.52546157 11.69161239 -1.84207491]
 ...
 [-0.78739301 -0.57387634 -0.78749045 -0.78009606 -0.7979232  -0.79743124]
 [-2.19750963 -2.19469207 -2.19763657 -1.99813212 -2.28713268 -2.28458102]
 [-0.019996   -0.0202819  -0.019996    1.49465593 -0.1        -0.1       ]]


In [12]:
# Evaluate the learned Q-table by acting greedily (no exploration) for
# total_test_episodes episodes and averaging the per-episode return.
rewards = []

for episode in range(total_test_episodes):
    state = env.reset()
    total_rewards = 0
    print("****************************************************")
    print("EPISODE ", episode)
    for step in range(max_steps):
        # NOTE: rendering every step of every episode produces very long output
        env.render()
        # Take the action (index) that has the maximum expected future reward given that state
        action = np.argmax(Q[state])
        new_state, reward, done, info = env.step(action)
        total_rewards += reward

        if done:
            break
        state = new_state

    # Record the return of EVERY episode, including those cut off at max_steps.
    # Appending only on `done` dropped unfinished episodes from the sum while the
    # average below still divided by total_test_episodes, skewing the score.
    rewards.append(total_rewards)

env.close()
print ("Score over time: " +  str(sum(rewards)/total_test_episodes))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
|[34;1mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m:[43m [0m|
+---------+
  (East)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m:[43m [0m|
+---------+
  (East)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m:[43m [0m|
+---------+
  (East)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m:[43m [0m|
+---------+
  (East)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m:[43m [0m|
+---------+
  (East)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m:[43m [0m|
+---------+
  (East)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m:[43m [0m|
+---------+
  (East)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m:[43

In [None]:
# ###### FOR WATCHING THE AGENT PLAY ######

# import time
# from IPython.display import clear_output
# rewards = []

# for episode in range(total_test_episodes):
#     state = env.reset()
#     step = 0
#     done = False
#     total_rewards = 0
#     print("****************************************************")
#     print("EPISODE ", episode)
#     time.sleep(0.5)
#     for step in range(max_steps):
#         clear_output(wait=True)
#         env.render()     
#         time.sleep(0.5)
#         # Take the action (index) that has the maximum expected future reward given that state
#         action = np.argmax(Q[state][:])
#         new_state, reward, done, info = env.step(action)
#         total_rewards += reward
        
#         if done:
#             clear_output(wait=True)
#             env.render()
#             if(reward==20):
#                 print("*****You successfully dropped off the passenger!*****")
#                 time.sleep(0.5)
#             else:
#                 print("*****You're not a good driver!*****")
#                 time.sleep(0.5)
#             rewards.append(total_rewards)
#             clear_output(wait=True)
#             print ("Score", total_rewards)
#             break
#         state = new_state

# env.close()
# print ("Score over time: " +  str(sum(rewards)/total_test_episodes))

+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[42mY[0m| : |B: |
+---------+
  (Pickup)
