In [1]:
import numpy as np
import gym
import random

In [2]:
# Create the Taxi environment; every later cell talks to it through `env`.
env = gym.make("Taxi-v2")
# Draw the environment's current grid as text (see the cell output).
env.render()

+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y|[43m [0m: |B: |
+---------+



In [5]:
# Both spaces expose their number of discrete values through `.n`;
# these two counts become the dimensions of the Q-table built below.
state_size = env.observation_space.n
action_size = env.action_space.n

In [6]:
# Report the sizes of the action and state spaces (Q-table columns and rows)
for label, size in (
    ("the action size, (number of columns) is:", action_size),
    ("the state size, (number of rows) is:", state_size),
):
    print(label, size)

the action size, (number of columns) is: 6
the state size, (number of rows) is: 500


In [7]:
# Build the Q-table: one row per state, one column per action.
# Every entry starts at zero because nothing has been learned yet.
Qtable = np.zeros(shape=(state_size, action_size))
print(Qtable)

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


In [8]:
# --- Run lengths ---
number_episodes = 50000  # training episodes
test_episodes = 100      # evaluation episodes
max_moves = 99           # cap on the number of moves per episode

# --- Q-learning update ---
learning_rate = 0.7  # step size of each Q-value update
gamma = 0.618        # discount factor: the higher it is, the more future rewards count

# --- Exploration vs. exploitation ---
max_epsilon = 1.0      # exploration probability at the start of training
min_epsilon = 0.01     # floor that the exploration probability decays towards
decay_rate = 0.01      # exponential decay rate applied to the exploration probability
epsilon = max_epsilon  # start fully exploratory: nothing is known at the beginning

In [9]:
#Let's do some Q learning!
# Tabular Q-learning with an epsilon-greedy behaviour policy.
# Reads the hyperparameters and `env` defined in earlier cells and
# updates `Qtable` in place.

for episode in range(number_episodes):
    state = env.reset()

    # Decay the exploration rate once per episode, before any move is taken.
    # The schedule depends only on the episode index, so computing it here
    # makes every step of an episode (including the first) use the same
    # epsilon. Episode 0 evaluates to max_epsilon.
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)

    for step in range(max_moves):
        eVe = random.uniform(0,1) #exploit vs explore

        if eVe > epsilon: #exploit: best known action for this state
            action = np.argmax(Qtable[state,:])
        else: #explore: random action
            action = env.action_space.sample()

        new_state, reward, done, info = env.step(action) #set values based on action taken

        # Q(s,a) <- Q(s,a) + lr * (r + gamma * max_a' Q(s',a') - Q(s,a))
        td_target = reward + gamma*np.max(Qtable[new_state,:])
        Qtable[state,action] += learning_rate*(td_target - Qtable[state,action])
        state = new_state

        if done:
            break

In [10]:
#Let's play the game!
# Evaluate the learned policy: always take the greedy action from `Qtable`,
# with no exploration and no further learning.

rewards = []

for episode in range(test_episodes):
    state = env.reset()
    total_rewards = 0
    print("************************")
    print("Episode:", episode)

    for step in range(max_moves):
        action = np.argmax(Qtable[state,:])
        new_state, reward, done, info = env.step(action)
        total_rewards += reward

        if done:
            break

        state = new_state

    # Record every episode, including those that ran out of moves without
    # finishing. The average below divides by test_episodes, so skipping
    # unfinished episodes would silently count them as a score of 0.
    rewards.append(total_rewards)
    print("Score", total_rewards)

env.close()
print("************************")
print("Score over time: " + str(sum(rewards)/test_episodes))

************************
Episode: 0
Score 7
************************
Episode: 1
Score 13
************************
Episode: 2
Score 7
************************
Episode: 3
Score 10
************************
Episode: 4
Score 12
************************
Episode: 5
Score 12
************************
Episode: 6
Score 9
************************
Episode: 7
Score 9
************************
Episode: 8
Score 8
************************
Episode: 9
Score 11
************************
Episode: 10
Score 7
************************
Episode: 11
Score 7
************************
Episode: 12
Score 7
************************
Episode: 13
Score 4
************************
Episode: 14
Score 12
************************
Episode: 15
Score 9
************************
Episode: 16
Score 11
************************
Episode: 17
Score 8
************************
Episode: 18
Score 8
************************
Episode: 19
Score 8
************************
Episode: 20
Score 8
************************
Episode: 21
Score 6
*************