## Improving the Q algorithm using SARSA.

In [1]:
import gym
import numpy as np
import time, pickle, os

In [2]:
env = gym.make('Taxi-v3')

In [3]:
alpha = 0.81 # 0.618
gamma = 0.96

In [4]:
state = env.reset()

## Q-Learning

In [5]:
Q = np.zeros((env.observation_space.n, env.action_space.n))

In [6]:
total_episodes = 5000
G = 0
alpha = 0.618
max_steps = 100
gamma = 0.96

In [7]:
for episode in range(1,total_episodes+1):
    done = False
    G, reward = 0,0
    state = env.reset()
    firstState = state
    noSteps = 0
    #print("Initial State = {}".format(state))
    while done != True:
        action = np.argmax(Q[state]) 
        state2, reward, done, info = env.step(action)
        Q[state,action] += alpha * (reward + np.max(Q[state2]) - Q[state,action]) 
        G += reward
        state = state2
        noSteps = noSteps + 1
        if(noSteps >= max_steps):
          done = True 

In [8]:
Q

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [-5.86405189, -5.562     , -5.47108618, -5.562     , 11.        ,
        -6.18      ],
       [-4.07485821, -4.326     , -4.70289704, -4.326     , 15.        ,
        -6.18      ],
       ...,
       [-3.09      , 16.        , -3.09      , -3.03736046, -6.18      ,
        -6.18      ],
       [-4.944     , -4.90097874, -4.944     , -4.5768541 , -6.18      ,
        -6.18      ],
       [-1.236     , -1.236     , -1.236     ,  6.784404  , -6.18      ,
        -6.18      ]])

In [9]:
state = env.reset()
done = None

In [10]:
counter = 0
while done != True:
    # We simply take the action with the highest Q Value
    action = np.argmax(Q[state])
    state, reward, done, info = env.step(action)
    counter += 1
    env.render()

+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : : : |
| |[43m [0m: | : |
|[35mY[0m| : |B: |
+---------+
  (North)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| :[43m [0m: : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (North)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
|[43m [0m: : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (West)
+---------+
|[34;1mR[0m: | : :G|
|[43m [0m: | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (North)
+---------+
|[34;1m[43mR[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (North)
+---------+
|[42mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (Pickup)
+---------+
|R: | : :G|
|[42m_[0m: | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
|[42m_[0m: : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
|

In [11]:
print("No of steps: ", counter)
print("Total Reward: ", reward)

No of steps:  11
Total Reward:  20


## SARSA

Epsilon set to 0.05, so 5 % of the time, the agent will try to explore rather than exploit

In [12]:
def choose_action(state):
    action=0
    if np.random.uniform(0, 1) < epsilon: #If the random number is less than the epsilon value we choose to explore
        action = env.action_space.sample()
    else: # Else the agent exploits using the existing Q table
        action = np.argmax(Q[state, :])
    return action

In [13]:
def learn(state, stateNext, reward, action, actionNext):
    predict = Q[state, action]
    target = reward + gamma * Q[stateNext, actionNext]
    Q[state, action] = Q[state, action] + alpha * (target - predict)

In [14]:
total_episodes = 10000; max_steps = 100; epsilon = 0.05; alpha = 0.618; gamma = 0.9
Q = np.zeros((env.observation_space.n, env.action_space.n))

In [15]:
rewards = 0
for episode in range(total_episodes):
    counter = 0
    state = env.reset()
    action = choose_action(state)
    done = False
    #while counter < max_steps:
    while done != True:
        stateNext, reward, done, info = env.step(action)
        actionNext = choose_action(stateNext)
        learn(state, stateNext, reward, action, actionNext)
        state = stateNext
        action = actionNext

In [16]:
state = env.reset()
done = None

In [18]:
counter = 0
while done != True:
    # We simply take the action with the highest Q Value
    action = np.argmax(Q[state])
    state, reward, done, info = env.step(action)
    counter = counter + 1
    env.render()

+---------+
|[35mR[0m: | : :G|
| : | : :[43m [0m|
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :G|
| : | :[43m [0m: |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (West)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : :[43m [0m|
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (East)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | :[43m [0m|
|Y| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m:[43m [0m|
+---------+
  (South)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[34;1m[43mB[0m[0m: |
+---------+
  (West)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[42mB[0m: |
+---------+
  (Pickup)
+---------+
|[35mR[0m: | : 

In [19]:
print("No of steps: ", counter)
print("Total reward: ",reward)

No of steps:  18
Total reward:  20
