## Problem statement: Use reinforcement learning to train a taxi (the agent) to pick up and drop off a passenger autonomously.

In [1]:
# Importing Libraries
import gym
import numpy as np
import pickle, os

In [2]:
# Create the Taxi-v3 environment from OpenAI Gym.
env = gym.make("Taxi-v3")

In [3]:
# Reset the environment to begin a new episode; the returned value is the
# starting state (an integer id, displayed in the next cell).
state = env.reset()

In [4]:
state

31

In [5]:
#Visualizing the state.
env.render()

+---------+
|R:[43m [0m| : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+



<h1>Possible Actions</h1>

south/down (0), north/up (1), east/right (2), west/left (3), pick-up (4), and drop-off (5)

In [6]:
# Sizes of the discrete state and action spaces, read from the environment.
n_states, n_actions = env.observation_space.n, env.action_space.n

In [7]:
n_actions

6

In [8]:
n_states

500

In [9]:
# Move the taxi to state number 254 by overwriting the environment's internal state.
# `env.unwrapped` always resolves to the base Taxi environment, however many
# wrappers gym.make() has stacked around it; `env.env` strips only one wrapper
# and can end up setting `s` on an intermediate wrapper, which has no effect.
env.unwrapped.s = 254

In [10]:
env.render()

+---------+
|R: | : :G|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+



In [11]:
#Take action 3 -> Move west.
env.step(3)

(234, -1, False, {'prob': 1.0})

In [12]:
#From the visualization it is evident that the taxi has moved one step west
env.render()

+---------+
|R: | : :G|
| : | : : |
| :[43m [0m: : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (West)


<h1>How well does behaving completely at random do?</h1>

In [13]:
# Baseline: how well does the agent do if it only ever takes random actions?
state = env.reset() # start a fresh episode
counter = 0 # number of steps taken so far
g = 0 # running sum of the rewards received
reward = None # no step taken yet; the loop below runs until this equals 20

In [14]:
env.render()

+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : : : |
|[43m [0m| : | : |
|[35mY[0m| : |B: |
+---------+



In [15]:
# Take randomly sampled actions until a step returns the reward of 20,
# counting the steps and summing every reward along the way.
while reward != 20:
    state, reward, done, info = env.step(env.action_space.sample())
    g, counter = g + reward, counter + 1

In [16]:
print("Solved in {} Steps with a total reward of {}".format(counter,g))

Solved in 7930 Steps with a total reward of -31003


#### The total cumulative reward is a large negative number. We can improve on this using the Q-learning algorithm.

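## Let's look at just one episode and see how the Q values change after each step using the formula below

$$Q(s, a) \leftarrow Q(s, a) + \alpha \left[ r + \gamma \max_{a'} Q(s', a') - Q(s, a) \right]$$

Here $s$ is the current state, $a$ the action taken, $r$ the reward received, $s'$ the next state, $\alpha$ the learning rate and $\gamma$ the discount factor. The training cell below applies this update at every step of every episode.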

In [17]:
# Q-table initialised to zero: one row per state, one column per action.
Q = np.zeros((n_states, n_actions))

In [18]:
print("No. of rows:",Q.shape[0])
print("No. of colums:",Q.shape[1])

No. of rows: 500
No. of colums: 6


In [19]:
episodes = 1000 # number of training episodes to run
max_steps = 100 # cap on the number of steps taken within a single training episode
alpha = 0.618 # learning rate: how far each update moves a Q value toward its new target
gamma = 0.96 # discount factor applied to the best Q value of the next state
G = 0 # cumulative reward of the current episode (reset at the start of each one)

In [20]:
# Tabular Q-learning: run `episodes` training episodes, each capped at `max_steps` steps.
# Actions are always chosen greedily from the current Q-table (no random exploration).
for episode in range(1, episodes + 1):
    G = 0 # cumulative reward collected in this episode
    state = env.reset() # start every episode from a fresh state
    for _ in range(max_steps): # bounded loop replaces the manual step counter
        action = np.argmax(Q[state]) # best known action; ties go to the lowest index
        state2, reward, done, info = env.step(action)
        # Q-learning update: move Q[state, action] toward
        # reward + gamma * (best Q value of the next state).
        Q[state, action] += alpha * (reward + gamma * np.max(Q[state2]) - Q[state, action])
        G += reward
        state = state2
        if done: # the environment reported the end of the episode
            break

In [21]:
Q

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [-5.23787   , -5.53595091, -5.68506717, -5.53595091,  6.16402981,
        -6.18      ],
       [-3.74530212, -4.01807953, -3.94971586, -4.01807953, 11.69177139,
        -6.18      ],
       ...,
       [-2.94096042, -3.10822542, -2.94096042, -3.02144421, -6.18      ,
        -6.18      ],
       [-4.01807953, -3.23598551, -4.01807953, -3.93460762, -6.18      ,
        -6.18      ],
       [-1.22072304, -1.22072304, -1.22072304,  6.4788648 , -6.18      ,
        -6.18      ]])

In [22]:
# Start a fresh episode for evaluating the learned Q-table.
state = env.reset()
done = False # cleared so the rollout loop in the next cell will run

In [23]:
# Roll out the greedy policy from the learned Q-table, rendering every step.
# Expects `state` and `done` to have just been set by the reset cell above.
counter = 0 # steps taken in this rollout
total_reward = 0 # sum of every step's reward; `reward` alone is only the latest step's
while not done:
    action = np.argmax(Q[state])
    state, reward, done, info = env.step(action)
    total_reward += reward
    env.render()
    counter += 1
print("Cumulative reward over the episode:", total_reward)

+---------+
|R: | : :G|
| : | :[43m [0m: |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : |[43m [0m: |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1m[43mB[0m[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[42mB[0m: |
+---------+
  (Pickup)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : |[42m_[0m: |
|[35mY[0m| : |B: |
+---------+
  (North)
+---------+
|R: | : :G|
| : | : : |
| : : :[42m_[0m: |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (North)
+---------+
|R: | : :G|
| : | : : |
| : :[42m_[0m: : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (West)
+---------+
|R: | : :G|
| : | : : |
| :[42m_[0m: : : |
| |

In [24]:
print("No of Steps:", counter)

No of Steps: 13


In [25]:
# `reward` holds only the value returned by the last env.step() call of the
# rollout, not a sum over the episode, so it is labelled as the final-step reward.
print("Reward on final step:", reward)

Total reward: 20


### Hence there is a stark difference between acting at random and using the Q-learning algorithm. The taxi (agent) completed the trip in far fewer steps, and the cumulative reward is also positive.