In [1]:
import gym
# Importing libraries
import numpy as np
import random
import math
from collections import deque
import collections
import pickle

#for text processing
import spacy
import re
import pandas as pd
env = gym.make("Taxi-v2").env


#### There are 4 locations (labeled by different letters), and our job is to pick up the passenger at one location and drop him off at another. We receive +20 points for a successful drop-off and lose 1 point for every time-step it takes. There is also a 10 point penalty for illegal pick-up and drop-off actions."

### Fetching Origing, Destination, and Time of Pickup from the sms data 

In [2]:
def fetch_pickup_drop(sms):
    
    #Write your code here
    location_df = pd.read_csv('city.csv')
    cities = list(location_df['location'])
    sms_cities = []
    origin =[]
    dest = []
    time_of_pickup = []
    
    for city in cities:
        if city in sms:
            sms_cities.append(city)
            
    from_match = re.findall(r"from\s\w+\s\w+\s\w+" , sms)
    from_match = list(from_match)
    
    to_match = re.findall(r"to\s\w+\s\w+\s\w+" , sms)
    to_match = list(to_match)
    
    for city in sms_cities:
        if (len(from_match)>0):
            if(city in from_match[0]):
                origin.append(city)
                sms_cities.remove(city)
                dest.append(sms_cities[0])
        else:
            if (len(to_match)>0):
                if(city in to_match[0]):
                    dest.append(city)
                    sms_cities.remove(city)
                    origin.append(sms_cities[0])
    time = re.findall(r"(\d+ PM)|(\d+ AM)",sms)
    if time[0][0] == '':
        time_of_pickup = time[0][1]
    else:
        time_of_pickup = time[0][0]
    
    return [origin, dest, time_of_pickup]
    
#f = open("sms.txt", "r")
#num_of_lines = 1000
#for line in f:
#    fetch_pickup_drop(line)
    

In [3]:
env.reset() # reset environment to a new, random state
env.render()

print("Action Space {}".format(env.action_space))
print("State Space {}".format(env.observation_space))
env

+---------+
|[35mR[0m: | : :[43mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+

Action Space Discrete(6)
State Space Discrete(500)


<gym.envs.toy_text.taxi.TaxiEnv at 0x7f2a26732940>

## Summing up the Q-Learning Process


Initialize the Q-table by all zeros.

Start exploring actions: 

For each state, select any one among all possible actions for the current state (S).

Travel to the next state (S') as a result of that action (a).

For all possible actions from the state (S') select the one with the highest Q-value.

Update Q-table values using the equation.

Set the next state as the current state.

If goal state is reached, then end and repeat the process.


## Exploiting learned values
After enough random exploration of actions, the Q-values tend to converge serving our agent as an action-value function which it can exploit to pick the most optimal action from a given state.

There's a tradeoff between exploration (choosing a random action) and exploitation (choosing actions based on already learned Q-values). We want to prevent the action from always taking the same route, and possibly overfitting, so we'll be introducing another parameter called ϵ "epsilon" to cater to this during training.

Instead of just selecting the best learned Q-value action, we'll sometimes favor exploring the action space further. Lower epsilon value results in episodes with more penalties (on average) which is obvious because we are exploring and making random decisions.

In [8]:
#Initialize Q_table
import numpy as np
#write your code here

q_table = np.zeros([env.observation_space.n, env.action_space.n])

In [12]:
%%time
"""Training the agent"""

import random
from IPython.display import clear_output

# Hyperparameters
alpha = 0.1
gamma = 0.6
epsilon = 0.1

# For plotting metrics
all_epochs = []
all_penalties = []

##Write your code here
for i in range(1, 100001):
    state = env.reset()

    epochs, penalties, reward, = 0, 0, 0
    done = False
    
    while not done:
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample() # Explore action space
        else:
            action = np.argmax(q_table[state]) # Exploit learned values

        next_state, reward, done, info = env.step(action) 
        
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])
        
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value

        if reward == -10:
            penalties += 1

        state = next_state
        epochs += 1
        
    if i % 100 == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")
np.save("./q_table.npy", q_table)

Episode: 100000
CPU times: user 44.4 s, sys: 6.45 s, total: 50.8 s
Wall time: 9min 15s


In [13]:
q_table

array([[  0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ],
       [ -2.2732518 ,  -2.12208638,  -2.27325181,  -2.1220864 ,
         -1.870144  , -11.12208572],
       [ -1.870144  ,  -1.45024002,  -1.87014399,  -1.4502401 ,
         -0.7504    , -10.45023826],
       ...,
       [ -0.93352002,   0.416     ,  -1.05223237,  -1.19427342,
         -6.05058976,  -4.64766784],
       [ -2.146069  ,  -2.1220551 ,  -2.17056679,  -2.12205477,
         -3.69883525,  -3.72068937],
       [  3.10561091,   1.76835778,   2.75819295,  11.        ,
         -2.25208763,  -2.6470922 ]])

In [14]:
#Load trained q_table for evaluation
q_table = np.load("./q_table.npy")

In [15]:
def create_loc_dict(city_df):
    loc_dict = {}
    ## Create dictionary example, loc_dict['dwarka sector 23] = 0
    for index ,row in city_df.iterrows():
        loc_dict[row['location']] = row['mapping']     
    return loc_dict

In [16]:
def check_pick_up_drop_correction(pick_up, drop, line_num):
    #write your code here
    orig_df = pd.read_csv('org_df.csv')
    original_origin = orig_df.iloc[line_num]['origin']
    original_destination = orig_df.iloc[line_num]['dest']
    if original_origin == pick_up and original_destination == drop:
        return True
    else:
        return False
    

In [18]:
"""Evaluate agent's performance after Q-learning"""

# 1) We need to take text drom "sms.txt" and fetch pickup and drop from it.
# 2) Generate the random state from an enviroment and change the pick up and drop as the fetched one
# 3) Evaluate you q_table performance on all the texts given in sms.txt.
# 4) Have a check if the fetched pickup, drop is not matching with original pickup, drop using orig.csv
# 5) If fetched pickup or/and drop does not match with the original, add penality and reward -10
# 6) Calculate the Total reward, penalities, Wrong pickup/drop predicted and Average time steps per episode.

total_epochs, total_penalties, total_reward, wrong_predictions = 0, 0, 0, 0

info = pd.DataFrame(columns=['origin','destination','time'])
count = 0
time_list = []
f = open("./sms.txt", "r")
num_of_lines = 1000
city = pd.read_csv("./city.csv")

loc_dict = create_loc_dict(city)
line_num = 0
for line in f:
    l = fetch_pickup_drop(line)
    pick_up = l[0]
    drop = l[1]
    decision = check_pick_up_drop_correction(pick_up,drop,line_num)
    if not decision:
        total_penalties += 1
        reward = -10
        total_reward += reward
        wrong_predictions += 1
    pickUP_idx = loc_dict[pick_up[0]]
    drop_idx = loc_dict[drop[0]]
    act_state = env.reset()
    taxi_row, taxi_col,pick_up ,drop  = env.decode(act_state)
    state = env.encode(taxi_row,taxi_col,int(pickUP_idx),int(drop_idx))
    epochs, penalties, reward = 0, 0, 0
    
    done = False
    
    while not done:
        action = np.argmax(q_table[state])
        state, reward, done, info = env.step(action)

        if reward == -10:
            penalties += 1

        epochs += 1

    total_penalties += penalties
    total_epochs += epochs
    total_reward += reward



print(f"Results after {num_of_lines} episodes:")
print(f"Average timesteps per episode: {total_epochs / num_of_lines}")
print(f"Average penalties per episode: {total_penalties / num_of_lines}")
print(f"Total number of wrong predictions ", wrong_predictions)
print("Total Reward is ", total_reward)

Results after 1000 episodes:
Average timesteps per episode: 13.43
Average penalties per episode: 1.03
Total number of wrong predictions  1000
Total Reward is  10000
