In [115]:
import numpy as np
import random

In [116]:
# Environment class keeping track of all actions/positions
class Field:
    def __init__(self, size, item_pickup, item_dropoff, start_position):
        self.size = size #grid size
        self.item_pickup = item_pickup #item pickup coardinate
        self.item_dropoff = item_dropoff #item dropoff coardinate
        self.position = start_position #agent position at start when init
        self.item_in_car = False

    def get_number_of_states(self):
        return self.size**4 * 2 #no. of states: 
                                    #item_pickup 10*10 locations
                                    #item_dropoff 10*10 locations 
                                    #item_in_car 2 possibilites

    def get_state(self): #assigning the state it y-axis value in q-table (20000)
        state = self.position[0] * self.size**3 * 2          #lets say player position is at (9,0) and item at (0,0) 
        state = state + self.position[1] * self.size**2 * 2  #we are taking all the possibilities and numbering the states
        state = state + self.item_pickup[0] * self.size * 2  #and assigning the state value with respect to player pos
        state = state + self.item_pickup[1] * 2

        if self.item_in_car:
            state = state + 1
        return state

    def make_action(self,action):
        (x,y)= self.position
        if action == 0:   # down
            if y == self.size -1:
                return -10, False #Reward,Reached Goal
            else: 
                self.position = (x, y+1)
                return -1, False
            
        elif action == 1: # up
            if y == 0:
                return -10, False #Reward,Reached Goal
            else: 
                self.position = (x, y-1)
                return -1, False
            
        elif action == 2: # left
            if x == 0:
                return -10, False #Reward,Reached Goal
            else: 
                self.position = (x-1, y)
                return -1, False
            
        elif action == 3: # right
            if x == self.size -1:
                return -10, False #Reward,Reached Goal
            else: 
                self.position = (x+1, y)
                return -1, False
            
        elif action == 4: # pickup
            if self.item_in_car: #item already in car
                return -10, False
            elif self.item_pickup != (x,y): #item not at current position
                return -10, False
            else:
                self.item_in_car = True #item at playerpos and item not in car
                return +20, False
            
        elif action == 5: # dropoff
            if not self.item_in_car: #item not in car and try to dropoff
                return -10, False
            elif self.item_dropoff != (x,y): #item in car but drop off wrong location
                self.item_pickup = (x,y)
                self.item_in_car = False
                return -10, False
            else: #item in  car and correct dropoff location
                self.item_in_car = False
                return 20, True

In [117]:
# Q-Learning Algo
size = 10
item_pickup = (0,0)
item_dropoff = (9,9)
start_position = (9,0)

field = Field(size=size,item_pickup=item_pickup,item_dropoff=item_dropoff,start_position=start_position)

number_of_states = field.get_number_of_states()
number_of_actions = 6 #0-5

q_table = np.zeros((number_of_states,number_of_actions))

#Hyperparameters, dont worry about them now
epsilon = 0.1
alpha = 0.1
gamma = 0.6

In [139]:
def QL_solution():
    field = Field(size=size,item_pickup=item_pickup,item_dropoff=item_dropoff,start_position=start_position)
    
    epsilon = 0.1
    alpha = 0.1
    gamma = 0.6
    
    done = False
    steps = 0
    
    while not done:
        state = field.get_state()
        if random.uniform(0,1) < epsilon:
            action = random.randint(0,5) #Explore
        else:
            action = np.argmax(q_table[state]) #Exploit
        # print (action) #Check action made
        reward, done = field.make_action(action)
        
        new_state = field.get_state()
        new_state_max = np.max(q_table[new_state])
        
        q_table[state, action] = (1-alpha)*q_table[state, action]+alpha*(reward+gamma*new_state_max - q_table[state, action])
                        
        steps = steps +1
        
    return steps

In [154]:
run= [QL_solution() for _ in range(100000)]
sum(run)/len(run)


34.44567

In [173]:
QL_solution()

29