In [2]:
# Environment class keeping track of all actions/positions
class Field:
    def __init__(self, size, item_pickup, item_dropoff, start_position):
        self.size = size #grid size
        self.item_pickup = item_pickup #item pickup coardinate
        self.item_dropoff = item_dropoff #item dropoff coardinate
        self.position = start_position #agent position at start when init
        self.item_in_car = False

    def get_number_of_states(self):
        return self.size**4 * 2 #no. of states: 
                                    #item_pickup 10*10 locations
                                    #item_dropoff 10*10 locations 
                                    #item_in_car 2 possibilites

    def get_state(self): #assigning the state it y-axis value in q-table (20000)
        state = self.position[0] * self.size**3 * 2          #lets say player position is at (9,0) and item at (0,0) 
        state = state + self.position[1] * self.size**2 * 2  #we are taking all the possibilities and numbering the states
        state = state + self.item_pickup[0] * self.size * 2  #and assigning the state value with respect to player pos
        state = state + self.item_pickup[1] * 2

        if self.item_in_car:
            state = state + 1
        return state

    def make_action(self,action):
        (x,y)= self.position
        if action == 0:   # down
            if y == self.size -1:
                return -10, False #Reward,Reached Goal
            else: 
                self.position = (x, y+1)
                return -1, False
            
        elif action == 1: # up
            if y == 0:
                return -10, False #Reward,Reached Goal
            else: 
                self.position = (x, y-1)
                return -1, False
            
        elif action == 2: # left
            if x == 0:
                return -10, False #Reward,Reached Goal
            else: 
                self.position = (x-1, y)
                return -1, False
            
        elif action == 3: # right
            if x == self.size -1:
                return -10, False #Reward,Reached Goal
            else: 
                self.position = (x+1, y)
                return -1, False
            
        elif action == 4: # pickup
            if self.item_in_car: #item already in car
                return -10, False
            elif self.item_pickup != (x,y): #item not at current position
                return -10, False
            else:
                self.item_in_car = True #item at playerpos and item not in car
                return +20, False
            
        elif action == 5: # dropoff
            if not self.item_in_car: #item not in car and try to dropoff
                return -10, False
            elif self.item_dropoff != (x,y): #item in car but drop off wrong location
                self.item_pickup = (x,y)
                self.item_in_car = False
                return -10, False
            else: #item in  car and correct dropoff location
                self.item_in_car = False
                return 20, True

In [41]:
size = 10
item_pickup = (0,0)
item_dropoff = (9,9)
start_position = (9,0)

field = Field(size=size,item_pickup=item_pickup,item_dropoff=item_dropoff,start_position=start_position)

In [42]:
field.position #players current position

(9, 0)

In [44]:
#Playing game Manually
for _ in range(0,9): # 9 loops
    field.make_action(2) # go left 9 times

field.make_action(4) #pickup item

for _ in range(0,9): # 9 loops
    field.make_action(0) # go down 9 times
    
for _ in range(0,9): # 9 loops
    field.make_action(3) #go right 9 times

reward,done = field.make_action(5) #drop off at correct position

reward, done, field.item_in_car #item no longer in car


(20, True, False)

In [2]:
import random

In [45]:
def random_solution():
    
    field = Field(size=size,item_pickup=item_pickup,item_dropoff=item_dropoff,start_position=start_position)
    done = False
    steps = 0

    while not done:
        action = random.randint(0,5)
        reward,done = field.make_action(action)
        steps = steps + 1

    return steps

In [49]:
random_solution()

119248

In [46]:
run = [random_solution() for _ in range(100)]

In [47]:
sum(run)/len(run) #average runs

157643.74

In [64]:
import numpy as np

In [65]:
# Q-Learning Algo
size = 10
item_pickup = (0,0)
item_dropoff = (9,9)
start_position = (9,0)

field = Field(size=size,item_pickup=item_pickup,item_dropoff=item_dropoff,start_position=start_position)

number_of_states = field.get_number_of_states()
number_of_actions = 6 #0-5

q_table = np.zeros((number_of_states,number_of_actions))

#Hyperparameters, dont worry about them now
epsilon = 0.1
alpha = 0.1
gamma = 0.6

for _ in range(10000): #agent plays game 10000 times and updates q-table (Training Phase)
    field = Field(size=size,item_pickup=item_pickup,item_dropoff=item_dropoff,start_position=start_position)
    done = False

    while not done:
        state = field.get_state()
        if random.uniform(0,1) < epsilon:
            action = random.randint(0,5) #Exploration
        else:
            action = np.argmax(q_table[state]) #Exploitation

        reward, done = field.make_action(action)
        new_state = field.get_state()
        new_state_max = np.max(q_table[new_state])

        q_table[state,action] = (1-alpha)*q_table[state,action] + alpha*(reward + gamma*new_state_max - q_table[state,action])





In [66]:
q_table , q_table.shape

(array([[ 0.23071429, -2.06428571, -2.06428571,  0.23071429,  9.78571429,
         -2.06428571],
        [-0.71428571, -5.21428571, -5.21428571, -0.71428571, -5.21428571,
         -2.06428571],
        [ 1.13750493, -1.        , -1.        , -0.1       , -1.        ,
         -1.        ],
        ...,
        [-1.91530129,  0.11774602, -0.186     , -1.18383244, -1.46204729,
         10.43572895],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ]]),
 (20000, 6))

In [90]:
def reinforcement_solution():
    field = Field(size=size,item_pickup=item_pickup,item_dropoff=item_dropoff,start_position=start_position)
    
    epsilon = 0.1
    alpha = 0.1
    gamma = 0.6
    
    done = False
    steps = 0
    
    while not done:
        state = field.get_state()
        if random.uniform(0,1) < epsilon:
            action = random.randint(0,5) #Explore
        else:
            action = np.argmax(q_table[state]) #Exploit
            
        reward, done = field.make_action(action)
        
        new_state = field.get_state()
        new_state_max = np.max(q_table[new_state])
        
        q_table[state, action] = (1-alpha)*q_table[state, action]+alpha*(reward+gamma*new_state_max - q_table[state, action])
                        
        steps = steps +1
        
    return steps

In [3]:
random.uniform(0,1)

0.2269799216496774

In [8]:
ep = 1 #initially epsilon is 1 so any valu would be less than 1 so mainly exploration will occur
explore = 0
exploit = 0

for index,i in enumerate(range(0,100)):
    if random.uniform(0,1) < ep:
        print(index,"-->Exploring") #
        # explore = explore + 1
    else:
        print(index,"-->Exploiting") #
        # exploit = exploit + 1

    ep = ep - 0.01
    # print(index, "Explore:", explore, "Exploit:", exploit) 

0 -->Exploring
1 -->Exploring
2 -->Exploring
3 -->Exploring
4 -->Exploring
5 -->Exploring
6 -->Exploring
7 -->Exploring
8 -->Exploring
9 -->Exploring
10 -->Exploring
11 -->Exploring
12 -->Exploring
13 -->Exploring
14 -->Exploring
15 -->Exploring
16 -->Exploring
17 -->Exploring
18 -->Exploiting
19 -->Exploring
20 -->Exploring
21 -->Exploring
22 -->Exploring
23 -->Exploring
24 -->Exploiting
25 -->Exploring
26 -->Exploring
27 -->Exploiting
28 -->Exploring
29 -->Exploiting
30 -->Exploring
31 -->Exploring
32 -->Exploring
33 -->Exploiting
34 -->Exploring
35 -->Exploring
36 -->Exploring
37 -->Exploring
38 -->Exploiting
39 -->Exploring
40 -->Exploring
41 -->Exploring
42 -->Exploiting
43 -->Exploring
44 -->Exploiting
45 -->Exploiting
46 -->Exploiting
47 -->Exploiting
48 -->Exploiting
49 -->Exploiting
50 -->Exploring
51 -->Exploring
52 -->Exploiting
53 -->Exploiting
54 -->Exploring
55 -->Exploiting
56 -->Exploiting
57 -->Exploiting
58 -->Exploiting
59 -->Exploring
60 -->Exploiting
61 -->Exploiti

In [107]:
reinforcement_solution()

32

In [108]:
# lets run it for another 100 time
run = [reinforcement_solution() for _ in range(10000)]

In [111]:
sum(run)/len(run)

44.7127

In [35]:
q_table

array([[ 0.23071429, -2.06428571, -2.06428571,  0.23071429,  9.78571429,
        -2.06428571],
       [-0.71428571, -5.21428571, -5.21428571, -0.71428571, -5.21428571,
        -2.06428571],
       [ 2.4348785 , -1.        , -1.        , -0.1       , -1.        ,
        -1.        ],
       ...,
       [-1.63093941,  0.20637121, -0.25135865, -1.80124927, -1.83279121,
        10.4707118 ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ]])