# Q-Learning based RL agent

Requirements:
>Numpy

We're going to create two different environments for our QL agent 
> A pickup and delivery env 

> A maze env


In [23]:
import numpy as np
from random import randint, random

# The Pickup and Delivery environment

#### Actions:
0 : move up

1 : move down

2 : move left

3 : move right

4 : pick up

5 : deliver

reward for legal moves : -1

reward for trying to go off the boundary : -30

reward for trying to pickup a thing when it has already been picked up or if it was not in place : -10

reward for picking the item correctly : 50

reward for deliverying correctly : 50

reward for trying to deliver at the wrong place : -30

In [24]:
class PDE:
    
    def __init__(self,size,pickup,delivery,agent_position):
        self.size = size
        self.pickup = pickup
        self.delivery = delivery
        self.position = agent_position
        
        self.item_picked_up = False 
        
    def number_of_states_possible(self):
        return self.size**4 * 2
    
    def state(self):
        states = self.position[0]* self.size ** 3 * 2
        states += self.position[1]* self.size ** 2 * 2
        states += self.pickup[0]* self.size** 1 * 2
        states += self.pickup[1]* 2

        if self.item_picked_up:
            states += 1
        return states
        
    
    def act(self,action):
        (x,y) = self.position
        
        if action == 0:  # go up
            if y == 0 : 
                return -30 , False 
            else :
                self.position = (x,y-1)
                return -1, False
            
        elif action == 1:  # go down
            if y == self.size -1 :
                return -30 , False 
            else :
                self.position = (x,y+1)
                return -1 , False
            
        elif action == 2:  # go left
            if x == 0:
                return -30 , False 
            else : 
                self.position = (x-1,y)
                return -1, False 
             
        elif action == 3:  # go right
            if x == self.size -1:
                return -30, False 
            else :
                self.position = (x+1,y)
                return -1 , False 
            
        elif action == 4 :  # pickup item
            if self.item_picked_up:
                return -10 , False 
            elif (x,y) != self.pickup:
                return -10, False 
            else:
                self.item_picked_up = True
                return 50, False 
            
        else : # deliver item
            if self.item_picked_up and self.position == self.delivery:
                return 50 , True
            elif self.item_picked_up and self.position != self.delivery:
                self.pickup = (x,y)
                self.item_picked_up = False
                return -30 , False 
            else:
                return -30, False 
            
            
    
#     def naive_sloution_to_test_actions(self,):
            

In [25]:
env = PDE(10,(0,0),(9,9),(9,0))

# There can be different states in the env
What are the states variables here?? 
> the start , the end , item_picked_up , the agent position

In [26]:
print(env.state())

18000


In [27]:
env.number_of_states_possible()

20000

#### Algorithm
- Initialise the **Q-table** to all zeros
- Iterate
    - Agent is in state **state**.
    - With probability **epsilon** choose to **explore**, else **exploit**.
        - If **explore**, then choose a *random* **action**.
        - If **exploit**, then choose the *best* **action** based on the current **Q-table**.
    - Update the **Q-table** from the new **reward** to the previous state.
    - Q[**state, action**] = (1 – **alpha**) * Q[**state, action**] + **alpha** * (**reward + gamma** * max(Q[**new_state**]) — Q[**state, action**])

In [28]:
def init_q_table(ns, na):
    return np.zeros((ns,na))

In [30]:
class QLearn(PDE):
    
    def __init__(self, ns , na ):
        self.ns = ns
        self.na = na
        self.qtable = np.zeros((self.ns,self.na))
        
    
    def update_q_table(self,lr , dr , max_of_new_state , reward , state , action , qtable ): 
        self.qtable[state,action] = (1-lr) * self.qtable[state,action] + lr * (reward + dr * max_of_new_state - self.qtable[state,action] )
    
    
    def fit(self, state , action,epsilon = 0.1 ,gamma = 0.6 , alpha = 0.1 ,verbose = False  ):
        
        done = False 
        while not done:
            if random() < epsilon:
                action = randint(0,5)
            else:
                action = np.argmax(qtable[state])
            
            
    
    
    def save_q_table(self):
        pass
    
    def use_q_tablq(self):
        pass
    
            
        

# Rough implementation of qlearning


In [31]:
env = PDE(5,(0,0),(4,5),(3,0))
done = False 

na = 6
ns = env.number_of_states_possible()
q_table = np.zeros((ns,na))

epsilon = 0.1
gamma = 0.1
alpha = 0.9

steps = 0 
for i in range(1):
    env = PDE(5,(0,0),(4,5),(3,0))
    done = False 
    while not done :

        state = env.state()

        if random() < epsilon:
            action = randint(0,5)
        else:
            action = np.argmax(q_table[state])
        reward ,  done = env.act(action)
        new_state = env.state()
        q_table[state, action ] = (1-alpha) * q_table[state, action] + alpha*(reward+gamma* np.max(q_table[new_state]) - q_table[state,action])

        steps += 1

KeyboardInterrupt: 

In [None]:
print(steps)