# Q Learning with Numpy

In this code, I am using the Q Learning Algorithm to solve a Maze Problem. The following numbers are used to represent the grid world:

1. **+1** is used for the **AGENT**.
2. **+10** is used for the **GOAL**.
3. **-10** is used for the **KILLER TRAP**.
4. **-1** is used for **SMALL TRAP**.
5. **+2** is used for **SMALL REWARDS**.
6. **0** is used for **EMPTY SPACE**.

The **AGENT**, **GOAL**, **KILLER TRAP**, **SMALL TRAPS**, and **SMALL REWARDS** are all placed randomly in the grid. The main objective of this code is not to find the optimal actions, instead I just want to implement simple Q-Learning Algorithm from Scratch.

### 1. Creating the Grid World

In [325]:
import numpy as np

# Initialize Random Grid
height = np.random.randint(3,5)
width = np.random.randint(2,5)
grid = np.zeros((height,width))

# Random Robot Coordinates
robot_x,robot_y = (np.random.randint(0,height),np.random.randint(0,width))

# Rondom Goal Coordinates
while(True):
    goal_x,goal_y = (np.random.randint(0,height),np.random.randint(0,width))
    if grid[goal_x][goal_y] != 0:
        continue
    else:
        break
        
# Killer Trap Coordinates
while(True):
    killer_x,killer_y = (np.random.randint(0,height),np.random.randint(0,width))
    if grid[killer_x][killer_y] != 0:
        continue
    else:
        break
        
# Negative Reward Coordinates
total_traps = height
trap_coordinates = []
for i in range(total_traps):
    a = np.random.randint(0,height)
    b = np.random.randint(0,width)
    if grid[a][b] == 0:
        grid[a][b] = -1
    else:
        i-=1;

# Positive Reward Coordinates
total_traps = height 
trap_coordinates = []
for i in range(total_traps):
    a = np.random.randint(0,height)
    b = np.random.randint(0,width)
    if grid[a][b] == 0:
        grid[a][b] = 2
    else:
        i-=1;

grid[robot_x][robot_y] = 1
grid[goal_x][goal_y] = 10
grid[killer_x][killer_y] = -10

grid

array([[  1.,  10.],
       [  0.,   0.],
       [ -1., -10.],
       [  0.,   2.]])

### 2. Initializing the Q Table

In [326]:
import pandas as pd
indexes = []
states = []
for i in range(height):
    for j in range(width):
        indexes.append((i,j))

states = [i for i in range(height*width)]
        
df = pd.DataFrame({"States":states,"Coordinates":indexes,"UP":np.zeros(height*width),"DOWN":np.zeros(height*width),"LEFT":np.zeros(height*width),"RIGHT":np.zeros(height*width)})
df = df.set_index(df.States)
df.drop(['States'],axis = 1,inplace = True)
df

Unnamed: 0_level_0,Coordinates,UP,DOWN,LEFT,RIGHT
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,"(0, 0)",0.0,0.0,0.0,0.0
1,"(0, 1)",0.0,0.0,0.0,0.0
2,"(1, 0)",0.0,0.0,0.0,0.0
3,"(1, 1)",0.0,0.0,0.0,0.0
4,"(2, 0)",0.0,0.0,0.0,0.0
5,"(2, 1)",0.0,0.0,0.0,0.0
6,"(3, 0)",0.0,0.0,0.0,0.0
7,"(3, 1)",0.0,0.0,0.0,0.0


### 3. Defining Basic Functions

In [327]:
def get_reward(coordinates):
    return grid[coordinates]

def get_action(Q_values,epsilon):
    if np.random.rand() < epsilon:
        selected_action = np.random.randint(4)
    else:
        selected_action = np.argmax(Q_values)

    return selected_action
    
def get_state():
    return np.random.randint(0,height*width)

def take_action(coordinates,action):
    x,y = coordinates

    if action == 0:
        if x != 0:
            x-=1
    elif action == 1:
        if x != height-1:
            x+=1
    elif action == 2:
        if y != 0:
            y-=1
    else:
        if y != width-1:
            y+=1

    reward = get_reward((x,y))
    new_state = df[df.Coordinates == (x,y)].index[0]

    return reward, new_state
    

### 4. Training the Agent

In [328]:
# Hyperparameters
epsilon = 1.0
lr = 0.1
gamma = 0.99
episodes = 1000


# Training
for i in range(episodes):
    state = get_state()
    coordinates = df.iloc[state].Coordinates
    q_values = df.iloc[state].values[1:]
    current_value = grid[coordinates]
    total_reward = 0
    total_steps = 0
    while (current_value != -10 and current_value != 10 and total_reward < width*10 and total_steps < height + width):
        action = get_action(q_values,epsilon)
        reward,new_state = take_action(coordinates,action)
        current_q_value = q_values[action]
        new_q_value = current_q_value + lr*(reward + gamma * np.max(df.iloc[new_state].values[1:]) - current_q_value)
        df.iat[state,action+1] = new_q_value
        state = new_state
        total_reward += reward
        total_steps += 1

        

    epsilon -= 0.05
    

### 5. Final Q Table

In [329]:
df

Unnamed: 0_level_0,Coordinates,UP,DOWN,LEFT,RIGHT
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,"(0, 0)",14.567467,0.099,0.0,0.099
1,"(0, 1)",127.964165,-1.0,0.0,0.099
2,"(1, 0)",14.559103,0.099,0.0,0.0
3,"(1, 1)",127.964165,-1.71093,0.0,0.0
4,"(2, 0)",-0.1,0.0,-0.1,0.0
5,"(2, 1)",-0.998913,0.2,0.010977,-1.0
6,"(3, 0)",-0.19,0.0,-0.19,-1.0
7,"(3, 1)",-0.9802,24.368218,0.0,0.0


### 6. Testing the Agent

In [331]:
coordinates = (robot_x,robot_y)
initial_state = df[df.Coordinates == coordinates].index[0]
current_value = grid[coordinates]
total_reward = 0
total_steps = 0
state_action_new_state_tuples = []
while (current_value != -10 and current_value != 10 and total_reward < width*10 and total_steps < height + width):
    action = np.argmax(df.iloc[initial_state].values[1:])
    reward,new_state = take_action(coordinates,action)
    total_reward += reward
    total_steps += 1
    current_state = initial_state
    initial_state = new_state
    state_action_new_state_tuples.append((current_state,action,new_state))
    
    

if current_value == -10:
    print("The Agent fell in the trap")
elif current_value == 10 and total_steps < height + width:
    print("The Agent successfully reached the Goal")
else:
    print("The Agent couldn't reach the Goal")

print("Total Steps: ",total_steps)
print("Total Reward: ",total_reward)
print("State-Action-Reward-New State Tuples: ",state_action_new_state_tuples)



The Agent couldn't reach the Goal
Total Steps:  6
Total Reward:  6.0
State-Action-Reward-New State Tuples:  [(0, 0, 0), (0, 0, 0), (0, 0, 0), (0, 0, 0), (0, 0, 0), (0, 0, 0)]
