### This notebook is derived from the article: http://amunategui.github.io/reinforcement-learning/

### Load Libs

In [114]:
import numpy as np
import pylab as plt
import random

### Implement a simple Q Learner

In [115]:
# Edges of the state graph: each (a, b) pair links state a and state b.
# The objective is to travel from any state to the goal state, here state 5.

points_list = [
    (0, 4),
    (4, 3),
    (2, 3),
    (1, 3),
    (4, 5),
    (1, 5),
]

In [116]:
## Rewards Matrix

# R[s, a] is the immediate reward for taking action a while in state s, where an
# "action" is simply the index of the state we move to. With 6 states there are
# therefore 6 candidate actions per state; e.g. from state 3 we can move to
# state 1, 2 or 4.
#    -1 -> the two states are not connected (move not allowed)
#     0 -> allowed move that does not land on the goal
#   100 -> allowed move that lands on the goal
MATRIX_SIZE = 6
goal = 5

# Start with every move forbidden, then open up the edges listed in points_list.
R = np.matrix(np.full((MATRIX_SIZE, MATRIX_SIZE), -1.0))

for src, dst in points_list:
    # Every edge can be walked in both directions. Whichever direction ends on
    # the goal earns 100, regardless of the order the pair was written in.
    R[src, dst] = 100 if dst == goal else 0
    R[dst, src] = 100 if src == goal else 0

# Remaining at the goal is also an allowed, rewarded move.
R[goal, goal] = 100

In [117]:
## Q matrix: our "brain"
# Q has one row per state and one column per action and starts out all zeros.
# As the learner visits (state, action) pairs, the update rule fills in values
# that are later used to decide which move to make.

Q = np.matrix(np.zeros((MATRIX_SIZE,) * 2))

### Let's look at the rewards matrix and the Q table

In [118]:
# Emit both matrices with a single call; the "\n " item between them is the
# separator line that sits between the two reports.
print(
    "The rewards Matrix is \n \n {}".format(R),
    "\n ",
    "The Q table Matrix is \n \n {}".format(Q),
    sep="\n",
)


The rewards Matrix is 
 
 [[ -1.  -1.  -1.  -1.   0.  -1.]
 [ -1.  -1.  -1.   0.  -1. 100.]
 [ -1.  -1.  -1.   0.  -1.  -1.]
 [ -1.   0.   0.  -1.   0.  -1.]
 [  0.  -1.  -1.   0.  -1. 100.]
 [ -1.   0.  -1.  -1.   0. 100.]]

 
The Q table Matrix is 
 
 [[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


In [119]:
from IPython.display import Image
# NOTE(review): HTML is not used anywhere in the visible cells.
from IPython.core.display import HTML 
# Display the reference image (fetched from a remote URL) inline; it must stay
# the last expression in the cell so the notebook renders it.
Image(url= "http://mnemstudio.org/ai/path/images/map3a.gif")

In [120]:
# Walk through a single Q-learning step by hand before automating it in a loop.
print("1) Please refer to the image above for better clarity \n")

# State the hand-worked example starts from
current_state = 1

# Work out which moves are possible from a given state.
# The rewards matrix tells us: any entry >= 0 is a move we may take, -1 is not.

def available_actions(state, rewards=None):
    """Return the actions (next-state indices) that can be taken from ``state``.

    Parameters
    ----------
    state : int
        Row index of the current state in the rewards matrix.
    rewards : array-like, optional
        2-D rewards matrix to consult (``np.matrix``, ``np.ndarray`` or nested
        lists). When omitted, the notebook-level ``R`` is used, looked up at
        call time.

    Returns
    -------
    numpy.ndarray
        1-D integer array of the column indices whose reward is >= 0.
    """
    if rewards is None:
        rewards = R
    # np.asarray turns the row into a plain 1-D array whether the input is an
    # np.matrix (whose rows stay 2-D) or a regular ndarray.
    rewards_row = np.asarray(rewards)[state]
    return np.flatnonzero(rewards_row >= 0)
    
# Moves possible from the starting state (columns of R with a reward >= 0)
actions = available_actions(current_state)
print("2) As we can see, the available actions from state {} is {} \n".format(current_state,actions))

# Suppose we simply pick one of the available actions at random.

def choose_random_action(actions):
    """Return one element of ``actions``, selected with ``random.choice``."""
    return random.choice(actions)

# Pick the move for the hand-worked example
action = choose_random_action(actions)
print("3) The action we have chosen to take is to go to {}, from state {} \n".format(action,current_state))

# We now know our current state, the action we are taking,
# and the possible next states and actions available to us.

# gamma scales the best Q value of the next state inside Q_update; it is the
# weight given to future reward relative to the immediate reward.
gamma = 0.8

print("4) Here we have pause a little and introduce a hyper parameter called Gamma")
print("   Gamma is a paramter that is used to control if the model needs to focus on immediate reward or future rewards \n")

# Now we develop our Q update function

print("5) The Equation for our Q learner is as follows")
print("   Q[state,action] = R[state,action] + gamma * Max(Q[next state,all_actions])")

def Q_update(state, action, gamma, q_table=None, rewards=None):
    """Apply one Q-learning update for the pair (``state``, ``action``).

    Implements ``Q[state, action] = R[state, action] + gamma * max(Q[action, :])``.
    An action is the index of the state it leads to, so row ``action`` of the
    Q table holds the values available from the next state.

    Parameters
    ----------
    state : int
        Current state (row of the Q table that is written).
    action : int
        Chosen action, which is also the index of the next state.
    gamma : float
        Weight applied to the best Q value of the next state.
    q_table : 2-D numpy matrix/array, optional
        Table to update in place. Defaults to the notebook-level ``Q``,
        looked up at call time.
    rewards : 2-D numpy matrix/array, optional
        Rewards matrix. Defaults to the notebook-level ``R``.

    Returns
    -------
    The same ``q_table`` object, after the in-place update.
    """
    if q_table is None:
        q_table = Q
    if rewards is None:
        rewards = R
    # Best value reachable from the state we land in after taking `action`.
    best_next_value = np.max(q_table[action])
    q_table[state, action] = rewards[state, action] + gamma * best_next_value
    return q_table

# Apply one update for the (state, action) pair chosen above
Q = Q_update(current_state,action,gamma)



1) Please refer to the image above for better clarity 

2) As we can see, the available actions from state 1 is [3 5] 

3) The action we have chosen to take is to go to 3, from state 1 

4) Here we have pause a little and introduce a hyper parameter called Gamma
   Gamma is a paramter that is used to control if the model needs to focus on immediate reward or future rewards 

5) The Equation for our Q learner is as follows
   Q[state,action] = R[state,action] + gamma * Max(Q[next state,all_actions])


### Now we run a Monte Carlo simulation of the Markov Decision Process so that the Q matrix (our brain) converges

In [162]:
# Number of random (state, action) updates applied to the Q matrix.
N_TRAINING_STEPS = 100

for training_step in range(N_TRAINING_STEPS):
    # Step 1: start from a random state (derived from MATRIX_SIZE so the loop
    # stays correct if the number of states changes).
    current_state = random.randrange(MATRIX_SIZE)
    actions = available_actions(current_state)  # Step 2: moves possible from there
    action = choose_random_action(actions)  # Step 3: pick one at random
    Q = Q_update(current_state, action, gamma)  # Step 4: update the Q matrix

# Rescale the Q matrix so its largest entry is 100, which makes it easier to read.
# Skip the rescale while Q is still all zeros: dividing by a zero maximum would
# fill the matrix with NaN and break the path search that follows.
q_max = np.max(Q)
if q_max > 0:
    Q = Q / q_max * 100

### Let's take a look at the image again

In [163]:
# NOTE(review): these two imports repeat the ones in an earlier cell, and HTML
# is not used anywhere in the visible cells.
from IPython.display import Image
from IPython.core.display import HTML 
# Display the same reference image again, next to the path computed below.
Image(url= "http://mnemstudio.org/ai/path/images/map3a.gif")

In [166]:
def greedy_path(q_table, start, goal_state, max_steps=100):
    """Follow the highest-valued action from ``start`` until ``goal_state``.

    Parameters
    ----------
    q_table : 2-D numpy matrix/array
        Learned Q values; row = state, column = action (= next state).
    start : int
        State the walk begins in.
    goal_state : int
        State at which the walk stops.
    max_steps : int, optional
        Upper bound on the number of moves, so that an untrained or
        inconsistent Q table cannot make the walk loop forever.

    Returns
    -------
    list of int
        The visited states, beginning with ``start`` and ending with
        ``goal_state``.

    Raises
    ------
    RuntimeError
        If the goal is not reached within ``max_steps`` moves.
    """
    path = [start]
    state = start
    while state != goal_state:
        if len(path) > max_steps:
            raise RuntimeError(
                "Goal {} not reached from state {} within {} steps; "
                "the Q matrix may need more training".format(goal_state, start, max_steps)
            )
        q_row = np.asarray(q_table)[state]
        best_actions = np.flatnonzero(q_row == np.max(q_row))
        # Break ties randomly, and record the action actually taken so the
        # reported path always matches the states that were visited.
        if best_actions.size > 1:
            state = int(random.choice(best_actions))
        else:
            state = int(best_actions[0])
        path.append(state)
    return path


current_state = 4
step = greedy_path(Q, current_state, goal)
current_state = step[-1]  # the walk ends on the goal state

print("The most efficient path from state {} is {}".format(step[0],step))

The most efficient path from state 4 is [4, 5]
