# Machine Learning - Reinforcement Learning

# Q-Learning Algorithm

In [1]:
import numpy as np
import pandas as pd
import random

In [2]:
# Reward matrix: row = current room (state), column = room moved into (action).
# -1 marks a move that is not allowed (get_action skips these entries),
# 0 is an allowed move with no reward, and 100 is the allowed move from room 1 into room 5.
rewards = np.array([
    [-1, -1, -1, -1, 0, -1],
    [-1, -1, -1, 0, -1, 100],
    [-1, -1, -1, 0, -1, -1],
    [-1, 0, 0, -1, 0, -1],
    [0, -1, -1, 0, -1, -1],
    [-1, 0, -1, -1, -1, -1],
])

In [3]:
# Wrap the reward matrix in a DataFrame so the notebook renders it as a labelled table.
df_rewards = pd.DataFrame(rewards)
df_rewards

Unnamed: 0,0,1,2,3,4,5
0,-1,-1,-1,-1,0,-1
1,-1,-1,-1,0,-1,100
2,-1,-1,-1,0,-1,-1
3,-1,0,0,-1,0,-1
4,0,-1,-1,0,-1,-1
5,-1,0,-1,-1,-1,-1


In [4]:
def initialize_q(m, n):
    """Build an all-zero Q-table with ``m`` rows and ``n`` columns."""
    table_shape = (m, n)
    return np.zeros(table_shape)

# Module-level Q-table, all zeros to start with.
# take_action reads and updates this global in place, so it must exist before any episode runs.
q_matrix = initialize_q(6, 6)
pd.DataFrame(q_matrix)

Unnamed: 0,0,1,2,3,4,5
0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0


## Let's implement the functions

In [5]:
def set_initial_state(rooms = 6):
    """Draw the room the agent starts an episode in.

    Returns a random integer in ``[0, rooms)`` taken from NumPy's global
    random state.
    """
    starting_room = np.random.randint(low=0, high=rooms)
    return starting_room



def get_action(current_state, reward_matrix):
    """Pick, uniformly at random, one of the moves allowed from ``current_state``.

    A move is allowed when its entry in the ``current_state`` row of
    ``reward_matrix`` differs from -1. The value returned is the column index
    of the chosen entry.
    """
    row = reward_matrix[current_state]
    allowed = [column for column, reward in enumerate(row) if reward != -1]
    return random.choice(allowed)



def take_action(current_state, reward_matrix, gamma, verbose = False):
    """Perform one randomly chosen move and update the Q-table entry for it.

    The move comes from ``get_action``. The entry
    ``q_matrix[current_state, move]`` of the module-level ``q_matrix`` is
    overwritten with the immediate reward plus ``gamma`` times the largest
    Q-value stored in the row of the room moved into.

    With ``verbose`` set, the Q-table and the transition are printed after the
    update, plus a message when the new room is 5.

    Returns the new state, which is the index of the chosen move.
    """
    chosen = get_action(current_state, reward_matrix)

    immediate_reward = reward_matrix[current_state, chosen]
    best_next_value = max(q_matrix[chosen])
    # Side effect: writes into the global Q-table.
    q_matrix[current_state, chosen] = immediate_reward + (gamma * best_next_value)

    if not verbose:
        return chosen

    print(q_matrix)
    print(f"Old state: {current_state} | New state: {chosen}\n\n")
    if chosen == 5:
        print(f"Agent has reached its goal !")
    return chosen

    
    
def initialize_episode(reward_matrix, initial_state, gamma, verbose = False):
    """Run one episode: keep moving until the agent lands in room 5.

    At least one move is always taken, even when ``initial_state`` is already
    5, because the stop condition is checked only after a move.
    """
    state = take_action(initial_state, reward_matrix, gamma, verbose)
    while state != 5:
        state = take_action(state, reward_matrix, gamma, verbose)
            
            
            
def train_agent(iterations, reward_matrix, gamma, verbose = False):
    """Run ``iterations`` episodes, each from a freshly drawn start room.

    Returns the module-level ``q_matrix`` that the episodes updated in place.
    No normalization happens here; ``normalize_matrix`` is a separate step.
    """
    print("Training in progress...")
    for _ in range(iterations):
        initialize_episode(reward_matrix, set_initial_state(), gamma, verbose)
    print("Training complete")

    return q_matrix



def normalize_matrix(q_matrix):
    """Rescale a Q-table so its largest non-zero entry becomes 100.

    Parameters
    ----------
    q_matrix : array-like
        Q-values to rescale. The input is not modified.

    Returns
    -------
    numpy.ndarray
        Integer array of the same shape; values are truncated toward zero by
        the integer cast. A table with no non-zero entries (for example one
        that has not been trained yet) gives an all-zero result instead of
        raising.
    """
    q_matrix = np.asarray(q_matrix)
    nonzero_values = q_matrix[q_matrix.nonzero()]
    if nonzero_values.size == 0:
        # Nothing to scale by: the builtin max() of an empty selection would raise ValueError.
        return np.zeros(q_matrix.shape, dtype=int)
    normalized_q = q_matrix / nonzero_values.max() * 100
    return normalized_q.astype(int)

In [6]:
# test run of single episode
rewards = np.array([
    [-1, -1, -1, -1, 0, -1],
    [-1, -1, -1, 0, -1, 100],
    [-1, -1, -1, 0, -1, -1],
    [-1, 0, 0, -1, 0, -1],
    [0, -1, -1, 0, -1, -1],
    [-1, 0, -1, -1, -1, -1],
])

gamma = 0.1
initial_state = set_initial_state()
initial_action = get_action(initial_state, rewards)

initialize_episode(rewards, initial_state, gamma, True)

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]
Old state: 3 | New state: 2


[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]
Old state: 2 | New state: 3


[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]
Old state: 3 | New state: 2


[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]
Old state: 2 | New state: 3


[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]
Old state: 3 | New state: 1


[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]
Old state: 1 | New state: 3


[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.

In [7]:
# test run of full training (training = 2000 iterations)
rewards = np.array([
    [-1, -1, -1, -1, 0, -1],
    [-1, -1, -1, 0, -1, 100],
    [-1, -1, -1, 0, -1, -1],
    [-1, 0, 0, -1, 0, -1],
    [0, -1, -1, 0, -1, -1],
    [-1, 0, -1, -1, -1, -1],
])

gamma = 0.8
initial_state = set_initial_state()
initial_action = get_action(initial_state, rewards)

q_table = train_agent(2000, rewards, gamma, False)

# In DataFrame for display purpose
pd.DataFrame(q_table)

Training in progress...
Training complete


Unnamed: 0,0,1,2,3,4,5
0,0.0,0.0,0.0,0.0,142.222222,0.0
1,0.0,0.0,0.0,177.777778,0.0,277.777778
2,0.0,0.0,0.0,177.777778,0.0,0.0
3,0.0,222.222222,142.222222,0.0,142.222222,0.0
4,113.777778,0.0,0.0,177.777778,0.0,0.0
5,0.0,222.222222,0.0,0.0,0.0,0.0


In [8]:
# Rescale the learned Q-values so the largest non-zero entry becomes 100 (cast to int) and show them as a table.
pd.DataFrame(normalize_matrix(q_table))

Unnamed: 0,0,1,2,3,4,5
0,0,0,0,0,51,0
1,0,0,0,64,0,100
2,0,0,0,64,0,0
3,0,80,51,0,51,0
4,40,0,0,64,0,0
5,0,80,0,0,0,0


In [9]:
def deploy_agent(init_state, q_table, goal_state=5):
    """Follow the greedy policy of ``q_table`` until the goal room is entered.

    From each room the agent takes the action with the largest Q-value in
    that room's row (``np.argmax``; ties go to the lowest index) and moves to
    the room with that index. Progress is printed along the way.

    Parameters
    ----------
    init_state : int
        Room the agent starts in.
    q_table : array-like
        Two-dimensional table of Q-values, indexed ``[state, action]``.
    goal_state : int, optional
        Room that ends the walk. Defaults to 5.

    Returns
    -------
    int
        Number of moves taken. At least one move is always made, even when
        ``init_state`` equals ``goal_state``.

    Raises
    ------
    RuntimeError
        If the walk comes back to a room it has already been in. The policy is
        deterministic, so a repeated room means the goal would never be
        reached and the loop would never end.
    """
    q_table = np.asarray(q_table)
    print("Start : ", init_state)
    state = init_state
    visited = {state}
    steps = 0
    while True:
        steps += 1
        action = np.argmax(q_table[state, :])
        print("Actions: ", action)
        state = action
        if action == goal_state:
            print("Finished")
            return steps
        # Check the goal first so that a walk starting in the goal room can still return to it.
        if state in visited:
            raise RuntimeError(
                f"Greedy policy revisits state {state} without reaching goal state {goal_state}"
            )
        visited.add(state)

In [10]:
# Follow the greedy policy from room 1 and report how many moves deploy_agent took before it returned.
start_room = 1
steps = deploy_agent(start_room, q_table)
print("Number of rooms/actions: ", steps)

Start :  1
Actions:  5
Finished
Number of rooms/actions:  1
