<a href="https://colab.research.google.com/github/TRo202/Reinforcement-Learning/blob/main/Rooms_(RL_Practice_ex).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This is a simple Q-learning practice exercise.  
Source: [niconielsen32](https://github.com/niconielsen32)

Goal: 
Given a grid of rooms  
0 1 5(goal)     
4 3 2   
The rooms are connected along two paths: 0-4-3-1-5 and 2-3-1-5.  
The agent must reach room #5 (the goal).



In [None]:
import numpy as np
import pandas as pd
import random

In [None]:
# Reward table, indexed as rewards[current_room, next_room].
#    -1 -> the two rooms are not connected (get_action treats -1 as "not allowed")
#     0 -> the rooms are connected, no immediate reward
#   100 -> the move enters the goal room (room 5)
rewards = np.array([
    # to:  0    1    2    3    4    5
    [     -1,  -1,  -1,  -1,   0,  -1],   # from room 0
    [     -1,  -1,  -1,   0,  -1, 100],   # from room 1
    [     -1,  -1,  -1,   0,  -1,  -1],   # from room 2
    [     -1,   0,   0,  -1,   0,  -1],   # from room 3
    [      0,  -1,  -1,   0,  -1,  -1],   # from room 4
    [     -1,   0,  -1,  -1,  -1,  -1],   # from room 5
])

In [None]:
def intialize_q(m, n):
    """Create an empty Q-table.

    Args:
        m: number of rows (one per state).
        n: number of columns (one per action).

    Returns:
        An (m, n) numpy array filled with zeros.
    """
    table_shape = (m, n)
    return np.zeros(table_shape)


# 6 x 6 table, matching the 6 x 6 reward matrix.
q_matrix = intialize_q(6, 6)

In [None]:
# Display the Q-table right after initialisation (every entry is zero at this point).
q_matrix

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [None]:
def set_initial_state(state=6):
    """Draw a random starting room.

    Args:
        state: exclusive upper bound for the room index (defaults to 6).

    Returns:
        A random integer in the range [0, state).
    """
    starting_room = np.random.randint(low=0, high=state)
    return starting_room

In [None]:
def get_action(current_state, reward_matrix):
    """Pick a random allowed move out of ``current_state``.

    The row ``reward_matrix[current_state]`` holds one entry per destination
    room; an entry of -1 marks a destination that cannot be reached from the
    current room. Every other column index is a valid action.

    A room can have more than one door, so the result is drawn at random from
    all valid actions instead of always returning the first one found.

    Args:
        current_state: row index of the room the agent is in.
        reward_matrix: 2-D reward table indexed as [state, action].

    Returns:
        The column index of the chosen action (which is also the next room).
    """
    row = reward_matrix[current_state]
    reachable = [room for room, reward in enumerate(row) if reward != -1]
    return random.choice(reachable)

In [None]:
# To take some action, we need to know the current state

def take_action(current_state, reward_matrix, gamma, verbose=False):
    """Perform one random move and update the module-level ``q_matrix``.

    Update rule applied to the chosen (state, action) pair:

        Q[state, action] = reward[state, action] + gamma * max(Q[action, :])

    The action index doubles as the index of the room the agent moves into.

    Args:
        current_state: room the agent is in before the move.
        reward_matrix: 2-D reward table indexed as [state, action].
        gamma: discount factor applied to the best Q-value of the next room.
        verbose: when True, print the Q-table and the transition after the update.

    Returns:
        The new state (the room the agent moved into).
    """
    chosen = get_action(current_state, reward_matrix)

    immediate_reward = reward_matrix[current_state, chosen]
    best_future_value = max(q_matrix[chosen,])   # best known Q-value from the next room

    # Mutates the shared, module-level Q-table in place.
    q_matrix[current_state, chosen] = immediate_reward + (gamma * best_future_value)

    if not verbose:
        return chosen

    print(q_matrix)
    print(f"Old State: {current_state} | New State: {chosen}\n\n")
    if chosen == 5:
        print(f"Agent has reached it's goal!")
    return chosen

def initialize_episode(reward_matrix, initial_state, gamma, verbose=False):
    """Run a single episode: keep moving until the agent enters room 5.

    At least one move is always made, even when ``initial_state`` is already 5.

    Args:
        reward_matrix: 2-D reward table indexed as [state, action].
        initial_state: room in which the episode starts.
        gamma: discount factor forwarded to ``take_action``.
        verbose: forwarded to ``take_action`` to print each step.
    """
    # First move is unconditional; afterwards continue until the goal room is reached.
    room = take_action(initial_state, reward_matrix, gamma, verbose)
    while room != 5:
        room = take_action(room, reward_matrix, gamma, verbose)

def train_agent(iteration, reward_matrix, gamma, verbose=False):
    """Run ``iteration`` episodes, each from a freshly drawn random start room.

    The Q-table is NOT normalized here; use ``normalize_matrix`` for that.

    Args:
        iteration: number of episodes to run.
        reward_matrix: 2-D reward table indexed as [state, action].
        gamma: discount factor used for every update.
        verbose: forwarded to ``initialize_episode`` to print each step.

    Returns:
        The module-level ``q_matrix`` (the same array that was updated in place).
    """
    print("Training in progress...")
    for _ in range(iteration):
        start_room = set_initial_state()
        initialize_episode(reward_matrix, start_room, gamma, verbose)
    print("Training complete!")
    return q_matrix

def normalize_matrix(q_matrix):
    """Rescale a Q-table so that its largest non-zero entry becomes 100.

    Args:
        q_matrix: numpy array of Q-values.

    Returns:
        An integer array of the same shape holding ``q / max_nonzero * 100``,
        truncated by ``astype(int)``. If the table contains no non-zero
        entries, an all-zero integer array is returned instead of raising
        (the previous version failed with ``ValueError`` from ``max()`` on an
        empty selection).
    """
    nonzero_values = q_matrix[q_matrix.nonzero()]
    if nonzero_values.size == 0:
        # Nothing to scale by: an all-zero table normalizes to all zeros.
        return np.zeros(q_matrix.shape, dtype=int)
    normalize_q = q_matrix / max(nonzero_values) * 100
    return normalize_q.astype(int)

In [None]:
# Test run of a single episode, starting in room 0, with verbose output.
gamma=0.1
initial_state=0
# NOTE(review): initial_action is not read by any of the visible cells; this call only
# consumes one draw from `random` (get_action ends in random.choice).
initial_action=get_action(initial_state, rewards)
# verbose=True -> take_action prints the Q-table and the state transition after every move.
initialize_episode(rewards, initial_state, gamma, True)
# q_matrix is updated in place by take_action, so this shows the table after the episode.
print(q_matrix)


[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]
Old State: 0 | New State: 4


[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]
Old State: 4 | New State: 3


[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]
Old State: 3 | New State: 1


[[  0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0. 100.]
 [  0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.]]
Old State: 1 | New State: 5


Agent has reached it's goal!
[[  0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0. 100.]
 [  0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.]]


In [None]:
# Test run of full training: 200 episodes with gamma = 0.8.
gamma=0.8
# NOTE(review): neither initial_state nor initial_action is passed to train_agent below;
# train_agent calls set_initial_state() itself at the start of every episode. In the
# visible cells these two assignments only consume random draws and appear to be unnecessary.
initial_state=set_initial_state()
initial_action=get_action(initial_state, rewards)
# train_agent returns the module-level q_matrix, so q_table refers to that same array
# (it is not a fresh table: updates made by earlier cells are still in it).
q_table=train_agent(200, rewards, gamma, False)
print(pd.DataFrame(q_table))


Training in progress...
Training complete!
            0           1           2           3           4           5
0    0.000000    0.000000    0.000000    0.000000  142.221391    0.000000
1    0.000000    0.000000    0.000000  177.777352    0.000000  277.777113
2    0.000000    0.000000    0.000000  177.777352    0.000000    0.000000
3    0.000000  222.221690  142.221391    0.000000  142.221882    0.000000
4  113.776738    0.000000    0.000000  177.777352    0.000000    0.000000
5    0.000000  222.221391    0.000000    0.000000    0.000000    0.000000


In [None]:
def deploy_agent(init_state, q_table, goal_state=5):
    """Follow the greedy policy of ``q_table`` from ``init_state`` to the goal.

    In every room the action with the highest Q-value is taken
    (``np.argmax`` over that room's row); the action index is the next room.
    Each chosen room is printed as the walk progresses.

    Args:
        init_state: room index in which the walk starts.
        q_table: 2-D numpy array of Q-values indexed as [state, action].
        goal_state: room index that ends the walk (defaults to 5).

    Returns:
        The number of moves made until ``goal_state`` was entered.

    Raises:
        RuntimeError: if the greedy policy steps into a room it has already
            been in. The policy is deterministic, so a repeated room means the
            walk would cycle forever (e.g. with an untrained, all-zero table);
            previously this case never returned.
    """
    print("Start: ", init_state)
    state = init_state
    visited = {init_state}
    steps = 0
    while True:
        steps += 1
        action = int(np.argmax(q_table[state, :]))
        print(action)
        # Goal check comes first so that entering the goal always terminates,
        # even when the walk started in the goal room.
        if action == goal_state:
            print("Finished!")
            return steps
        if action in visited:
            raise RuntimeError(
                f"Greedy policy revisits room {action} without reaching "
                f"goal {goal_state}; the Q-table does not lead to the goal "
                f"from room {init_state}."
            )
        visited.add(action)
        state = action

In [None]:
# Walk greedily from room 0 using the trained Q-table and report how many moves it took.
start_room = 0
steps = deploy_agent(start_room, q_table)
print("Number of Rooms/actions: ", steps)

Start:  0
4
3
1
5
Finished!
Number of Rooms/actions:  4
