<a href="https://colab.research.google.com/github/Vey27/Reinforcement-Learning/blob/main/Q_Learning_Basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import random

In [None]:
# Reward matrix for the environment (coded by the simulator designer).
#
# rewards[state, action] is the reward for moving from room `state` to room
# `action`:
#   100 -> the move reaches the goal
#     0 -> the move goes through a correct door
#    -1 -> wrong door (get_action never picks entries equal to -1)
rewards = np.array([
  [-1, -1, -1, -1,  0,  -1],  # from room 0
  [-1, -1, -1,  0, -1, 100],  # from room 1
  [-1, -1, -1,  0, -1,  -1],  # from room 2
  [-1,  0,  0, -1,  0,  -1],  # from room 3
  [ 0, -1, -1,  0, -1,  -1],  # from room 4
  [-1,  0, -1, -1, -1,  -1],  # from room 5
])

In [None]:
# Display the reward matrix as the cell's output.
rewards

array([[ -1,  -1,  -1,  -1,   0,  -1],
       [ -1,  -1,  -1,   0,  -1, 100],
       [ -1,  -1,  -1,   0,  -1,  -1],
       [ -1,   0,   0,  -1,   0,  -1],
       [  0,  -1,  -1,   0,  -1,  -1],
       [ -1,   0,  -1,  -1,  -1,  -1]])

In [None]:
# Coded by the RL engineer: initializes the Q-table.
# Here m is the number of states and n is the number of actions.
def initialize_q(m,n):
  """Build a fresh Q-table with every entry set to zero.

  Args:
    m: number of states (rows).
    n: number of actions (columns).

  Returns:
    An (m, n) float array filled with zeros.
  """
  table_shape = (m, n)
  return np.zeros(table_shape)

In [None]:
# Module-level Q-table: 6 states (rows) by 6 actions (columns), all zeros.
# take_action reads this array and updates it in place.
q_matrix = initialize_q(m=6, n=6)

q_matrix

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [None]:
#Coded by Design Team/ Coded by Simulator Designer
def set_initial_state(rooms=6):
  """Pick a random starting room, in the spirit of env.reset().

  Args:
    rooms: number of rooms; the result lies in the range [0, rooms).

  Returns:
    A room index drawn with np.random.randint from NumPy's global random state.
  """
  # With a single argument, randint samples from [0, rooms).
  start_room = np.random.randint(rooms)
  return start_room


In [None]:
#Done by the Design Team /  Coded by Simulator Designer
# This is effectively the exploration step: a valid action is picked at random.

def get_action(current_state, reward_matrix):
  """Choose a random permitted move out of ``current_state``.

  Args:
    current_state: row index into ``reward_matrix``.
    reward_matrix: 2-D reward table; an entry of -1 marks a move that is
      not allowed.

  Returns:
    The column index of one entry in the row that is not -1, drawn with
    random.choice.
  """
  row = reward_matrix[current_state]
  allowed = [idx for idx, reward in enumerate(row) if reward != -1]
  return random.choice(allowed)


In [None]:
#Coded by RL
# Similar to env.step()
# Applying Q Learning Algo
def take_action(current_state, reward_matrix, gamma, verbose=False):
  """Perform one Q-learning step from ``current_state``, like env.step().

  A move is drawn with get_action, the matching entry of the module-level
  ``q_matrix`` is overwritten in place, and the agent moves to the room
  named by the chosen action.

  Args:
    current_state: index of the room the agent is currently in.
    reward_matrix: reward table indexed as [state, action].
    gamma: discount factor applied to the best Q-value of the next state.
    verbose: when True, print the Q-table and the transition after the update.

  Returns:
    The new state, which equals the chosen action.
  """
  chosen_action = get_action(current_state, reward_matrix)
  # Taking action `a` moves the agent into room `a`.
  new_state = chosen_action

  immediate_reward = reward_matrix[current_state, chosen_action]
  # Highest Q-value currently stored for the room being entered.
  best_next_value = max(q_matrix[new_state,])

  # Q(s, a) = r(s, a) + gamma * max over a' of Q(s', a')
  q_matrix[current_state, chosen_action] = immediate_reward + (gamma * best_next_value)

  if verbose:
    print(pd.DataFrame(q_matrix))
    print(f"Old State: {current_state} | New State: {new_state}\n\n")
    if new_state==5:
      print(f"Agent has reached his Goal!")

  return new_state


In [None]:
# Helper that runs a single episode
def initialize_episode(reward_matrix, initial_state, gamma, verbose=False):
  """Run a single episode, stepping until the agent lands in room 5.

  At least one step is always taken, even when ``initial_state`` is 5.

  Args:
    reward_matrix: reward table passed through to take_action.
    initial_state: room in which the episode starts.
    gamma: discount factor passed through to take_action.
    verbose: passed through to take_action to print each step.
  """
  state = take_action(initial_state, reward_matrix, gamma, verbose)
  while state != 5:
    state = take_action(state, reward_matrix, gamma, verbose)

In [None]:
# Helper that runs as many episodes as given by `iterations`
def train_agent(iterations, reward_matrix, gamma, verbose=False):
  """Train by running ``iterations`` episodes, each from a random start room.

  Args:
    iterations: number of episodes to run.
    reward_matrix: reward table indexed as [state, action]; its number of
      rows is used as the number of rooms to draw start states from.
    gamma: discount factor passed through to every episode.
    verbose: passed through to every episode to print each step.

  Returns:
    The module-level ``q_matrix`` (the same array object, not a copy).
  """
  print("Training in progress ...")

  # Size the start-room draw from the reward matrix instead of relying on
  # set_initial_state's default of 6, so start states always index a real row
  # and every room of the matrix can be a starting point.
  n_rooms = len(reward_matrix)
  for _ in range(iterations):
    initial_state = set_initial_state(n_rooms)
    initialize_episode(reward_matrix, initial_state, gamma, verbose)

  print("Training complete !")

  return q_matrix

In [None]:
def normalize_matrix(q_matrix):
  """Rescale a Q-table so that its largest non-zero entry becomes 100.

  Args:
    q_matrix: array-like Q-table.

  Returns:
    A new float array of the same shape holding ``q_matrix / peak * 100``,
    where ``peak`` is the largest non-zero entry. A table with no non-zero
    entry (all zeros, or empty) has no peak and is returned as zeros.
  """
  q_values = np.asarray(q_matrix, dtype=float)
  nonzero_values = q_values[q_values != 0]
  if nonzero_values.size == 0:
    # Untrained table: taking max() of an empty selection would raise ValueError.
    return np.zeros_like(q_values)
  return q_values / nonzero_values.max() * 100

In [None]:
# Test run: a single verbose episode from a random start room.

gamma = 0.8  # discount factor; the full-training cell reuses this value
initial_state = set_initial_state()
initialize_episode(rewards, initial_state, gamma, verbose=True)


     0    1    2    3    4    5
0  0.0  0.0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0  0.0  0.0
3  0.0  0.0  0.0  0.0  0.0  0.0
4  0.0  0.0  0.0  0.0  0.0  0.0
5  0.0  0.0  0.0  0.0  0.0  0.0
Old State: 1 | New State: 3


     0    1    2    3    4    5
0  0.0  0.0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0  0.0  0.0
3  0.0  0.0  0.0  0.0  0.0  0.0
4  0.0  0.0  0.0  0.0  0.0  0.0
5  0.0  0.0  0.0  0.0  0.0  0.0
Old State: 3 | New State: 1


     0    1    2    3    4    5
0  0.0  0.0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0  0.0  0.0
3  0.0  0.0  0.0  0.0  0.0  0.0
4  0.0  0.0  0.0  0.0  0.0  0.0
5  0.0  0.0  0.0  0.0  0.0  0.0
Old State: 1 | New State: 3


     0    1    2    3    4    5
0  0.0  0.0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0  0.0  0.0
3  0.0  0.0  0.0  0.0  0.0  0.0
4  0.0  0.0  0.0  0.0  0.0  0.0
5  0.0  0.0  0.0  0.0  0.0  0.0
Old State: 3 |

In [None]:
# Full training run (2000 episodes).
# train_agent draws its own random start room for every episode, so no start
# state or action needs to be chosen here.

q_table = train_agent(2000, rewards, gamma, verbose=False)

# train_agent returns the module-level q_matrix, so q_table is that same array.
pd.DataFrame(q_table)

Training in progress ...
Training complete !


Unnamed: 0,0,1,2,3,4,5
0,0.0,0.0,0.0,0.0,142.222222,0.0
1,0.0,0.0,0.0,177.777778,0.0,277.777778
2,0.0,0.0,0.0,177.777778,0.0,0.0
3,0.0,222.222222,142.222222,0.0,142.222222,0.0
4,113.777778,0.0,0.0,177.777778,0.0,0.0
5,0.0,222.222222,0.0,0.0,0.0,0.0


In [None]:
def deploy_agent(init_state, q_table, goal_state=5):
  """Follow the greedy policy of ``q_table`` from ``init_state`` to the goal.

  At every step the action with the highest Q-value in the current row is
  printed and taken; the action index is the next room.

  Args:
    init_state: room to start in.
    q_table: 2-D array of Q-values indexed as [state, action].
    goal_state: room that ends the walk (default 5).

  Returns:
    Number of moves taken to reach ``goal_state``.

  Raises:
    RuntimeError: if the walk returns to a room it has already visited
      without reaching the goal. The greedy policy is deterministic, so such
      a walk would repeat forever (for example with an all-zero table).
  """
  print("Start: ", init_state)
  state = init_state
  visited = {state}
  steps = 0
  while True:
    steps += 1
    action = np.argmax(q_table[state,:])
    print(action)
    state = action
    if action == goal_state:
      print("Finished!")
      return steps
    # The goal check comes first so that starting in the goal room and coming
    # back to it still counts as finishing rather than as a cycle.
    if state in visited:
      raise RuntimeError(
        f"Greedy policy revisited room {state} after {steps} steps without "
        f"reaching goal {goal_state}; the Q-table may be untrained."
      )
    visited.add(state)

In [None]:
# Walk the greedy policy from room 2 and report how many moves it took.
start_room = 2
steps = deploy_agent(start_room,q_table)
print("number of rooms/actions: ",steps)

Start:  2
3
1
5
Finished!
number of rooms/actions:  3
