# Basic Q-Learning task

In [1]:
# imports
import numpy as np
from helpers.map import basic_map
from helpers.get_available_actions import get_available_actions
from helpers.q_matrix import q_matrix
from helpers.r_matrix import r_matrix
from helpers.random_start import random_start
from helpers.states_and_actions import states

## Initialise basic environment and run Q-learning

In [8]:
def run_q_learning_basic(
    alpha: float,
    gamma: float,
    epsilon: float,
    num_episodes: int,
    max_steps: int = 500,
    goal_state: int = 10,
    verbose: bool = True,
):
    """Train a tabular Q-learning agent on the basic map.

    Heavily influenced by Lab 4 code.

    Note: as the task has a primary and a secondary objective, it may be
    necessary to stop the agent from repeatedly collecting the first reward.
    Nothing in this function prevents that yet.

    Each episode starts from ``random_start(base_map)`` and runs until either
    the state's label ``S[s]`` equals ``goal_state`` or ``max_steps`` moves
    have been made. Choosing action ``a`` moves the agent to state ``a``
    (actions are indexed by their destination state).

    Args:
        alpha: Learning rate applied to the temporal-difference error.
        gamma: Discount factor applied to the best Q value of the next state.
        epsilon: Probability of acting greedily. A uniform draw *above*
            ``epsilon`` triggers a random available action; otherwise one of
            the highest-valued actions is picked at random. This is the
            reverse of the more common convention where epsilon is the
            exploration probability.
        num_episodes: Number of training episodes.
        max_steps: Upper bound on the number of moves per episode.
        goal_state: State label that terminates an episode.
            TODO: derive this from the map programmatically.
        verbose: When true, print each episode's start state and the final
            rounded Q matrix.

    Returns:
        The Q matrix created by ``q_matrix(base_map)`` after it has been
        updated in place (expected to be a NumPy array, since it is indexed
        as ``Q[s, a]`` and rounded with ``.round``).

    Randomness comes from the global ``np.random`` state; seed it before
    calling for reproducible runs.
    """
    base_map = basic_map()

    # State labels; S[s] is compared with goal_state to detect termination.
    S = states(base_map)

    # Reward and Q tables, both indexed as [state, action].
    rmat = r_matrix(base_map)
    Q = q_matrix(base_map)

    for episode in range(num_episodes):
        s = random_start(base_map)
        if verbose:
            print(f"Starting state is {s}")

        for _ in range(max_steps):
            potential_actions = np.atleast_1d(get_available_actions(rmat, s))
            if potential_actions.size == 0:
                # Dead end: nothing to choose from, so end this episode.
                break

            # Greedy candidates are all actions that tie for the best Q value.
            q_values = np.array([Q[s, a] for a in potential_actions])
            best_actions = potential_actions[q_values == q_values.max()]

            # Explore when the draw exceeds epsilon, otherwise exploit.
            if np.random.uniform() > epsilon:
                a = np.random.choice(potential_actions)
            else:
                a = np.random.choice(best_actions)

            reward = rmat[s, a]
            old_state = s
            s = a  # taking action `a` lands the agent in state `a`

            # Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
            td_target = reward + gamma * np.max(Q[s])
            Q[old_state, a] += alpha * (td_target - Q[old_state, a])

            # Reaching the goal ends this episode only, not the whole run.
            if S[s] == goal_state:
                break

    # `episode` is only bound if at least one episode ran.
    if verbose and num_episodes > 0:
        print(f"End of episode {episode} Q matrix:\n{Q.round(1)}")
    return Q
# Train the learner for 1000 episodes with the hyperparameters spelled out by name.
run_q_learning_basic(alpha=1, gamma=0.8, epsilon=0.9, num_episodes=1000)

Starting state is 29
Starting state is 39
Starting state is 16
Starting state is 13
Starting state is 43
Starting state is 41
Starting state is 34
Starting state is 70
Starting state is 34
Starting state is 34
Starting state is 31
Starting state is 37
Starting state is 70
Starting state is 57
Starting state is 13
Starting state is 22
Starting state is 22
Starting state is 22
Starting state is 61
Starting state is 61
Starting state is 16
Starting state is 39
Starting state is 64
Starting state is 61
Starting state is 14
Starting state is 33
Starting state is 30
Starting state is 61
Starting state is 68
Starting state is 16
Starting state is 57
Starting state is 29
Starting state is 30
Starting state is 39
Starting state is 43
Starting state is 37
Starting state is 40
Starting state is 34
Starting state is 30
Starting state is 31
Starting state is 41
Starting state is 31
Starting state is 57
Starting state is 43
Starting state is 59
Starting state is 37
Starting state is 67
Starting stat