# Basic Q-Learning task

In [2]:
# imports
import numpy as np
from helpers.map import basic_map
from helpers.get_available_actions import get_available_actions
from helpers.q_matrix import q_matrix
from helpers.r_matrix import r_matrix
from helpers.random_start import random_start
from helpers.states_and_actions import states
import pandas as pd
from IPython.display import display

## Initialise basic environment

In [16]:
def run_q_learning_basic(alpha: float, gamma: float, epsilon: float, num_episodes: int) -> pd.DataFrame:
    """Here we run an episode of Q-learning for the learner
    heavily influenced by Lab 4 code
    
    As it has a primary and secondary objective, I may need to stop it from repeatedly collecting the 1st reward"""
    base_map = basic_map()
    
    # States and actions
    S = states(base_map)
    
    goal_state = 10#66# 10 # TODO get programatically
    
    # R matrix
    rmat = r_matrix(base_map)
    
    # Q matrix
    Q = q_matrix(base_map)

    # Stats
    stats = {
        'num_steps': [],
        'stepPrimaryObj': [],
        'startingStep': [],
        'maxQ': [],
        'total_reward': [],
        'total_health': [],
        'q_var': []
    }
    
    # run for episodes
    for episode in range(num_episodes):
        s = random_start(base_map)
        #print(f"Starting state is {s}")
        hit_target = False
        stats['startingStep'].append(s)
        total_reward = 0
        total_health = 100
        for step in range(500):
            potential_actions = get_available_actions(rmat, s)
            
            # get the Q values for these
            q_values = [Q[s, a] for a in potential_actions]
            
            # get the best actions from the Q values
            best_actions = potential_actions[np.where(q_values == np.max(q_values))[0]]
            # get the best actions Q values
            # best_actions_q_values = [Q[s, x] for x in best_actions]
            
            # determine whether to explore or exploit
            if np.random.uniform() > epsilon:
                a = np.random.choice(potential_actions)
            else:
                a = np.random.choice(best_actions)
        
            # Get the reward
            reward = rmat[s, a]
            if reward == 50:
                if hit_target:
                    reward = 0
                else:
                    hit_target = True
                    stats['stepPrimaryObj'].append(step)
            elif reward == -10:
                # Check if it hit a trap
                total_health -= 10
            
            old_state = s
            s = a
            
            total_reward += reward

            # Update Q Value
            Q[old_state, a] = Q[old_state, a] + alpha * reward + gamma * (max(Q[s]) - Q[old_state, a])
            
            # check if goal reached
            if S[s] == goal_state or total_health <= 0:
                # print("Hit goal!")
                break
        # start filling in the statistics
        stats['num_steps'].append(step + 1)
        if not hit_target:
            stats['stepPrimaryObj'].append(-1)
        # cap the Q values statistics decimal places
        stats['maxQ'].append(Q.max().round(1))
        stats['total_reward'].append(total_reward)
        stats['total_health'].append(total_health)
        stats['q_var'].append(Q.var())
    #print(f"End of episode {episode} Q matrix:\n{Q.round(1)}")
    display(pd.DataFrame(Q.round(1)))
    # put the statistics using pandas
    display(pd.DataFrame.from_dict(stats))

    # return the statistics
    return [
        alpha, 
        gamma,
        epsilon, 
        np.array(stats['total_reward']).mean(),
        np.array(stats['total_health']).mean(),
        Q.var()
    ]
run_q_learning_basic(1, 0.8, 0.9, 1000)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,71,72,73,74,75,76,77,78,79,80
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
77,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
78,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,num_steps,stepPrimaryObj,startingStep,maxQ,total_reward,total_health,q_var
0,131,-1,43,30.0,-30.0,40,2.286028e-01
1,159,48,37,41.7,80.0,100,9.058320e-01
2,29,-1,61,41.7,30.0,100,1.059550e+00
3,74,-1,30,41.7,30.0,100,1.216474e+00
4,6,-1,16,41.7,30.0,100,1.363875e+00
...,...,...,...,...,...,...,...
995,500,13,16,19118.1,50.0,100,2.960506e+06
996,500,8,43,19159.7,50.0,100,2.971702e+06
997,500,7,40,19201.4,50.0,100,2.978810e+06
998,500,8,43,19243.1,50.0,100,2.985985e+06


[1, 0.8, 0.9, 39.66, 99.38, 3010350.9141643345]