In [2]:
import numpy as np
from collections import namedtuple

In [35]:
# Immutable snapshot of the world as seen by one agent. The field order is
# significant: it is the positional order used when a State is built or unpacked.
State = namedtuple(
    'State',
    (
        'i',           # row index of agent
        'j',           # column index of agent
        'x',           # 1 if agent is carrying a block
        'i_distance',  # distance between the agents' i values
        'j_distance',  # distance between the agents' j values
        'a',           # 1 if pickup location 1 has blocks left
        'b',           # 1 if pickup location 2 has blocks left
        'c',           # 1 if dropoff location 1 has capacity left
        'd',           # 1 if dropoff location 2 has capacity left
        'e',           # 1 if dropoff location 3 has capacity left
        'f',           # 1 if dropoff location 4 has capacity left
    ),
)

#### PDWorld

In [4]:
class PDWorld:
    """Configuration and board for the pickup/dropoff grid world.

    Attributes
    ----------
    size : int
        Side length of the square board.
    alpha : float
        Learning rate.
    gamma : float
        Discount factor.
    pickup_capacity : int
        Capacity value associated with pickup locations.
    dropoff_capacity : int
        Capacity value associated with dropoff locations.
    agent_start_locations : list
        (row, column) start cell of each agent.
    pickup_locations : list
        (row, column) cells of the pickup locations.
    dropoff_locations : list
        (row, column) cells of the dropoff locations.
    board : numpy.ndarray
        ``size`` x ``size`` array of zeros.
    """

    def __init__(self):
        self.size = 5
        # Plain floats: a trailing comma here would silently store 1-tuples.
        self.alpha = 0.3
        self.gamma = 0.5
        self.pickup_capacity = 10
        self.dropoff_capacity = 5
        self.agent_start_locations = [(1,3), (5,3)]
        # NOTE(review): coordinates go up to 5 on a size-5 board, so they look
        # 1-based while numpy indexing of `board` is 0-based - confirm.
        self.pickup_locations = [(3,5), (4,2)]
        self.dropoff_locations = [(1,1), (1,5), (3,3), (5,5)]
        self.board = np.zeros((self.size, self.size))

    def setup(self, size: int = None, alpha: float = None, gamma: float = None,
              pickup_locations: list = None, dropoff_locations: list = None,
              pickup_capacity: int = None, dropoff_capacity: int = None,
              agent_start_locations: list = None):
        """Updates the world configuration; omitted settings keep their current value.

        Parameters
        ----------
        size : int, optional
            New side length of the board. The board is re-created as zeros.
            ``None`` or 0 leaves the size unchanged.
        alpha : float, optional
            New learning rate. 0.0 is accepted; only ``None`` means "unchanged".
        gamma : float, optional
            New discount factor. 0.0 is accepted; only ``None`` means "unchanged".
        pickup_locations : list, optional
            New pickup cells. ``None`` or an empty list leaves them unchanged.
        dropoff_locations : list, optional
            New dropoff cells. ``None`` or an empty list leaves them unchanged.
        pickup_capacity : int, optional
            New pickup capacity. ``None`` leaves it unchanged.
        dropoff_capacity : int, optional
            New dropoff capacity. ``None`` leaves it unchanged.
        agent_start_locations : list, optional
            New agent start cells. ``None`` or an empty list leaves them unchanged.
        """
        if size:
            self.size = size
            self.board = np.zeros((size, size))
        if alpha is not None:
            self.alpha = alpha
        if gamma is not None:
            self.gamma = gamma
        if pickup_capacity is not None:
            self.pickup_capacity = pickup_capacity
        if dropoff_capacity is not None:
            self.dropoff_capacity = dropoff_capacity
        # Lists are copied so later edits to the caller's list do not leak in.
        if agent_start_locations:
            self.agent_start_locations = list(agent_start_locations)
        if pickup_locations:
            self.pickup_locations = list(pickup_locations)
        if dropoff_locations:
            self.dropoff_locations = list(dropoff_locations)


#### Driver

In [5]:
# Global PDWorld instance. The experiment functions and QTable in this notebook
# read and reconfigure this object directly rather than taking it as an argument,
# so this cell must run before any of them are called.
world = PDWorld()

In [1]:
# functions to run experiments
def _configure_baseline_world(alpha: float = 0.3):
    """Applies the setup shared by every experiment to the global ``world``.

    Only the learning rate differs between experiments, so it is the only
    parameter; every other setting is the common baseline.
    """
    world.setup(size = 5,
                alpha = alpha,
                gamma = 0.5,
                pickup_capacity = 10,
                dropoff_capacity = 5,
                agent_start_locations = [(1,3), (5,3)],
                pickup_locations = [(3,5), (4,2)],
                dropoff_locations = [(1,1), (1,5), (3,3), (5,5)])


def experiment_1a():
    """Baseline setup; 500 PRANDOM steps, then 7500 more PRANDOM steps; summary."""
    _configure_baseline_world()
    world.run(steps=500, policy='PRANDOM')
    world.run(steps=7500, policy='PRANDOM')
    world.summary()


def experiment_1b():
    """Baseline setup; 500 PRANDOM steps, then 7500 PGREEDY steps; summary."""
    _configure_baseline_world()
    world.run(steps=500, policy='PRANDOM')
    world.run(steps=7500, policy='PGREEDY')
    world.summary()


def experiment_1c():
    """Baseline setup; 500 PRANDOM steps, then 7500 PEXPLOIT steps; summary and 'male' Q-table."""
    _configure_baseline_world()
    world.run(steps=500, policy='PRANDOM')
    world.run(steps=7500, policy='PEXPLOIT')
    world.summary()
    world.display_q_table(agent='male')


def experiment_2():
    """Same schedule as experiment 1c, but both runs pass ``method='SARSA'``."""
    _configure_baseline_world()
    world.run(steps=500, policy='PRANDOM', method='SARSA')
    world.run(steps=7500, policy='PEXPLOIT', method='SARSA')
    world.summary()
    world.display_q_table(agent='male')


def experiment_3a():
    """Same schedule as experiment 1c with the learning rate lowered to 0.15."""
    _configure_baseline_world(alpha=0.15)
    world.run(steps=500, policy='PRANDOM')
    world.run(steps=7500, policy='PEXPLOIT')
    world.summary()
    world.display_q_table(agent='male')


def experiment_3b():
    """Same schedule as experiment 1c with the learning rate raised to 0.45."""
    _configure_baseline_world(alpha=0.45)
    world.run(steps=500, policy='PRANDOM')
    world.run(steps=7500, policy='PEXPLOIT')
    world.summary()
    world.display_q_table(agent='male')


def experiment_4():
    """Runs PEXPLOIT before and after moving the pickup locations.

    Phase 1 uses the baseline setup: 500 PRANDOM steps, then a PEXPLOIT run
    with ``total_runs=3`` and animation enabled. Phase 2 changes only the
    pickup locations to (1,2) and (4,5) and repeats the PEXPLOIT run. The
    summary and the 'male' Q-table are shown after each phase.
    """
    _configure_baseline_world()
    world.run(steps=500, policy='PRANDOM')
    world.run(total_runs=3, policy='PEXPLOIT', animate=True)
    world.summary()
    world.display_q_table(agent='male')

    # Only the pickup locations change here; setup is not told to reset anything else.
    world.setup(pickup_locations = [(1,2), (4,5)])
    world.run(total_runs=3, policy='PEXPLOIT', animate=True)
    world.summary()
    world.display_q_table(agent='male')


#### Q-Table

In [36]:
class QTable:
    """Tabular action-value store indexed by encoded ``State`` and operator.

    Each row is one encoded state and each of the 6 columns is one operator
    (0: north, 1: south, 2: east, 3: west, 4: pick up, 5: drop off).
    The table is sized from the global ``world.size`` when it is constructed
    and is not resized if ``world.setup`` later changes the board size.
    """

    def __init__(self):
        # size**4 combinations of (i, j, i_distance, j_distance) times
        # 2**7 = 128 combinations of the seven binary fields (x, a-f).
        self.q_table = np.zeros((world.size**4 * 128, 6))

    @staticmethod
    def _state_multipliers() -> list:
        """Returns the positional weight of every State field, in field order.

        Read from ``world.size`` on every call so encoding and decoding always
        agree with each other.
        """
        size = world.size
        return [size**3 * 128, size**2 * 128, size**2 * 64, size * 64, 64, 32, 16, 8, 4, 2, 1]

    @staticmethod
    def _encode_state(state: State) -> int:
        """Encodes the given state into its row index in the Q-table

        Parameters
        ----------
        state : State
            Named tuple containing state information

        Returns
        -------
        int
            integer index of state in Q-table
        """
        # NOTE(review): the weights only produce a unique, in-range index if
        # i, j and both distances lie in [0, world.size - 1] - confirm callers
        # pass zero-based values.
        return int(np.dot(np.array(state), QTable._state_multipliers()))

    @staticmethod
    def _decode_index(index: int) -> State:
        """Decodes the given row index of the Q-table into a State named tuple

        Parameters
        ----------
        index : int
            Row index of the Q-table

        Returns
        -------
        State
            Named tuple containing state information
        """
        state_values = []
        remainder = int(index)
        # Peel off one field per weight, most significant first.
        for weight in QTable._state_multipliers():
            field_value, remainder = divmod(remainder, weight)
            state_values.append(field_value)
        return State(*state_values)

    def next_operator(self, current_state: State, method: str = 'QL', policy: str = 'PRANDOM') -> str:
        """Returns the next operator to be applied given the current state, method, and policy.\\
        The function assumes the operator is taken and updates the Q-table accordingly.

        Not implemented yet: after validating its arguments it raises
        NotImplementedError instead of silently returning None.

        Parameters
        ----------
        current_state: State
            Named tuple containing state information.
            - i: row index of agent
            - j: column index of agent
            - x: 1 if agent is carrying a block
            - i_distance: distance between the agents' i values
            - j_distance: distance between the agents' j values
            - a: 1 if pickup location 1 has blocks left
            - b: 1 if pickup location 2 has blocks left
            - c: 1 if dropoff location 1 has capacity left
            - d: 1 if dropoff location 2 has capacity left
            - e: 1 if dropoff location 3 has capacity left
            - f: 1 if dropoff location 4 has capacity left

        method: ['QL' | 'SARSA']
            Method used to update Q-table

        policy: ['PRANDOM' | 'PEXPLOIT' | 'PGREEDY']
            Policy to use when selecting next operator

        Returns
        -------
            String corresponding to operator to apply:
            ['n' | 's' | 'e' | 'w' | 'p' | 'd']

        Raises
        ------
        AssertionError
            If ``method`` or ``policy`` is not one of the supported values.
        NotImplementedError
            Always, once the arguments have been validated.
        """
        assert method in ['SARSA', 'QL'], 'Error: Invalid method'
        assert policy in ['PRANDOM', 'PEXPLOIT', 'PGREEDY'], 'Error: Invalid policy'

        # TODO: choose an operator with _next_operator, apply it to obtain the
        # successor state, then call _update_q_table.
        raise NotImplementedError('QTable.next_operator is not implemented yet')

    def _next_operator(self, current_state: State, policy: str = 'PRANDOM') -> int:
        """Returns the next operator to be applied given the current state, and policy.

        Pick up (4) and drop off (5) are always taken when applicable. Otherwise
        the choice is restricted to the applicable operators and made by policy.

        Parameters
        ----------
        current_state: State
            Named tuple containing state information.

        policy: ['PRANDOM' | 'PEXPLOIT' | 'PGREEDY']
            Policy to use when selecting next operator

        Returns
        -------
            Column index of q-table corresponding to the operator to take
            - 0: north
            - 1: south
            - 2: east
            - 3: west
            - 4: pick up
            - 5: drop off

        Raises
        ------
        ValueError
            If no operator is applicable, or ``policy`` is not recognised.
        """
        applicable_operators = np.asarray(world.applicable_operators(current_state), dtype=int)
        if applicable_operators.size == 0:
            raise ValueError('Error: No applicable operators')
        if 4 in applicable_operators:
            return 4
        if 5 in applicable_operators:
            return 5

        if policy == 'PRANDOM':
            # select applicable operator randomly
            return int(np.random.choice(applicable_operators))
        if policy not in ('PEXPLOIT', 'PGREEDY'):
            raise ValueError('Error: Invalid policy')

        # Rank only the applicable operators; a maximum taken over all six
        # columns could pick an operator that cannot be applied in this state.
        q_values = self.q_table[self._encode_state(current_state), applicable_operators]
        is_best = q_values == q_values.max()
        max_val_operators = applicable_operators[is_best]
        other_operators = applicable_operators[~is_best]

        # PEXPLOIT explores with probability 0.2, but only when there is a
        # non-best operator to explore; if all applicable operators tie,
        # fall through to the greedy choice.
        if policy == 'PEXPLOIT' and np.random.rand() >= 0.8 and other_operators.size > 0:
            return int(np.random.choice(other_operators))
        # select applicable operator with highest q-value (ties broken randomly)
        return int(np.random.choice(max_val_operators))

    def _update_q_table(self, previous_state: State, action: int, next_state: State, policy: str = 'PRANDOM', method: str = 'QL'):
        """Updates Q(previous_state, action) in place after moving to next_state.

        Parameters
        ----------
        previous_state: State
            State in which ``action`` was taken.

        action: int
            Column index of the operator that was applied.

        next_state: State
            State reached after applying ``action``.

        policy: ['PRANDOM' | 'PEXPLOIT' | 'PGREEDY']
            Policy used to pick the follow-up operator (SARSA only).

        method: ['QL' | 'SARSA']
            Update rule to apply.

        Raises
        ------
        ValueError
            If ``method`` is not 'QL' or 'SARSA'.
        """
        previous_row = self._encode_state(previous_state)
        next_row = self._encode_state(next_state)
        current_q = self.q_table[previous_row, action]

        # NOTE(review): the two branches pass different locations to
        # world.reward (previous state for SARSA, next state for QL). Kept as
        # written - confirm which one world.reward expects.
        if method == 'SARSA':
            reward = world.reward((previous_state.i, previous_state.j), action)
            # NOTE(review): this samples a follow-up operator here; it is not
            # necessarily the operator the agent goes on to execute.
            next_action = self._next_operator(next_state, policy)
            target = reward + world.gamma * self.q_table[next_row, next_action]
            self.q_table[previous_row, action] = current_q + world.alpha * (target - current_q)
        elif method == 'QL':
            reward = world.reward((next_state.i, next_state.j), action)
            next_operators = np.asarray(world.applicable_operators(next_state), dtype=int)
            # With nothing applicable in the next state there is no future value.
            if next_operators.size > 0:
                best_next_q = self.q_table[next_row, next_operators].max()
            else:
                best_next_q = 0.0
            self.q_table[previous_row, action] = (
                (1 - world.alpha) * current_q +
                world.alpha * (reward + world.gamma * best_next_q)
            )
        else:
            raise ValueError('Error: Invalid method')
