In [1]:
import numpy as np
from collections import namedtuple

In [2]:
# Immutable snapshot that QTable._encode_state turns into a Q-table row index.
# my_i/my_j and other_i/other_j are the board coordinates of the acting agent
# and of the other agent; x and a..f are each weighted as a single binary
# digit by the encoder (presumably 0/1 flags -- TODO confirm what each means).
State = namedtuple(
    'State',
    'my_i my_j x other_i other_j a b c d e f',
)

### PDWorld

In [None]:
# Syed - applicable operators, apply

class PDWorld:
    """Two-agent pickup / drop-off grid world.

    Holds the board, the learning parameters, both agents and the number of
    blocks at every pickup and drop-off cell. Cells are 0-based ``(i, j)``
    tuples where ``i`` grows towards the south and ``j`` towards the east.
    """

    def __init__(self, size, alpha, gamma, pickup_capacity, dropoff_capacity, pickup_locations, dropoff_locations):
        """Create a world with the default pickup / drop-off layout.

        Arguments:
        - size: side length of the square board
        - alpha: learning rate used by the Q-table update
        - gamma: discount factor used by the Q-table update
        - pickup_capacity: number of blocks initially at each pickup cell
        - dropoff_capacity: maximum number of blocks a drop-off cell accepts
        - pickup_locations: currently not used (see note in the body)
        - dropoff_locations: currently not used (see note in the body)

        Typical values (locations written 1-based):
            size = 5, alpha = 0.3, gamma = 0.5,
            pickup_capacity = 10, dropoff_capacity = 5,
            pickup_locations = [(3,5), (4,2)],
            dropoff_locations = [(1,1), (1,5), (3,3), (5,5)]
        """
        self.size = size  # read by QTable through the global world object
        self.board = np.zeros((size, size))
        self.alpha = alpha
        self.gamma = gamma
        self.pickup_capacity = pickup_capacity
        self.dropoff_capacity = dropoff_capacity
        self.female_agent = Agent(None, agent="F")  # ToDo: give qtable
        self.male_agent = Agent(None, agent="M")  # ToDo: give qtable
        self.iteration = 0
        self.turn = 0  # 0 if female agent's turn, 1 otherwise

        # NOTE(review): the pickup_locations / dropoff_locations arguments are
        # ignored; the layout below is the 0-based form of the typical values
        # listed in the docstring. Use setup() to install a different layout.
        self.dropoff_locations = {
            (0, 0): 0,
            (0, 4): 0,
            (2, 2): 0,
            (4, 4): 0,
        }
        self.pickup_locations = {
            (2, 4): pickup_capacity,
            (3, 1): pickup_capacity,
        }

    def setup(self, size: int, pickup_locations: list = None, dropoff_locations: list = None):
        """Resize the board and optionally replace the pickup / drop-off cells.

        Arguments:
        - size: size of square board
        - pickup_locations: list of tuples (i,j) of pickup locations; every
          listed cell is refilled to the world's pickup capacity
        - dropoff_locations: list of tuples (i,j) of dropoff locations; every
          listed cell starts empty

        A missing or empty list leaves the corresponding layout untouched.
        """
        self.size = size
        self.board = np.zeros((size, size))

        if pickup_locations:
            # Mutate in place so existing references to the dict stay valid.
            self.pickup_locations.clear()
            self.pickup_locations.update(dict.fromkeys(pickup_locations, self.pickup_capacity))

        if dropoff_locations:
            self.dropoff_locations.clear()
            self.dropoff_locations.update(dict.fromkeys(dropoff_locations, 0))

    def applicable_operators(self, agent):
        """Return the operators the agent may apply from its current cell.

        Arguments:
        - agent: object exposing i, j, i_other_agent, j_other_agent and
          carrying_block

        Returns a list holding, in this order, the legal moves among
        "n", "s", "e", "w", then "d" if the agent carries a block and stands on
        a drop-off cell that is not full, then "p" if it carries nothing and
        stands on a pickup cell that still has blocks.
        """
        rows, cols = self.board.shape
        here = (agent.i, agent.j)
        other = (agent.i_other_agent, agent.j_other_agent)

        targets = {
            "n": (agent.i - 1, agent.j),
            "s": (agent.i + 1, agent.j),
            "e": (agent.i, agent.j + 1),
            "w": (agent.i, agent.j - 1),
        }
        # A move is legal when the target cell is on the board and is not the
        # exact cell occupied by the other agent (both coordinates must match).
        applicable_ops = [
            op for op, (i, j) in targets.items()
            if 0 <= i < rows and 0 <= j < cols and (i, j) != other
        ]

        if agent.carrying_block and here in self.dropoff_locations \
                and self.dropoff_locations[here] < self.dropoff_capacity:
            applicable_ops.append("d")

        if not agent.carrying_block and self.pickup_locations.get(here, 0) > 0:
            applicable_ops.append("p")

        return applicable_ops

    def apply_operator(self, agent, operator):
        """Apply one operator to the agent and update the block counts.

        Arguments:
        - agent: the agent to move or whose block is picked up / dropped
        - operator: one of "n", "s", "e", "w", "d", "p"

        The operator is not checked against applicable_operators(); "d" and
        "p" raise KeyError when the agent is not on a drop-off / pickup cell.
        """
        assert operator in ["n", "s", "e", "w", "d", "p"], "Error: Unknown Operator"

        if operator == "n":
            agent.i -= 1
        elif operator == "s":
            agent.i += 1
        elif operator == "e":
            agent.j += 1
        elif operator == "w":
            agent.j -= 1  # west is a column move, mirroring "e"
        elif operator == "d":
            agent.carrying_block = False
            self.dropoff_locations[(agent.i, agent.j)] += 1
        elif operator == "p":
            agent.carrying_block = True
            self.pickup_locations[(agent.i, agent.j)] -= 1

    def run(self, steps=500, policy='PRANDOM', method='SARSA'):
        """Run the simulation -- not implemented yet."""
        pass

    def change_pickup_location(self, new_pickup_locations):
        """Replace the pickup cells, each refilled to the world's pickup capacity.

        Arguments:
        - new_pickup_locations: list of tuples (i,j) of new pickup locations on board
        """
        self.pickup_locations.clear()
        self.pickup_locations.update(dict.fromkeys(new_pickup_locations, self.pickup_capacity))

    def save_visual_midrun(self):
        """Save a visualisation of the current run -- not implemented yet."""
        pass


class Agent:
    """One of the two agents on the board.

    Tracks its own 0-based (i, j) cell, its view of the other agent's cell,
    whether it is carrying a block, and the Q-table object it was given.
    """

    def __init__(self, q_table_obj, agent="F"):
        """Place the agent on its start cell.

        Arguments:
        - q_table_obj: Q-table object for this agent (may be None); kept on
          the instance as ``q_table``
        - agent: "F" for the female agent; any other value is treated as the
          male agent
        """
        self.identifier = agent
        self.q_table = q_table_obj  # previously dropped on the floor
        self.carrying_block = False

        # Female agent starts at (1,3) in non-index position, male starts at (5,3);
        # both therefore share column index 2 and differ only in the row.
        if agent == "F":
            own_cell, other_cell = (0, 2), (4, 2)
        else:
            own_cell, other_cell = (4, 2), (0, 2)
        self.i, self.j = own_cell
        self.i_other_agent, self.j_other_agent = other_cell

    def update_position(self, new_i, new_j):
        """Set this agent's own cell to (new_i, new_j)."""
        self.i = new_i
        self.j = new_j

    def update_other_agent_position(self, new_i, new_j):
        """Set this agent's record of the other agent's cell to (new_i, new_j)."""
        self.i_other_agent = new_i
        self.j_other_agent = new_j


### Driver Code

In [None]:
# global PDWorld object
# PDWorld.__init__ has no default arguments, so the baseline values from the
# example in its docstring are passed explicitly.
world = PDWorld(size=5,
                alpha=0.3,
                gamma=0.5,
                pickup_capacity=10,
                dropoff_capacity=5,
                pickup_locations=[(3, 5), (4, 2)],
                dropoff_locations=[(1, 1), (1, 5), (3, 3), (5, 5)])

In [None]:
# functions to run experiments
def experiment_1a():
    """Experiment 1a: 500 PRANDOM steps followed by 7500 more PRANDOM steps.

    Resets the global ``world`` to the baseline configuration first: 5x5
    board, alpha 0.3, gamma 0.5, 10 blocks per pickup cell, drop-off
    capacity 5, agents on their start cells.
    """
    # PDWorld.setup() only accepts size / pickup_locations / dropoff_locations,
    # so the remaining parameters are assigned on the world directly.
    world.alpha = 0.3
    world.gamma = 0.5
    world.pickup_capacity = 10
    world.dropoff_capacity = 5

    # The board logic compares against 0-based agent coordinates, so the
    # 1-based cells (3,5), (4,2) and (1,1), (1,5), (3,3), (5,5) are written
    # shifted by one here.
    world.setup(size=5,
                pickup_locations=[(2, 4), (3, 1)],
                dropoff_locations=[(0, 0), (0, 4), (2, 2), (4, 4)])

    # Agent start cells (1,3) and (5,3), again in 0-based form.
    world.female_agent.update_position(0, 2)
    world.female_agent.update_other_agent_position(4, 2)
    world.male_agent.update_position(4, 2)
    world.male_agent.update_other_agent_position(0, 2)

    world.run(steps=500, policy='PRANDOM')
    world.run(steps=7500, policy='PRANDOM')
    # NOTE(review): no summary() method is visible on PDWorld in this
    # notebook -- confirm it exists before running this experiment.
    world.summary()

# def experiment_1b():
#     world.setup(size = 5,
#                 alpha = 0.3,
#                 gamma = 0.5,
#                 pickup_capacity = 10,
#                 dropoff_capacity = 5,
#                 agent_start_locations = [(1,3), (5,3)],
#                 pickup_locations = [(3,5), (4,2)],
#                 dropoff_locations = [(1,1), (1,5), (3,3), (5,5)])
#     world.run(steps=500, policy='PRANDOM')
#     world.run(steps=7500, policy='PGREEDY')
#     world.summary()

# def experiment_1c():
#     world.setup(size = 5,
#                 alpha = 0.3,
#                 gamma = 0.5,
#                 pickup_capacity = 10,
#                 dropoff_capacity = 5,
#                 agent_start_locations = [(1,3), (5,3)],
#                 pickup_locations = [(3,5), (4,2)],
#                 dropoff_locations = [(1,1), (1,5), (3,3), (5,5)])
#     world.run(steps=500, policy='PRANDOM')
#     world.run(steps=7500, policy='PEXPLOIT')
#     world.summary()
#     world.display_q_table(agent='male')

# def experiment_2():
#     world.setup(size = 5,
#                 alpha = 0.3,
#                 gamma = 0.5,
#                 pickup_capacity = 10,
#                 dropoff_capacity = 5,
#                 agent_start_locations = [(1,3), (5,3)],
#                 pickup_locations = [(3,5), (4,2)],
#                 dropoff_locations = [(1,1), (1,5), (3,3), (5,5)])
#     world.run(steps=500, policy='PRANDOM', method='SARSA')
#     world.run(steps=7500, policy='PEXPLOIT', method='SARSA')
#     world.summary()
#     world.display_q_table(agent='male')

# def experiment_3a():
#     world.setup(size = 5,
#                 alpha = 0.15,
#                 gamma = 0.5,
#                 pickup_capacity = 10,
#                 dropoff_capacity = 5,
#                 agent_start_locations = [(1,3), (5,3)],
#                 pickup_locations = [(3,5), (4,2)],
#                 dropoff_locations = [(1,1), (1,5), (3,3), (5,5)])
#     world.run(steps=500, policy='PRANDOM')
#     world.run(steps=7500, policy='PEXPLOIT')
#     world.summary()
#     world.display_q_table(agent='male')

# def experiment_3b():
#     world.setup(size = 5,
#                 alpha = 0.45,
#                 gamma = 0.5,
#                 pickup_capacity = 10,
#                 dropoff_capacity = 5,
#                 agent_start_locations = [(1,3), (5,3)],
#                 pickup_locations = [(3,5), (4,2)],
#                 dropoff_locations = [(1,1), (1,5), (3,3), (5,5)])
#     world.run(steps=500, policy='PRANDOM')
#     world.run(steps=7500, policy='PEXPLOIT')
#     world.summary()
#     world.display_q_table(agent='male')

# def experiment_4():
#     world.setup(size = 5,
#                 alpha = 0.3,
#                 gamma = 0.5,
#                 pickup_capacity = 10,
#                 dropoff_capacity = 5,
#                 agent_start_locations = [(1,3), (5,3)],
#                 pickup_locations = [(3,5), (4,2)],
#                 dropoff_locations = [(1,1), (1,5), (3,3), (5,5)])
#     world.run(steps=500, policy='PRANDOM')
#     world.run(total_runs=3, policy='PEXPLOIT', animate=True)
#     world.summary()
#     world.display_q_table(agent='male')
    
#     world.setup(pickup_locations = [(1,2), (4,5)])
#     world.run(total_runs=3, policy='PEXPLOIT', animate=True)
#     world.summary()
#     world.display_q_table(agent='male')


### Q-Table

In [None]:
class QTable:
    """Tabular Q-values: one row per encoded State, one column per operator.

    Reads the board size and the learning parameters from the module-level
    ``world`` object.
    """

    # Column of each operator. Columns 0-3 must be the moves because
    # _update_q_table charges world.penalty for ``action < 4``.
    # NOTE(review): the d/p order follows the operator list in
    # PDWorld.apply_operator -- confirm once next_operator is implemented.
    _OPERATOR_COLUMNS = {"n": 0, "s": 1, "e": 2, "w": 3, "d": 4, "p": 5}

    def __init__(self):
        # size**4 position combinations (both agents) times 2**7 binary flags.
        self.q_table = np.zeros((world.size**4 * 2**7, 6))

    @staticmethod
    def _encode_state(state: State) -> int:
        """Encodes the given state into its row index in the Q-table

        Declared static so that both ``self._encode_state(state)`` and
        ``QTable._encode_state(state)`` work.

        Parameters
        ----------
        state : State
            Named tuple containing state information

        Returns
        -------
        int
            integer index of state in Q-table
        """
        # Mixed-radix number with digits, most significant first:
        # my_i, my_j (base size), x (base 2), other_i, other_j (base size),
        # a..f (base 2 each).
        return (
            state.my_i * world.size**3 * 2**7 +
            state.my_j * world.size**2 * 2**7 +
            state.x * world.size**2 * 2**6 +
            state.other_i * world.size * 2**6 +
            state.other_j * 2**6 +
            state.a * 2**5 +
            state.b * 2**4 +
            state.c * 2**3 +
            state.d * 2**2 +
            state.e * 2 +
            state.f)

    def next_operator(self, current_state: State, method: str = 'QL', policy: str = 'PRANDOM'):
        """Choose the next operator for ``current_state`` -- not implemented yet.

        Currently only queries the applicable operators and returns None.
        """
        # TODO: select among these according to ``policy``.
        applicable_operators = world.applicable_operators(current_state)

        pass

    def _update_q_table(self, current_state: State, action: int, next_state: State,
                        method: str = 'QL', next_action: int = None):
        """Update Q(current_state, action) after observing ``next_state``.

        Parameters
        ----------
        current_state : State
            State in which ``action`` was taken
        action : int
            Column index of the applied operator; values below 4 earn
            ``world.penalty``, the others ``world.reward``
        next_state : State
            State reached after applying ``action``
        method : str
            'SARSA' bootstraps from Q(next_state, next_action); any other
            value uses the Q-learning target, the largest Q-value among the
            operators applicable in ``next_state``
        next_action : int
            Column index of the operator chosen in ``next_state``; required
            for 'SARSA', ignored otherwise

        Raises
        ------
        ValueError
            If ``method`` is 'SARSA' and ``next_action`` is not given
        """
        # NOTE(review): PDWorld.__init__ in this notebook does not set
        # world.penalty / world.reward -- they must be assigned before use.
        row = self._encode_state(current_state)
        next_row = self._encode_state(next_state)
        reward = world.penalty if action < 4 else world.reward

        if method == 'SARSA':
            if next_action is None:
                raise ValueError("SARSA update requires next_action")
            future_value = self.q_table[next_row, next_action]
        else:
            # NOTE(review): PDWorld.applicable_operators reads agent-style
            # attributes (i, j, ...); confirm it accepts a State.
            # Operators may arrive as letters or as column indices; letters
            # are translated, anything else is used as the column directly.
            columns = [self._OPERATOR_COLUMNS.get(op, op)
                       for op in world.applicable_operators(next_state)]
            # Largest Q-value among the applicable operators (0.0 if none).
            future_value = max((self.q_table[next_row, col] for col in columns), default=0.0)

        self.q_table[row, action] = (
            (1 - world.alpha) * self.q_table[row, action] +
            world.alpha * (reward + world.gamma * future_value)
        )
            )
