# Solution to a queue problem using TD Learning

# 1 - Imports

In [31]:
%load_ext autoreload
%autoreload 2
import class_queue as cq
import numpy as np
from tqdm.notebook import tqdm

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 2 - Env creation

## 2.1 - Creation and printout of the env properties

In [43]:
# Build the environment with six sinks; each inner list names the utilities
# offered by the sink at that position. All other settings use Queue defaults.
queue = cq.Queue(
    num_sinks=6,
    array_utilities=[
        ['soap', 'washing'],
        ['soap', 'washing'],
        ['towel', 'washing'],
        ['soap', 'washing'],
        ['towel', 'washing'],
        ['washing'],
    ],
)

In [42]:
# Sink configuration (the trailing '\n' leaves a blank line after the group).
print(
    f'Sink count: {queue.num_sinks} sinks',
    f'Utilities: {queue.array_utilities}',
    f'Sink avaiability: {queue.sinks_availability}\n',
    sep='\n',
)

# Queue dynamics and current occupation.
print(
    f'Queue growth: {queue.queue_growth} per iterarion',
    f'Max away: {queue.away_max_size}',
    f'Num agents: {len(queue.agents)}',
    f'Queue occupation status: {queue.get_occupation()}\n',
    sep='\n',
)

# Task durations and the agents currently held by the queue.
print(
    f'Time spent for each task (mean and sd): {queue.queue_times}',
    f'Agents in queue: {queue.agents}',
    sep='\n',
)

Sink count: 6 sinks
Utilities: [['soap', 'washing'], ['soap', 'washing'], ['towel', 'washing'], ['soap', 'washing'], ['towel', 'washing'], ['washing']]
Sink avaiability: 001000

Queue growth: 10 per iterarion
Max away: 5
Num agents: 2
Queue occupation status: MEDIUM

Time spent for each task (mean and sd): {'soap': [10, 1], 'towel': [5, 0.5], 'washing': [3, 0.5]}
Agents in queue: [<class_queue_old_old.Queue_agent object at 0x7d980a52df60>, <class_queue_old_old.Queue_agent object at 0x7d980a52ff70>]


# Tests

In [44]:
# Sinks parameters: one utilities list per sink, plus the [mean, sd] time
# spent on each task.
sink_params = {
    'num_sinks': 6,
    'array_utilities': [
        ['soap', 'washing'],
        ['soap', 'washing'],
        ['towel', 'washing'],
        ['soap', 'washing'],
        ['towel', 'washing'],
        ['washing'],
    ],
    'queue_times': {'soap': [10, 1], 'towel': [5, 0.5], 'washing': [3, 0.5]},
}

# Queue parameters
queue_params = {
    'queue_growth': 10,
    'away_max_size': 5,
}

# Optimization parameters; mode can be 'collectivism' or 'egocentric'
optimization_params = {
    'mode': 'collectivism',
    'collectivism_param_decay': 0.05,
    'collectivism_param_mult': 20,
    'egocentric_penalty': -1,
    'egocentric_terminal_reward': 20,
}

# SARSA parameters
sarsa_params = {
    'sarsa_alpha': 0.1,
    'sarsa_gamma': 0.1,
}

# Policy parameters; policy can be 'random' or 'e-soft'
policy_params = {
    'policy': 'random',
    'policy_epsilon': 0.5,  # The lower, the greedier
}

queue = cq.Queue(
    **sink_params,
    **queue_params,
    **optimization_params,
    **sarsa_params,
    **policy_params,
    q_table=None,
)

In [40]:
# Start from a freshly created Q-table and a reset environment state.
queue.q_table = queue.get_new_q_table()
queue.reset_state()

# History collected once per iteration: queue length and the value returned
# by one_iteration.
num_agents, rewards = [], []

progress = tqdm(range(int(1e6)), delay=0, miniters=1, smoothing=0)
for i in progress:
    # Advance the environment one step with optimization enabled.
    reward = queue.one_iteration(optimize=True)

    rewards.append(reward)
    num_agents.append(len(queue.agents))

  0%|          | 0/1000000 [00:00<?, ?it/s]

KeyboardInterrupt: 

# Debug

In [22]:
# Debug check: step the environment with optimize=False and, for every agent
# returned, verify that the index computed by queue.get_q_value_index matches
# the index stored on the agent (state_idx / last_state_idx).
#
# NOTE(review): the recorded output of this cell is
# "TypeError: 'float' object is not iterable", and the training cell stores the
# result of one_iteration(optimize=True) as a single reward. Confirm that
# one_iteration(optimize=False) actually returns an iterable of agents before
# relying on this cell.
queue.reset_state()
for i in tqdm(range(100), delay=1, miniters=1):
    agents_for_optimization = queue.one_iteration(optimize=False)
    for agent in agents_for_optimization:
        # Use an identity test for None (PEP 8) instead of `!= None`, which
        # dispatches to the object's own comparison method.
        if agent.state != 'done' and agent.last_state is not None:
            # Short-circuits exactly like the original: the previous-step
            # lookup only runs when the current-step index already matches.
            indices_match = (
                queue.get_q_value_index(agent.state, agent.action) == agent.state_idx
                and queue.get_q_value_index(agent.last_state, agent.last_action) == agent.last_state_idx
            )
            if not indices_match:
                print('Erro')
        else:
            # Agents that are 'done' or have no previous state are just dumped
            # for manual inspection.
            print(agent.last_state, agent.last_action, agent.state, agent.action)

  0%|          | 0/100 [00:00<?, ?it/s]

TypeError: 'float' object is not iterable