# Solution to a queue problem using TD Learning

# 1 - Imports

In [1]:
%load_ext autoreload
%autoreload 2
import class_queue as cq
import numpy as np
from tqdm.notebook import tqdm
import plotly.express as px
import plotly.graph_objects as go

# Experiments

In [269]:
queue = cq.Queue(
    # Sinks parameters
    num_sinks=6,
    array_utilities=[['soap', 'washing'], ['soap', 'washing'], ['towel', 'washing'], ['soap', 'washing'], ['towel', 'washing'], ['washing']], 
    queue_times={'soap': [5, .5], 'washing': [3, .5], 'towel': [2, .5]},
    
    # Queue parameters
    queue_growth = 3, # Number of iterations until a new agent is added to the queue
    away_max_size = 10, # Max number of agents waiting

    # Optimization parameters 
    mode='egocentric', # mode can be 'collectivism' or 'egocentric'
    collectivism_param_decay = 0.5, collectivism_param_mult = 20, 
    egocentric_penalty = -1, egocentric_terminal_reward = 20,

    # SARSA parameters
    sarsa_alpha=0.3, sarsa_gamma=0.6,

    # Policy parameters
    policy='e-soft', # can be 'random' or 'e-soft'
    policy_epsilon = 0.1, # The lower, the greeder

    q_table = None
)

In [270]:
# Reset table
queue.q_table = queue.get_new_q_table()
queue.reset_state()

# Store results
num_agents = []
rewards = []

for i in tqdm(range(int(1e5)), smoothing=0):
    reward = queue.one_iteration(optimize=True)

    num_agents.append(len(queue.agents))
    rewards.append(reward)

  0%|          | 0/100000 [00:00<?, ?it/s]

In [271]:
fig = go.Figure()

# Add traces
# Agents
num_agents_np = np.array(num_agents)
window_width = 1000
cumsum_vec = np.cumsum(np.insert(num_agents_np, 0, 0)) 
num_agents_avg = (cumsum_vec[window_width:] - cumsum_vec[:-window_width]) / window_width

# fig.add_trace(go.Scatter(x=np.arange(len(num_agents)), y=np.array(num_agents),
#                     mode='lines',
#                     name='Num agents'))

fig.add_trace(go.Scatter(x=np.arange(window_width,window_width+len(num_agents_avg)), y=num_agents_avg,
                    mode='lines',
                    name='Num agents avg'))

# Rewards
rewards_np = np.array(rewards)
# window_width = 1000
cumsum_vec = np.cumsum(np.insert(rewards_np, 0, 0)) 
rewards_avg = (cumsum_vec[window_width:] - cumsum_vec[:-window_width]) / window_width

# fig.add_trace(go.Scatter(x=np.arange(len(rewards)), y=np.array(rewards),
#                     mode='lines',
#                     name='Rewards'))

fig.add_trace(go.Scatter(x=np.arange(window_width,window_width+len(rewards_avg)), y=rewards_avg,
                    mode='lines',
                    name='Rewards avg'))

fig.update_layout({
    'title': f'Average in last 1k steps:  Reward: {np.mean(rewards_np[-10000:]):.2f} | Num agents: {np.mean(num_agents_np[-10000:]):.2f}',
})

fig.show()


In [243]:
queue.q_table.iloc[queue.q_table['Q'].argmax()]

POS               5
NEEDS          soap
SINKS        001111
QUEUE          HIGH
ACTION         away
Q         140.01123
Name: 36280, dtype: object

In [229]:
np.corrcoef(rewards, num_agents)[1,0]

-0.24074408707472608

# Debug

In [7]:
queue.reset_state()
for i in tqdm(range(int(100)), delay=1, miniters=1):
    agents_for_optimization = queue.one_iteration(optimize=False)
    for agent in agents_for_optimization:
        if (agent.state != 'done') and (agent.last_state!=None):
            if not ((queue.get_q_value_index(agent.state, agent.action) == agent.state_idx) and (queue.get_q_value_index(agent.last_state, agent.last_action) == agent.last_state_idx)):
                print('Erro')
        else:
            print(agent.last_state, agent.last_action, agent.state, agent.action)

  0%|          | 0/100 [00:00<?, ?it/s]

TypeError: 'float' object is not iterable