# Solution to a queue problem using TD Learning and NN

To do:
- Create table to initialize network
    - Define new state
    - get_new_q_table
- Value update
    - multi_agents_SARSA_step
- Action selection
    - random
    - e-soft
    - greedy

# 1 - Imports

In [83]:
%load_ext autoreload
%autoreload 2
# Standard library
import copy

# Third-party
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from tqdm.notebook import tqdm

# Pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init
import torch.optim as optim
from torch.utils.data import DataLoader, random_split, TensorDataset

# Project-local queue simulations
import class_queue as cq_table  # tabular (Q-table) implementation
import class_queue_nn as cq  # neural-network implementation
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 2 - Experiments

## 2.1 - Collectivism

### 2.1.1 - $\varepsilon$-soft Policy

In [None]:
# Experiment 2.1.1: train with the e-soft policy under the collectivism reward.
# NOTE(review): `cq` is bound to class_queue_nn in the imports cell, while this
# cell uses the tabular names (`q_table`, `get_new_q_table`) — confirm the module
# still exposes them.
queue_config = dict(
    # Sinks parameters
    num_sinks=6,
    array_utilities=[
        ['soap', 'washing'],
        ['soap', 'washing'],
        ['towel', 'washing'],
        ['soap', 'washing'],
        ['towel', 'washing'],
        ['washing'],
    ],
    queue_times={'soap': [5, .5], 'washing': [3, .5], 'towel': [2, .5]},

    # Queue parameters
    queue_growth=3,    # Number of iterations until a new agent is added to the queue
    away_max_size=10,  # Max number of agents waiting

    # Optimization parameters
    mode='collectivism',  # mode can be 'collectivism' or 'egocentric'
    collectivism_param_decay=0.8,
    collectivism_param_reward_scaling=1,
    egocentric_penalty=-1,
    egocentric_terminal_reward=20,

    # SARSA parameters
    sarsa_alpha=0.3,  # learning rate
    sarsa_gamma=0.9,  # importance of the future

    # Policy parameters
    policy_epsilon=0.4,  # The lower, the greedier

    q_table=None,
)
queue = cq.Queue(**queue_config)

# Start from a fresh Q-table and a fresh environment state
queue.q_table = queue.get_new_q_table()
queue.reset_state()

# Per-iteration history: len(queue.agents) and the reward returned by each step
agent_counts, step_rewards = [], []
results_e_soft_collectvism = {
    'num_agents': agent_counts,
    'rewards': step_rewards,
}
for step in tqdm(range(30_000), smoothing=0):
    # Policy can be 'random' or 'e-soft'
    reward = queue.one_iteration(optimize=True, policy='e-soft')

    agent_counts.append(len(queue.agents))
    step_rewards.append(reward)

In [None]:
# Plot the training history recorded for the e-soft / collectivism run
cq.plot_agents_and_rewards(
    title='e-soft Policy with Collectivism Reward',
    window_size=1000,
    agents_and_rewards_dict=results_e_soft_collectvism,
    mean_of_all=False,
)

In [None]:
# Greedy results after optimization: run the greedy policy with optimize=False
# on the queue trained above, starting from a reset state.
queue.reset_state()

# Per-iteration history: len(queue.agents) and the reward returned by each step
greedy_agent_counts, greedy_rewards = [], []
results_e_soft_collectvism_greedy = {
    'num_agents': greedy_agent_counts,
    'rewards': greedy_rewards,
}
for step in tqdm(range(1_000), smoothing=0):
    reward = queue.one_iteration(optimize=False, policy='greedy')

    greedy_agent_counts.append(len(queue.agents))
    greedy_rewards.append(reward)

cq.plot_agents_and_rewards(
    title='Greedy Policy with Trained with e-soft policy and Collectivism Reward',
    window_size=100,
    agents_and_rewards_dict=results_e_soft_collectvism_greedy,
    mean_of_all=True,
)


### 2.1.2 - Random Policy

In [None]:
# Experiment 2.1.2: random-policy baseline under the collectivism reward
# (optimize=False, so one_iteration is only asked to simulate).
# NOTE(review): `cq` is bound to class_queue_nn in the imports cell, while this
# cell uses the tabular names (`q_table`, `get_new_q_table`) — confirm the module
# still exposes them.
queue_config = dict(
    # Sinks parameters
    num_sinks=6,
    array_utilities=[
        ['soap', 'washing'],
        ['soap', 'washing'],
        ['towel', 'washing'],
        ['soap', 'washing'],
        ['towel', 'washing'],
        ['washing'],
    ],
    queue_times={'soap': [5, .5], 'washing': [3, .5], 'towel': [2, .5]},

    # Queue parameters
    queue_growth=3,    # Number of iterations until a new agent is added to the queue
    away_max_size=10,  # Max number of agents waiting

    # Optimization parameters
    mode='collectivism',  # mode can be 'collectivism' or 'egocentric'
    collectivism_param_decay=0.8,
    collectivism_param_reward_scaling=1,
    egocentric_penalty=-1,
    egocentric_terminal_reward=1,

    # SARSA parameters
    sarsa_alpha=0.3,
    sarsa_gamma=0.6,

    # Policy parameters
    policy_epsilon=0.1,  # The lower, the greedier

    q_table=None,
)
queue = cq.Queue(**queue_config)

# Start from a fresh Q-table and a fresh environment state
queue.q_table = queue.get_new_q_table()
queue.reset_state()

# Per-iteration history: len(queue.agents) and the reward returned by each step
agent_counts, step_rewards = [], []
results_random_collectvism = {
    'num_agents': agent_counts,
    'rewards': step_rewards,
}
for step in tqdm(range(100_000), smoothing=0):
    # Policy can be 'random' or 'e-soft'
    reward = queue.one_iteration(optimize=False, policy='random')

    agent_counts.append(len(queue.agents))
    step_rewards.append(reward)

In [None]:
# Plot the history recorded for the random-policy / collectivism run
cq.plot_agents_and_rewards(
    title='Random Policy with Collectivism Reward',
    window_size=10000,
    agents_and_rewards_dict=results_random_collectvism,
    mean_of_all=True,
)

## 2.2 - Egocentric

### 2.2.1 - $\varepsilon$-soft Policy

In [None]:
# Experiment 2.2.1: train with the e-soft policy under the egocentric reward.
# NOTE(review): `cq` is bound to class_queue_nn in the imports cell, while this
# cell uses the tabular names (`q_table`, `get_new_q_table`) — confirm the module
# still exposes them.
queue_config = dict(
    # Sinks parameters
    num_sinks=6,
    array_utilities=[
        ['soap', 'washing'],
        ['soap', 'washing'],
        ['towel', 'washing'],
        ['soap', 'washing'],
        ['towel', 'washing'],
        ['washing'],
    ],
    queue_times={'soap': [5, .5], 'washing': [3, .5], 'towel': [2, .5]},

    # Queue parameters
    queue_growth=3,    # Number of iterations until a new agent is added to the queue
    away_max_size=10,  # Max number of agents waiting

    # Optimization parameters
    mode='egocentric',  # mode can be 'collectivism' or 'egocentric'
    collectivism_param_decay=0.5,
    collectivism_param_reward_scaling=1,
    egocentric_penalty=-1,
    egocentric_terminal_reward=20,

    # SARSA parameters
    sarsa_alpha=0.3,  # learning rate
    sarsa_gamma=0.9,  # importance of the future

    # Policy parameters
    policy_epsilon=0.4,  # The lower, the greedier

    q_table=None,
)
queue = cq.Queue(**queue_config)

# Start from a fresh Q-table and a fresh environment state
queue.q_table = queue.get_new_q_table()
queue.reset_state()

# Per-iteration history: len(queue.agents) and the reward returned by each step
agent_counts, step_rewards = [], []
results_e_soft_egocentric = {
    'num_agents': agent_counts,
    'rewards': step_rewards,
}
for step in tqdm(range(10_000), smoothing=0):
    # Policy can be 'random' or 'e-soft'
    reward = queue.one_iteration(optimize=True, policy='e-soft')

    agent_counts.append(len(queue.agents))
    step_rewards.append(reward)

In [None]:
# Plot the training history recorded for the e-soft / egocentric run
cq.plot_agents_and_rewards(
    title='e-soft Policy with Egocentric Reward',
    window_size=1000,
    agents_and_rewards_dict=results_e_soft_egocentric,
    mean_of_all=False,
)

In [None]:
# Greedy results after optimization: run the greedy policy with optimize=False
# on the queue trained above, starting from a reset state.
queue.reset_state()

# Per-iteration history: len(queue.agents) and the reward returned by each step
greedy_agent_counts, greedy_rewards = [], []
results_e_soft_egocentric_greedy = {
    'num_agents': greedy_agent_counts,
    'rewards': greedy_rewards,
}
for step in tqdm(range(1_000), smoothing=0):
    reward = queue.one_iteration(optimize=False, policy='greedy')

    greedy_agent_counts.append(len(queue.agents))
    greedy_rewards.append(reward)

cq.plot_agents_and_rewards(
    title='Greedy Policy with Trained with e-soft policy and Egocentric Reward',
    window_size=100,
    agents_and_rewards_dict=results_e_soft_egocentric_greedy,
    mean_of_all=True,
)


### 2.2.2 - Random Policy

In [None]:
# Experiment 2.2.2: random-policy baseline under the egocentric reward
# (optimize=False, so one_iteration is only asked to simulate).
# NOTE(review): `cq` is bound to class_queue_nn in the imports cell, while this
# cell uses the tabular names (`q_table`, `get_new_q_table`) — confirm the module
# still exposes them.
queue_config = dict(
    # Sinks parameters
    num_sinks=6,
    array_utilities=[
        ['soap', 'washing'],
        ['soap', 'washing'],
        ['towel', 'washing'],
        ['soap', 'washing'],
        ['towel', 'washing'],
        ['washing'],
    ],
    queue_times={'soap': [5, .5], 'washing': [3, .5], 'towel': [2, .5]},

    # Queue parameters
    queue_growth=3,    # Number of iterations until a new agent is added to the queue
    away_max_size=10,  # Max number of agents waiting

    # Optimization parameters
    mode='egocentric',  # mode can be 'collectivism' or 'egocentric'
    collectivism_param_decay=0.5,
    collectivism_param_reward_scaling=1,
    egocentric_penalty=-1,
    egocentric_terminal_reward=20,

    # SARSA parameters
    sarsa_alpha=0.3,
    sarsa_gamma=0.6,

    # Policy parameters
    policy_epsilon=0.1,  # The lower, the greedier

    q_table=None,
)
queue = cq.Queue(**queue_config)

# Start from a fresh Q-table and a fresh environment state
queue.q_table = queue.get_new_q_table()
queue.reset_state()

# Per-iteration history: len(queue.agents) and the reward returned by each step
agent_counts, step_rewards = [], []
results_random_egocentric = {
    'num_agents': agent_counts,
    'rewards': step_rewards,
}
for step in tqdm(range(10_000), smoothing=0):
    # Policy can be 'random' or 'e-soft'
    reward = queue.one_iteration(optimize=False, policy='random')

    agent_counts.append(len(queue.agents))
    step_rewards.append(reward)

In [None]:
# Plot the history recorded for the random-policy / egocentric run
cq.plot_agents_and_rewards(
    title='Random Policy with Egocentric Reward',
    window_size=1000,
    agents_and_rewards_dict=results_random_egocentric,
    mean_of_all=True,
)

## Sink Layout Experiments

In [None]:
# Sink-layout sweep: for each of the 6 sinks, append an extra 'soap' utility to
# that sink, train with the e-soft policy (collectivism reward), then evaluate
# with the greedy policy. Both phases are plotted per position.
#
# The result dicts are created once, outside the loop, so the per-position keys
# (`num_agents_<pos>`, `rewards_<pos>`) accumulate for every position instead of
# being discarded when the next position starts.
# NOTE(review): `cq` is bound to class_queue_nn in the imports cell, while this
# cell uses the tabular names (`q_table`, `get_new_q_table`) — confirm the module
# still exposes them.
results_e_soft_collectvism = {}
results_e_soft_collectvism_greedy = {}

for soap_bottle_position in range(6):
    # Rebuild the base layout on every pass so the append below never leaks
    # into the next position.
    array_utilities = [
        ['soap', 'washing'],
        ['soap', 'washing'],
        ['towel', 'washing'],
        ['soap', 'washing'],
        ['towel', 'washing'],
        ['washing'],
    ]
    # NOTE(review): sinks 0, 1 and 3 already list 'soap', so they end up with a
    # duplicate entry — confirm that is the intended encoding of the extra bottle.
    array_utilities[soap_bottle_position].append('soap')

    queue = cq.Queue(
        # Sinks parameters
        num_sinks=6,
        array_utilities=array_utilities,
        queue_times={'soap': [5, .5], 'washing': [3, .5], 'towel': [2, .5]},

        # Queue parameters
        queue_growth=3,    # Number of iterations until a new agent is added to the queue
        away_max_size=10,  # Max number of agents waiting

        # Optimization parameters
        mode='collectivism',  # mode can be 'collectivism' or 'egocentric'
        collectivism_param_decay=0.8, collectivism_param_reward_scaling=1,
        egocentric_penalty=-1, egocentric_terminal_reward=20,

        # SARSA parameters
        sarsa_alpha=0.3, sarsa_gamma=0.9,  # sarsa_alpha is the learning rate, sarsa_gamma is the importance of the future

        # Policy parameters
        policy_epsilon=0.4,  # The lower, the greedier

        q_table=None
    )

    # Start from a fresh Q-table and a fresh environment state
    queue.q_table = queue.get_new_q_table()
    queue.reset_state()

    agents_key = f'num_agents_{soap_bottle_position}'
    rewards_key = f'rewards_{soap_bottle_position}'

    # --- Training phase (e-soft policy, optimize=True) ---
    results_e_soft_collectvism[agents_key] = []
    results_e_soft_collectvism[rewards_key] = []
    for i in tqdm(range(30_000), smoothing=0):
        # Policy can be 'random' or 'e-soft'
        reward = queue.one_iteration(optimize=True, policy='e-soft')

        results_e_soft_collectvism[agents_key].append(len(queue.agents))
        results_e_soft_collectvism[rewards_key].append(reward)

    # Plots (the helper is called with the un-suffixed keys)
    plot_dict = {
        'num_agents': results_e_soft_collectvism[agents_key],
        'rewards': results_e_soft_collectvism[rewards_key],
    }
    cq.plot_agents_and_rewards(
        title=f'e-soft Policy with Collectivism Reward - Bottle position: {soap_bottle_position}',
        window_size=1000,
        agents_and_rewards_dict=plot_dict,
        mean_of_all=False,
    )

    # --- Greedy results after optimization (optimize=False) ---
    queue.reset_state()

    results_e_soft_collectvism_greedy[agents_key] = []
    results_e_soft_collectvism_greedy[rewards_key] = []
    for i in tqdm(range(1_000), smoothing=0):
        reward = queue.one_iteration(optimize=False, policy='greedy')

        results_e_soft_collectvism_greedy[agents_key].append(len(queue.agents))
        results_e_soft_collectvism_greedy[rewards_key].append(reward)

    plot_dict = {
        'num_agents': results_e_soft_collectvism_greedy[agents_key],
        'rewards': results_e_soft_collectvism_greedy[rewards_key],
    }
    cq.plot_agents_and_rewards(
        title=f'Greedy Policy with Trained with e-soft policy and Collectivism Reward - Bottle Position: {soap_bottle_position}',
        window_size=100,
        agents_and_rewards_dict=plot_dict,
        mean_of_all=True,
    )

In [None]:
# Data from the graphs: one hand-copied value per soap bottle position.
# NOTE(review): presumably read off the greedy-policy plots of the sink-layout
# sweep — confirm, or compute them from the recorded results instead.
results_greedy_soap_bottle = np.array([
    2.87,
    2.91,
    2.84,
    2.96,
    2.82,
    3.00,
])

# The y-axis is clipped to [2.80, 3.05] so the small differences between bars are visible
fig = go.Figure(layout_yaxis_range=[2.80, 3.05])
# NOTE(review): the trace is named 'Num agents avg' while the title says
# "Average reward" — confirm which quantity these values are. The x values
# run 1..6, whereas the sweep loop indexes positions 0..5.
fig.add_trace(
    go.Bar(
        x=np.arange(1, 7),
        y=results_greedy_soap_bottle,
        name='Num agents avg',
    )
)
fig.update_layout({
    'title': 'Average reward of different soap bottle positions',
})
fig.show()

# Debug

In [84]:
# Scratch cell: nothing is assigned, the nested list is only echoed as the cell
# output. NOTE(review): looks like a candidate `array_utilities` layout — confirm
# or delete.
[
    ['towel', 'washing'],
    ['soap', 'washing', 'towel'],
    ['soap'],
    ['soap', 'washing', 'towel'],
    ['washing', 'towel'],
    ['soap', 'washing', 'towel'],
]

[['towel', 'washing'],
 ['soap', 'washing', 'towel'],
 ['soap'],
 ['soap', 'washing', 'towel'],
 ['washing', 'towel'],
 ['soap', 'washing', 'towel']]

In [90]:
# Debug run: build the neural-network-backed queue (collectivism reward).
nn_queue_config = dict(
    # Sinks parameters
    num_sinks=6,
    array_utilities=[
        ['soap', 'washing'],
        ['soap', 'washing'],
        ['towel', 'washing'],
        ['soap', 'washing'],
        ['towel', 'washing'],
        ['washing'],
    ],
    queue_times={'soap': [5, .5], 'washing': [3, .5], 'towel': [2, .5]},

    # Queue parameters
    queue_growth=3,    # Number of iterations until a new agent is added to the queue
    away_max_size=10,  # Max number of agents waiting

    # Optimization parameters
    mode='collectivism',  # mode can be 'collectivism' or 'egocentric'
    collectivism_param_decay=0.8,
    collectivism_param_reward_scaling=1,
    egocentric_penalty=-1,
    egocentric_terminal_reward=20,

    # SARSA parameters
    sarsa_alpha=0.001,  # learning rate, should be small enough to converge
    sarsa_gamma=0.9,    # importance of the future, should be close to 1

    # Policy parameters
    policy_epsilon=1,  # The lower, the greedier. The higher, the more exploratory
    # Decay of epsilon: a start value of 1 will make epsilon 0.3 after 20000 optimization steps
    policy_epsilon_decay=0.3 ** (1 / 20000),

    q_nn=None,
    n_neurons=256,
)
queue = cq.Queue(**nn_queue_config)

# The network is not replaced here (the line below is disabled); only the
# environment state is reset.
# queue.q_nn = queue.get_new_q_nn(n_neurons=1024).to(device)
queue.reset_state()

# Store results
# NOTE(review): the name says "random egocentric", but this dict is filled by the
# e-soft / collectivism NN training loop further down — consider renaming.
results_random_egocentric = {
    'num_agents': [],
    'rewards': [],
}

In [88]:
# Sanity check of the Q-network: evaluate it on every (state, action) pair that
# the tabular implementation enumerates, then print the mean and variance of
# its outputs.
# `cq_table` (class_queue) comes from the imports cell at the top of the notebook.
queue_table = cq_table.Queue(
    # Sinks parameters
    num_sinks=6,
    array_utilities=[['soap', 'washing'], ['soap', 'washing'], ['towel', 'washing'], ['soap', 'washing'], ['towel', 'washing'], ['washing']],
    queue_times={'soap': [5, .5], 'washing': [3, .5], 'towel': [2, .5]},

    # Queue parameters
    queue_growth=3,    # Number of iterations until a new agent is added to the queue
    away_max_size=10,  # Max number of agents waiting

    # Optimization parameters
    mode='collectivism',  # mode can be 'collectivism' or 'egocentric'
    collectivism_param_decay=0.8, collectivism_param_reward_scaling=1,
    egocentric_penalty=-1, egocentric_terminal_reward=20,

    # SARSA parameters
    sarsa_alpha=0.3, sarsa_gamma=0.9,  # sarsa_alpha is the learning rate, sarsa_gamma is the importance of the future

    # Policy parameters
    policy_epsilon=0.4,  # The lower, the greedier

    q_table=None
)

# Build a fresh table; it is used here only as a list of all states and actions
queue_table.q_table = queue_table.get_new_q_table()
queue_table.reset_state()
q_table = queue_table.q_table

# All state and actions from q_table: one (state, action) tuple per table row,
# where state is [POS, NEEDS, SINKS, QUEUE]
state_and_actions = []
for row_idx in range(len(q_table)):
    row = q_table.iloc[row_idx]
    state_and_actions.append(
        ([row['POS'], row['NEEDS'], row['SINKS'], row['QUEUE']], row['ACTION'])
    )

# Run state and action on q_nn.
# Inference only, so no autograd graph is recorded. The tensor is deliberately
# NOT called `input`: rebinding that name would shadow the builtin for the rest
# of the kernel session.
outputs = []
with torch.no_grad():
    for state, action in tqdm(state_and_actions):
        network_input = cq.state_and_action_to_network_input(state, action)
        network_input = torch.tensor(network_input, dtype=torch.float).to(device)
        prediction = queue.q_nn(network_input).detach().cpu().numpy()
        outputs.append(prediction)
outputs = np.array(outputs)

print(f'Ouputs mean: {outputs.mean()}, Outputs var: {outputs.var()}')

  0%|          | 0/37632 [00:00<?, ?it/s]

Ouputs mean: 0.10694900900125504, Outputs var: 0.023015247657895088


In [91]:
# Train the NN-backed queue with the e-soft policy, recording len(queue.agents)
# and the step reward; every 1000 steps print the step count, the mean number of
# agents over the last 1000 steps, and the current epsilon.
for i in tqdm(range(30_000), smoothing=0):
    # Policy can be 'random' or 'e-soft'
    reward = queue.one_iteration(optimize=True, policy='e-soft')

    results_random_egocentric['num_agents'].append(len(queue.agents))
    results_random_egocentric['rewards'].append(reward)

    completed_steps = i + 1
    if completed_steps % 1000 != 0:
        continue

    recent_agent_counts = results_random_egocentric['num_agents'][-1000:]
    print(completed_steps, np.mean(recent_agent_counts), queue.policy_epsilon)
    # Disabled: periodic greedy evaluation
    #     queue.reset_state()
    #     greedy_reward_avg = queue.one_iteration(optimize=False, policy='greedy')
    #     for j in range(1000+1):
    #         greedy_reward = queue.one_iteration(optimize=False, policy='greedy')
    #         greedy_reward_avg += greedy_reward
    #     print(f'Greedy reward after {i} iterations: {greedy_reward_avg/1000:.2f} | epsilon: {queue.policy_epsilon:.4f}')

  0%|          | 0/30000 [00:00<?, ?it/s]

1000 8.198 0.3
2000 9.237 0.3
3000 9.139 0.3
4000 9.038 0.3
5000 9.128 0.3
6000 9.228 0.3
7000 9.018 0.3
8000 9.152 0.3
9000 9.065 0.3
10000 8.931 0.3
11000 9.226 0.3
12000 9.162 0.3
13000 9.127 0.3
14000 9.185 0.3
15000 9.112 0.3
16000 9.061 0.3
17000 9.096 0.3
18000 9.156 0.3
19000 9.012 0.3
20000 8.99 0.3
21000 9.127 0.3
22000 9.144 0.3
23000 9.114 0.3
24000 9.05 0.3
25000 9.173 0.3
26000 8.965 0.3
27000 9.084 0.3


In [12]:
# Plots
# The history plotted here is filled by the NN training loop above: e-soft policy
# on a queue built with mode='collectivism'. The title now says so; the old
# 'Random Policy with Egocentric Reward' was a copy-paste leftover, like the
# variable name.
cq.plot_agents_and_rewards(
    title='e-soft Policy with Collectivism Reward (Neural Network)',
    window_size=1000,
    agents_and_rewards_dict=results_random_egocentric,
    mean_of_all=True,
)

Correlation between number of agents and reward: -0.86


In [13]:
# Greedy results after optimization: run the greedy policy with optimize=False
# on the NN-backed queue trained above, starting from a reset state.
queue.reset_state()

# Per-iteration history: len(queue.agents) and the reward returned by each step
greedy_agent_counts, greedy_rewards = [], []
results_e_soft_collectvism_greedy = {
    'num_agents': greedy_agent_counts,
    'rewards': greedy_rewards,
}
for step in tqdm(range(1_000), smoothing=0):
    reward = queue.one_iteration(optimize=False, policy='greedy')

    greedy_agent_counts.append(len(queue.agents))
    greedy_rewards.append(reward)

cq.plot_agents_and_rewards(
    title='Greedy Policy with Trained with e-soft policy and Collectivism Reward',
    window_size=100,
    agents_and_rewards_dict=results_e_soft_collectvism_greedy,
    mean_of_all=True,
)


  0%|          | 0/1000 [00:00<?, ?it/s]

Correlation between number of agents and reward: -0.67
