In [13]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d 

import numpy as np
import matplotlib as mpl
from matplotlib.pyplot import cm

import warnings
# Suppress every warning from here on so that cell outputs stay readable.
# NOTE(review): this also hides deprecation/runtime warnings from numpy and matplotlib.
warnings.filterwarnings("ignore")

### Comment

Because most runs reach the maximal reward ($R=1$) by the end of training, the final reward does not discriminate between configurations. We therefore rank the runs by their cumulative reward instead.

The hyperparameter search was conducted for interval size $k=10$. Since a new word is added every $10$ epochs, the data set contains all words after $1000$ epochs. Note that data size in the hyperparameter search refers to the number of examples the agent sees per epoch, not the number of words it must learn. We trained for $1500$ epochs in total.

## Fixed lexicon

In [14]:
# Walk the full hyperparameter grid of the fixed-lexicon search and, for each
# run, store its reward array, the sum and minimum of that array, and the
# settings that produced it. All four lists share the same ordering.
rewards_fixed = []
sum_rewards_fixed = []
min_rewards_fixed = []
param_list = []
for data_size in [100, 1000]:
    for alpha in [5.]:
        for batch_size in [16, 32]:
            for learning_rate in [0.001, 0.01, 0.1]:
                for lexicon_init in [0.0001, 0.001, 0.01, 0.1]:
                    # Each configuration has its own result directory named after its settings.
                    run_dir = ''.join([
                        'fixed_lexicon/',
                        str(data_size), 'datasize_',
                        str(batch_size), 'batchsize_',
                        str(alpha), 'alpha_',
                        str(learning_rate), 'lr_',
                        str(lexicon_init), 'init/',
                    ])
                    reward_curve = np.load(run_dir + 'rewards_0.npy')
                    param_list.append([data_size, alpha, batch_size, learning_rate, lexicon_init])
                    rewards_fixed.append(reward_curve)
                    sum_rewards_fixed.append(np.sum(reward_curve))
                    min_rewards_fixed.append(np.min(reward_curve))

In [15]:
# Indices of the fixed-lexicon runs ordered from highest to lowest cumulative
# reward (an ascending argsort of the negated sums).
max_ranking_fixed = np.argsort(np.negative(sum_rewards_fixed))

In [16]:
# List every fixed-lexicon configuration with its cumulative reward, best first.
for run_idx in max_ranking_fixed:
    print(param_list[run_idx], sum_rewards_fixed[run_idx])

[1000, 5.0, 32, 0.01, 0.001] 1495.2711
[1000, 5.0, 32, 0.1, 0.001] 1495.2278
[1000, 5.0, 32, 0.001, 0.0001] 1495.2207
[1000, 5.0, 16, 0.01, 0.0001] 1495.1543
[1000, 5.0, 16, 0.001, 0.0001] 1495.125
[1000, 5.0, 16, 0.01, 0.001] 1495.0343
[1000, 5.0, 32, 0.001, 0.001] 1494.9476
[1000, 5.0, 16, 0.1, 0.0001] 1494.7793
[1000, 5.0, 16, 0.1, 0.01] 1494.6965
[1000, 5.0, 32, 0.1, 0.01] 1494.5676
[1000, 5.0, 32, 0.1, 0.0001] 1494.5515
[1000, 5.0, 16, 0.01, 0.01] 1494.1775
[1000, 5.0, 32, 0.01, 0.01] 1494.123
[1000, 5.0, 16, 0.1, 0.001] 1494.1018
[1000, 5.0, 32, 0.01, 0.0001] 1493.9718
[1000, 5.0, 16, 0.001, 0.001] 1493.7268
[1000, 5.0, 16, 0.001, 0.01] 1490.9417
[1000, 5.0, 16, 0.1, 0.1] 1490.2542
[1000, 5.0, 32, 0.001, 0.01] 1487.7622
[1000, 5.0, 32, 0.1, 0.1] 1485.498
[1000, 5.0, 16, 0.01, 0.1] 1457.7783
[100, 5.0, 32, 0.1, 0.01] 1456.3645
[100, 5.0, 16, 0.1, 0.0001] 1452.448
[100, 5.0, 16, 0.01, 0.0001] 1451.875
[100, 5.0, 16, 0.001, 0.0001] 1451.6042
[100, 5.0, 16, 0.01, 0.01] 1450.5625
[100

### 10 best parameters

In [17]:
# Ten best fixed-lexicon runs: settings, cumulative reward and minimum reward.
for run_idx in max_ranking_fixed[:10]:
    print(param_list[run_idx], sum_rewards_fixed[run_idx], min_rewards_fixed[run_idx])

[1000, 5.0, 32, 0.01, 0.001] 1495.2711 0.859879
[1000, 5.0, 32, 0.1, 0.001] 1495.2278 0.8719758
[1000, 5.0, 32, 0.001, 0.0001] 1495.2207 0.88810486
[1000, 5.0, 16, 0.01, 0.0001] 1495.1543 0.9153226
[1000, 5.0, 16, 0.001, 0.0001] 1495.125 0.8921371
[1000, 5.0, 16, 0.01, 0.001] 1495.0343 0.8679435
[1000, 5.0, 32, 0.001, 0.001] 1494.9476 0.8155242
[1000, 5.0, 16, 0.1, 0.0001] 1494.7793 0.7429435
[1000, 5.0, 16, 0.1, 0.01] 1494.6965 0.922379
[1000, 5.0, 32, 0.1, 0.01] 1494.5676 0.83971775


## Dynamic lexicon

In [23]:
# Walk the full hyperparameter grid of the dynamic-lexicon search and, for each
# run, store its reward array, the sum and minimum of that array, and the
# settings that produced it. param_list is rebuilt here over the same grid.
rewards_dynamic = []
sum_rewards_dynamic = []
min_rewards_dynamic = []
param_list = []
for data_size in [100, 1000]:
    for alpha in [5.]:
        for batch_size in [16, 32]:
            for learning_rate in [0.001, 0.01, 0.1]:
                for lexicon_init in [0.0001, 0.001, 0.01, 0.1]:
                    # Each configuration has its own result directory named after its settings.
                    run_dir = ''.join([
                        'dynamic_lexicon/',
                        str(data_size), 'datasize_',
                        str(batch_size), 'batchsize_',
                        str(alpha), 'alpha_',
                        str(learning_rate), 'lr_',
                        str(lexicon_init), 'init/',
                    ])
                    reward_curve = np.load(run_dir + 'rewards_0.npy')
                    param_list.append([data_size, alpha, batch_size, learning_rate, lexicon_init])
                    rewards_dynamic.append(reward_curve)
                    sum_rewards_dynamic.append(np.sum(reward_curve))
                    min_rewards_dynamic.append(np.min(reward_curve))

In [24]:
# Order the dynamic-lexicon runs from highest to lowest cumulative reward
# (an ascending argsort of the negated sums), then list them all.
max_ranking_dynamic = np.argsort(np.negative(sum_rewards_dynamic))
for run_idx in max_ranking_dynamic:
    print(param_list[run_idx], sum_rewards_dynamic[run_idx])

[1000, 5.0, 16, 0.01, 0.1] 1499.0807
[1000, 5.0, 32, 0.1, 0.1] 1499.0272
[1000, 5.0, 16, 0.1, 0.1] 1498.7168
[1000, 5.0, 32, 0.01, 0.1] 1498.5203
[1000, 5.0, 16, 0.001, 0.01] 1498.0989
[1000, 5.0, 32, 0.001, 0.01] 1497.5999
[1000, 5.0, 16, 0.001, 0.1] 1495.6794
[1000, 5.0, 16, 0.01, 0.01] 1492.7087
[1000, 5.0, 32, 0.01, 0.01] 1492.121
[1000, 5.0, 32, 0.001, 0.1] 1487.5414
[100, 5.0, 32, 0.001, 0.01] 1484.7812
[100, 5.0, 16, 0.001, 0.01] 1480.7083
[100, 5.0, 32, 0.01, 0.1] 1474.052
[100, 5.0, 16, 0.01, 0.1] 1473.0625
[100, 5.0, 16, 0.1, 0.1] 1472.7916
[100, 5.0, 32, 0.1, 0.1] 1459.7916
[1000, 5.0, 16, 0.001, 0.001] 1393.864
[100, 5.0, 16, 0.001, 0.1] 1316.6875
[100, 5.0, 16, 0.01, 0.01] 1185.875
[1000, 5.0, 32, 0.001, 0.001] 1092.5876
[1000, 5.0, 16, 0.1, 0.01] 1022.1522
[100, 5.0, 32, 0.001, 0.1] 1010.8854
[1000, 5.0, 32, 0.1, 0.01] 965.03937
[100, 5.0, 32, 0.01, 0.01] 841.1458
[100, 5.0, 16, 0.1, 0.01] 668.82294
[1000, 5.0, 16, 0.01, 0.001] 583.63007
[100, 5.0, 16, 0.001, 0.001] 457.9

### 10 best parameters

In [25]:
# Ten best dynamic-lexicon runs with their cumulative reward.
for run_idx in max_ranking_dynamic[:10]:
    print(param_list[run_idx], sum_rewards_dynamic[run_idx])

[1000, 5.0, 16, 0.01, 0.1] 1499.0807
[1000, 5.0, 32, 0.1, 0.1] 1499.0272
[1000, 5.0, 16, 0.1, 0.1] 1498.7168
[1000, 5.0, 32, 0.01, 0.1] 1498.5203
[1000, 5.0, 16, 0.001, 0.01] 1498.0989
[1000, 5.0, 32, 0.001, 0.01] 1497.5999
[1000, 5.0, 16, 0.001, 0.1] 1495.6794
[1000, 5.0, 16, 0.01, 0.01] 1492.7087
[1000, 5.0, 32, 0.01, 0.01] 1492.121
[1000, 5.0, 32, 0.001, 0.1] 1487.5414


# Selection

For the between-lexicon comparison, we ignore the lexicon initialization. The initial lexicon sizes, and therefore the numbers of parameters, differ so much between the implementations that it makes sense to use different initializations.

In [26]:
# Five best fixed-lexicon runs with their cumulative reward.
# param_list was last rebuilt by the dynamic-lexicon cell; the lookup still
# matches because both searches iterate the same grid in the same order.
for run_idx in max_ranking_fixed[:5]:
    print(param_list[run_idx], sum_rewards_fixed[run_idx])

[1000, 5.0, 32, 0.01, 0.001] 1495.2711
[1000, 5.0, 32, 0.1, 0.001] 1495.2278
[1000, 5.0, 32, 0.001, 0.0001] 1495.2207
[1000, 5.0, 16, 0.01, 0.0001] 1495.1543
[1000, 5.0, 16, 0.001, 0.0001] 1495.125


In [27]:
# Five best dynamic-lexicon runs with their cumulative reward.
for run_idx in max_ranking_dynamic[:5]:
    print(param_list[run_idx], sum_rewards_dynamic[run_idx])

[1000, 5.0, 16, 0.01, 0.1] 1499.0807
[1000, 5.0, 32, 0.1, 0.1] 1499.0272
[1000, 5.0, 16, 0.1, 0.1] 1498.7168
[1000, 5.0, 32, 0.01, 0.1] 1498.5203
[1000, 5.0, 16, 0.001, 0.01] 1498.0989


$\rightarrow$ The best hyperparameters across implementations are

* data size : 1000
* batch size: 32 
* learning rate: 0.1

$\rightarrow$ For the initialization we choose: 0.001 for the fixed lexicon, and 0.1 for the dynamic lexicon. 