In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from helpers.replay_buffer import ReplayBuffer
from helpers.chain_environment import SimpleChain
from helpers.shedules import LinearSchedule
from helpers.create_empty_directory import create_empty_directory
from helpers.plots import plot_q_func_and_visitations

In [3]:
from dqn import train

In [4]:
# Chain sizes swept by the experiments: 5, 10, 15, 20, 25 (stored as int32).
dim_range = np.arange(5, 30, 5, dtype='int32')

# Seeds handed to train(); each one yields a row of the results array.
seed_range = [10, 42, 51, 38, 50]

# Epsilon settings referenced by the eps_greedy* experiment definitions.
# NOTE(review): a later cell rebinds the name `eps_params` to different values;
# the experiments keep a reference to this dict only if they are built before
# that rebinding happens.
eps_params = dict(exploration_fraction=0.25,
                  exploration_final_eps=0.001)


In [5]:
# Keyword arguments passed unchanged to every train() call.
common_params = {
    'gamma': 0.99,
    'write_logs': None,
    'do_pretraining': True,
    'plot_freq': None,
    'target_type': 'double_q_learning',
}

# One row per experiment, in run order:
#   (name, eps_params, act_type, reward_shaping_type)
# Rows that list `eps_params` all share the same dict object.
# NOTE(review): the plain count_based_* rows combine act_type='epsilon_greedy'
# with eps_params=None; how train() treats that combination is not visible
# here -- confirm it is intended.
_experiment_specs = [
    # --- action-selection baselines, no reward shaping ---
    ('eps_greedy', eps_params, 'epsilon_greedy', None),
    ('ucb-1', None, 'ucb-1', None),
    ('ucb-2', None, 'ucb-2', None),
    # --- count-based reward shaping without an epsilon schedule ---
    ('count_based_state_action', None, 'epsilon_greedy',
     'count_based_state_action'),
    ('count_based_next_state_action', None, 'epsilon_greedy',
     'count_based_next_state_action'),
    ('count_based_state', None, 'epsilon_greedy',
     'count_based_state'),
    ('count_based_next_state', None, 'epsilon_greedy',
     'count_based_next_state'),
    # --- count-based reward shaping combined with the epsilon schedule ---
    ('eps_greedy_count_based_state_action', eps_params, 'epsilon_greedy',
     'count_based_state_action'),
    ('eps_greedy_count_based_next_state_action', eps_params, 'epsilon_greedy',
     'count_based_next_state_action'),
    ('eps_greedy_count_based_state', eps_params, 'epsilon_greedy',
     'count_based_state'),
    ('eps_greedy_count_based_next_state', eps_params, 'epsilon_greedy',
     'count_based_next_state'),
]

# Expand the table into the list-of-dicts layout the later cells iterate over.
experiments = [
    {'name': spec_name,
     'params': dict(eps_params=spec_eps,
                    act_type=spec_act,
                    reward_shaping_type=spec_shaping),
     'iterate_seeds': True}
    for spec_name, spec_eps, spec_act, spec_shaping in _experiment_specs
]


In [6]:
# Build a SimpleChain with input_dim=10 and read the action count and the
# first entry of the observation shape from its spaces.
input_dim = 10
chain_env = SimpleChain(input_dim)
num_actions = chain_env.action_space.n
dim_states = chain_env.observation_space.shape[0]

# NOTE(review): this rebinds `eps_params`. Experiment dicts built before this
# cell still reference the earlier dict object (0.25 / 0.001); re-running the
# experiments cell after this one would make them pick up these values instead.
eps_params = dict(exploration_fraction=0.5,
                  exploration_final_eps=0.05)

tau_params = dict(fraction=0.95,
                  final_tau=0.05)

alpha_params = dict(fraction=0.95,
                    initial_alpha=10,
                    final_alpha=1)


In [7]:
%%time

# Directory that receives one <name>.npy array per experiment.
# NOTE(review): create_empty_directory presumably leaves `folder` empty, which
# would discard results from a previous run -- confirm against the helper.
folder = 'results/dqn/chain/'
create_empty_directory(folder)

for experiment in experiments:
    name = experiment['name']
    print(name)
    # Rows follow seed_range, columns follow dim_range.
    results = np.zeros((len(seed_range), len(dim_range)))

    for i, seed in enumerate(seed_range):
        for j, dim in enumerate(dim_range):
            env = SimpleChain(int(dim))
            # The warm-up length and the step budget both scale with (dim + 9).
            scaled_dim = dim + 9
            _, num_episodes = train(
                env,
                seed=seed,
                learning_starts_in_steps=scaled_dim * 3,
                max_steps=2000 * scaled_dim,
                train_freq_in_steps=10,
                update_freq_in_steps=60,
                **common_params,
                **experiment['params']
            )
            results[i, j] = num_episodes

    # np.save appends the ".npy" suffix to the given file name.
    np.save(folder + name, results)

eps_greedy
ucb-1
ucb-2
count_based_state_action
count_based_next_state_action
count_based_state
count_based_next_state
eps_greedy_count_based_state_action
eps_greedy_count_based_next_state_action
eps_greedy_count_based_state
eps_greedy_count_based_next_state
CPU times: user 21h 27min 4s, sys: 4min 47s, total: 21h 31min 51s
Wall time: 7h 26min 48s


In [13]:
import os
def print_results(experiments, folder, to_print=True, max_cols=5):
    """Load each experiment's saved results and summarise them per column.

    For every entry of ``experiments`` the array stored in
    ``<folder>/<name>.npy`` is loaded and reduced along axis 0.

    Parameters
    ----------
    experiments : iterable of dict
        Each dict must provide a ``'name'`` key; it selects the file to load.
    folder : str
        Directory holding the ``.npy`` files. A trailing path separator is
        optional because the path is built with ``os.path.join``.
    to_print : bool, optional
        When true, print the experiment index, its name and the leading
        columns of its statistics.
    max_cols : int or None, optional
        Number of leading columns shown when printing (``None`` shows all).
        Only affects the printed output, never the returned arrays.

    Returns
    -------
    list of numpy.ndarray
        One float array of shape ``(3, arr.shape[1])`` per experiment, whose
        rows are the minimum, mean and maximum over axis 0.
    """
    all_stats = []
    for i, experiment in enumerate(experiments):
        name = experiment['name']

        # os.path.join keeps the path valid whether or not `folder` ends with
        # a separator; plain concatenation silently produced a wrong path.
        arr = np.load(os.path.join(folder, name + '.npy'))

        stats = np.stack([arr.min(axis=0),
                          arr.mean(axis=0),
                          arr.max(axis=0)]).astype(np.float64)
        all_stats.append(stats)

        if to_print:
            print(i, name)
            print(stats[:, :max_cols])
            print('\n')
    return all_stats

In [14]:
# Load the saved per-experiment arrays and print their min/mean/max summaries.
# The directory is written as a literal rather than read from `folder`, so it
# must be kept in sync with the path the training cell saves to.
all_stats = print_results(experiments, 'results/dqn/chain/')

0 eps_greedy
[[ 595.   592.   658.   627.  2000. ]
 [ 652.  1203.4  980.6 1725.4 2000. ]
 [ 722.  2000.  2000.  2000.  2000. ]]


1 ucb-1
[[ 109.   118.   132.   244.   211. ]
 [ 164.2  131.4  205.8 1207.2 1308.8]
 [ 370.   150.   335.  2000.  2000. ]]


2 ucb-2
[[ 106.   110.   115.   122.   199. ]
 [ 107.4  128.   119.6  532.2  983. ]
 [ 110.   149.   134.  2000.  2000. ]]


3 count_based_state_action
[[  99.    99.    99.    99.    99. ]
 [1239.6  860.   859.4  873.6  865.6]
 [2000.  2000.  2000.  2000.  2000. ]]


4 count_based_next_state_action
[[  99.    99.    99.    99.    99. ]
 [1239.6  860.   859.4 1619.8 1619.8]
 [2000.  2000.  2000.  2000.  2000. ]]


5 count_based_state
[[  99.    99.    99.    99.    99. ]
 [1239.6  860.   859.4  873.6  865.6]
 [2000.  2000.  2000.  2000.  2000. ]]


6 count_based_next_state
[[  99.    99.    99.    99.    99. ]
 [1239.6  860.   859.4  874.2  865.6]
 [2000.  2000.  2000.  2000.  2000. ]]


7 eps_greedy_count_based_state_action
[[ 594.   