In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import cufflinks as cf
import pandas as pd
import numpy as np
import script_RL

In [3]:
# Report the installed cufflinks release, then configure it with offline=True.
print(cf.__version__)
cf.set_config_file(offline=True)

0.17.0


In [4]:
# Number of independent runs: passed as `n_run` to every simulation call in
# this notebook and used as the divisor when turning per-iteration match
# counts into frequencies.
N_RUN = 2000

## Run 10-armed bandit experiments over 2000 runs and 1000 iterations with different arguments

Purely greedy launch ($\epsilon=0$):

In [5]:
%%time
# Purely greedy experiment (epsilon=0); the three results feed the plots below.
(
    matrix_trajectories,
    matrix_choices,
    array_best_choice,
) = script_RL.k_bandits_simulation(
    n_run=N_RUN,
    epsilon=0,
)

CPU times: user 1min 31s, sys: 737 ms, total: 1min 32s
Wall time: 1min 40s


Epsilon greedy ($\epsilon=0.1$)

In [6]:
%%time
# Epsilon-greedy experiment with epsilon=0.1.
(
    matrix_trajectories2,
    matrix_choices2,
    array_best_choice2,
) = script_RL.k_bandits_simulation(
    n_run=N_RUN,
    epsilon=0.1,
)

CPU times: user 2min 54s, sys: 1.37 s, total: 2min 55s
Wall time: 3min 11s


Epsilon greedy ($\epsilon=0.01$)

In [7]:
%%time
# Epsilon-greedy experiment with epsilon=0.01.
(
    matrix_trajectories3,
    matrix_choices3,
    array_best_choice3,
) = script_RL.k_bandits_simulation(
    n_run=N_RUN,
    epsilon=0.01,
)

CPU times: user 2min 53s, sys: 1.43 s, total: 2min 54s
Wall time: 3min 10s


In [8]:
# Average reward per iteration: mean over axis 0 of each trajectory matrix,
# one column per strategy (greedy, epsilon=0.1, epsilon=0.01).
df = pd.DataFrame(
    {
        "purely_greedy": np.mean(matrix_trajectories, axis=0),
        "eps_greedy_01": np.mean(matrix_trajectories2, axis=0),
        "eps_greedy_001": np.mean(matrix_trajectories3, axis=0),
    }
)

df.iplot(
    title="10 Armed Bandits -- Average over 2000 runs",
    xTitle="iterations",
    yTitle="average reward",
    width=0.8,
)

# Drop the temporary frame; each plotting cell builds its own.
del df

In [9]:
# For each strategy: compare the choice matrix with the per-run best choice
# (reshaped to a column so it broadcasts across iterations), count the matches
# along axis 0 and divide by the number of runs.
array_greedy = (
    matrix_choices == array_best_choice.reshape(-1, 1)
).sum(axis=0) / N_RUN
array_eps_greedy = (
    matrix_choices2 == array_best_choice2.reshape(-1, 1)
).sum(axis=0) / N_RUN
array_eps_greedy2 = (
    matrix_choices3 == array_best_choice3.reshape(-1, 1)
).sum(axis=0) / N_RUN

df = pd.DataFrame(
    {
        "purely_greedy": array_greedy,
        "eps_greedy_01": array_eps_greedy,
        "eps_greedy_001": array_eps_greedy2,
    }
)

df.iplot(
    title="10 Armed bandits -- Average over 2000 runs",
    xTitle="iterations",
    yTitle="frequency of optimal action",
)

# Drop the temporary frame; each plotting cell builds its own.
del df

Purely greedy ($\epsilon=0$) with value_optimist=5 and $\alpha=0.1$

In [10]:
%%time
# Greedy experiment (epsilon=0) with value_optimist=5 and alpha=0.1.
(
    matrix_trajectories4,
    matrix_choices4,
    array_best_choice4,
) = script_RL.k_bandits_simulation(
    n_run=N_RUN,
    alpha=0.1,
    value_optimist=5,
    epsilon=0,
)

CPU times: user 1min 31s, sys: 755 ms, total: 1min 32s
Wall time: 1min 36s


Epsilon greedy ($\epsilon=0.1$), with value_optimist=0 and $\alpha=0.1$

In [11]:
%%time
# Epsilon-greedy experiment with epsilon=0.1 and alpha=0.1; value_optimist is
# not passed, so the simulation's own default applies.
(
    matrix_trajectories5,
    matrix_choices5,
    array_best_choice5,
) = script_RL.k_bandits_simulation(
    n_run=N_RUN,
    alpha=0.1,
    epsilon=0.1,
)

CPU times: user 2min 57s, sys: 1.58 s, total: 2min 58s
Wall time: 3min 6s


In [12]:
# Per-iteration frequency with which the chosen action equals the per-run best
# choice, for the optimistic greedy run (4) and the epsilon-greedy run (5).
array_greedy_optimistic = (
    matrix_choices4 == array_best_choice4.reshape(-1, 1)
).sum(axis=0) / N_RUN
array_eps_greedy_alpha = (
    matrix_choices5 == array_best_choice5.reshape(-1, 1)
).sum(axis=0) / N_RUN

df = pd.DataFrame(
    {
        "purely_greedy_optimistic": array_greedy_optimistic,
        "realistic_eps_greedy": array_eps_greedy_alpha,
    }
)

df.iplot(
    title="10 armed bandits -- Average over 2000 runs -- alpha=0.1",
    xTitle="iterations",
    yTitle="Frequency of optimal action",
)

# Drop the temporary frame; each plotting cell builds its own.
del df

In [13]:
%%time
# Greedy experiment (epsilon=0, alpha=0.1) with the ucb option enabled and c=2.
(
    matrix_trajectories6,
    matrix_choices6,
    array_best_choice6,
) = script_RL.k_bandits_simulation(
    n_run=N_RUN,
    epsilon=0,
    alpha=0.1,
    ucb=True,
    c=2,
)

CPU times: user 2min 30s, sys: 1.95 s, total: 2min 32s
Wall time: 3min 3s


In [14]:
# Compare the UCB run (6) with the epsilon-greedy run (5): for each, the
# per-iteration frequency with which the chosen action equals the per-run best
# choice (matches counted along axis 0, divided by the number of runs).
array_greedy_ucb = (matrix_choices6 == array_best_choice6.reshape((-1, 1))).sum(axis=0) / N_RUN
array_eps_greedy_alpha = (matrix_choices5 == array_best_choice5.reshape((-1, 1))).sum(axis=0) / N_RUN

# Fix: the UCB column must plot `array_greedy_ucb`. It previously reused
# `array_greedy_optimistic` from the earlier optimistic-greedy cell, so the
# figure showed the wrong experiment under the UCB label.
df = pd.DataFrame(
    {
        "purely_greedy_ucb_c_2": array_greedy_ucb,
        "realistic_eps_greedy": array_eps_greedy_alpha,
    }
)

df.iplot(title="10 Armed bandits -- Average over 2000 runs -- alpha=0.1",
         xTitle="iterations",
         yTitle="Frequency of optimal action")

# Drop the temporary frame; each plotting cell builds its own.
del df

In [15]:
%%time
# Gradient bandit experiment with alpha=0.1 and the baseline enabled.
(
    matrix_choices7,
    array_best_choice7,
) = script_RL.bandit_gradient(
    n_run=N_RUN,
    alpha=0.1,
    baseline_bool=True,
)

CPU times: user 6min 38s, sys: 6.54 s, total: 6min 45s
Wall time: 9min 9s


In [16]:
# Per-iteration frequency with which the chosen action equals the per-run best
# choice, for the gradient bandit run (7) and the epsilon-greedy run (5).
array_w_baseline_alpha_01 = (
    matrix_choices7 == array_best_choice7.reshape(-1, 1)
).sum(axis=0) / N_RUN
array_eps_greedy_alpha = (
    matrix_choices5 == array_best_choice5.reshape(-1, 1)
).sum(axis=0) / N_RUN

df = pd.DataFrame(
    {
        "bandit_g_w_baseline_alpha_01": array_w_baseline_alpha_01,
        "realistic_eps_greedy": array_eps_greedy_alpha,
    }
)

df.iplot(
    title="10 armed bandits -- average over 2000 runs -- alpha=0.1",
    xTitle="iterations",
    yTitle="frequency of optimal action",
)

# Drop the temporary frame; each plotting cell builds its own.
del df

## Bonus: gradient ascent bandit with/without baseline & different alpha values

In [17]:
%%time
# Gradient bandit experiment with alpha=0.4 and the baseline enabled.
(
    matrix_choices8,
    array_best_choice8,
) = script_RL.bandit_gradient(
    n_run=N_RUN,
    alpha=0.4,
    baseline_bool=True,
)

CPU times: user 6min 19s, sys: 5.11 s, total: 6min 24s
Wall time: 7min 55s


In [18]:
%%time
# Gradient bandit experiment with alpha=0.1 and the baseline disabled.
(
    matrix_choices9,
    array_best_choice9,
) = script_RL.bandit_gradient(
    n_run=N_RUN,
    alpha=0.1,
    baseline_bool=False,
)

CPU times: user 6min 24s, sys: 6.06 s, total: 6min 31s
Wall time: 8min 7s


In [19]:
%%time
# Gradient bandit experiment with alpha=0.4 and the baseline disabled.
(
    matrix_choices10,
    array_best_choice10,
) = script_RL.bandit_gradient(
    n_run=N_RUN,
    alpha=0.4,
    baseline_bool=False,
)

CPU times: user 6min 3s, sys: 5.21 s, total: 6min 8s
Wall time: 7min 39s


In [20]:
# Compare the four gradient bandit experiments. For each, compute the
# per-iteration frequency with which the chosen action equals the per-run best
# choice (matches counted along axis 0, divided by the number of runs).
#
# Fix: every series previously read the results of the experiment one cell too
# early (6, 7, 8, 9), so the UCB run was plotted as a gradient bandit, every
# label was wrong, and run 10 was never shown. Mapping per the calls above:
#   run 7  -> alpha=0.1, baseline_bool=True
#   run 8  -> alpha=0.4, baseline_bool=True
#   run 9  -> alpha=0.1, baseline_bool=False
#   run 10 -> alpha=0.4, baseline_bool=False
array_w_baseline_alpha_01 = (matrix_choices7 == array_best_choice7.reshape((-1, 1))).sum(axis=0) / N_RUN
array_w_baseline_alpha_04 = (matrix_choices8 == array_best_choice8.reshape((-1, 1))).sum(axis=0) / N_RUN
array_wo_baseline_alpha_01 = (matrix_choices9 == array_best_choice9.reshape((-1, 1))).sum(axis=0) / N_RUN
array_wo_baseline_alpha_04 = (matrix_choices10 == array_best_choice10.reshape((-1, 1))).sum(axis=0) / N_RUN

df = pd.DataFrame(
    {
        "bandit_g_w_baseline_alpha_01": array_w_baseline_alpha_01,
        "bandit_g_w_baseline_alpha_04": array_w_baseline_alpha_04,
        "bandit_g_wo_baseline_alpha_01": array_wo_baseline_alpha_01,
        "bandit_g_wo_baseline_alpha_04": array_wo_baseline_alpha_04,
    }
)

# Titled like the other figures so the plot stands on its own; the run count
# comes from N_RUN so the title cannot drift from the configuration.
df.iplot(title=f"10 armed bandits -- gradient bandit -- average over {N_RUN} runs",
         xTitle="iterations",
         yTitle="frequency of optimal action")

# Drop the temporary frame; each plotting cell builds its own.
del df