# Simple test

In [1]:
from estimators.ccb import pdis_cressieread
from estimators.bandits import snips
from estimators.bandits import clopper_pearson

# Estimator / interval pair taken from `estimators.bandits` (single-slot case).
cb_est = snips.Estimator()
cb_int = clopper_pearson.Interval()

# Estimator / interval pair taken from `estimators.ccb` (multi-slot case).
ccb_est = pdis_cressieread.Estimator()
ccb_int = pdis_cressieread.Interval()

In [2]:
# Smoke test: feed one three-slot example to the estimator and print the result.
# NOTE(review): arguments are positional; presumably (p_preds, rs, p_logs),
# matching the keyword names used elsewhere in this notebook -- confirm.
ccb_est.add_example([0.1, 0.1, 0.1], [1,2,3], [0.5, 0.5, 0.5])
print(f'estimate: {ccb_est.get()}')

# Same example for the interval object; get() is called without an alpha here.
ccb_int.add_example([0.1, 0.1, 0.1], [1,2,3], [0.5, 0.5, 0.5])
print(f'interval: {ccb_int.get()}')

estimate: [1.0, 2.0, 3.0000000000000426]
interval: [[0, 1], [0, 1], [0, 1]]


In [3]:
                #action 0           #action 1
# slot 0           1                    0
# slot 1           0                    0.8
# Feed 1000 two-slot examples that alternate between two cases:
#   even i: p_preds[0] = epsilon,     rewards (0, 0)
#   odd  i: p_preds[0] = 1 - epsilon, rewards (1, 0.8)
# Slot 0 is always logged with probability 0.5, slot 1 with probability 1.
ccb_est = pdis_cressieread.Estimator()
epsilon = 0.1
for i in range(1000):
    odd = i % 2
    ccb_est.add_example(
        p_preds=[1 - epsilon if odd else epsilon, 1],
        rs=[odd, 0.8 * odd],
        p_logs=[0.5, 1],
    )
print(f'estimate: {ccb_est.get()}')

estimate: [0.900224424911165, 0.7201795399289322]


# Simulator

In [4]:
# Actions: a0, a1, a2
# Slots:   s0, s1

# Rewards are given as a matrix r_ij, where i (0 or 1) is the slot and j (0, 1 or 2) is the action.
# Online (logging) policy: epsilon-greedy.

import numpy as np
import random
import pandas as pd

class EpsilonGreedy:
    """Per-action probabilities of an epsilon-greedy choice among ``n`` actions."""

    def __init__(self, epsilon):
        # Total probability mass spread evenly over all actions.
        self.epsilon = epsilon

    def explore(self, n):
        """Return the probability of one non-greedy action out of ``n``."""
        share = self.epsilon / n
        return share

    def exploit(self, n):
        """Return the probability of the greedy action out of ``n``.

        This is the exploration share plus the remaining ``1 - epsilon`` mass.
        """
        return self.explore(n) + 1 - self.epsilon

class CcbSimulation(pd.DataFrame):
    """Simulated slate log stored as a DataFrame, one row per (session, slot).

    Columns written by the constructor: ``session``, ``slot``, ``p_log``
    (probability the logging policy gave to the chosen action), ``r`` (0/1
    reward), ``chosen`` (action id) and ``chosen_idx`` (position of that
    action among the actions still available).  ``predict`` adds ``p_pred``.

    A policy is a tree of nested lists.  Every node is a list holding one
    ``(probability, sub_policy)`` pair per still-available action, in
    ascending action id; ``sub_policy`` is the node used for the next slot
    once that action has been taken.
    """

    def __init__(self, n, rewards, policy):
        """Simulate ``n`` sessions.

        Args:
            n: number of sessions to generate.
            rewards: ``rewards[slot][action]`` is the probability that the
                reward of ``action`` shown in ``slot`` equals 1.
            policy: logging policy tree (see the class docstring).
        """
        nslots = len(rewards)
        nactions = len(rewards[0])
        slots = []
        for session in range(n):
            actions = list(range(nactions))
            state = policy
            for slot in range(nslots):
                pmf = [branch[0] for branch in state]
                # Keep the order of the two random draws (numpy, then stdlib):
                # it defines how the global random streams are consumed.
                chosen_idx = np.random.choice(range(len(pmf)), p=pmf)
                chosen = actions[chosen_idx]
                state = state[chosen_idx][1]
                r = int(random.random() < rewards[slot][chosen])
                # An action cannot be shown twice within a session.
                actions = sorted(set(actions) - {chosen})
                slots.append({
                    'session': session,
                    'slot': slot,
                    'p_log': pmf[chosen_idx],
                    'r': r,
                    'chosen': chosen,
                    'chosen_idx': chosen_idx,
                })
        super().__init__(slots)

    def predict(self, policy):
        """Store in ``p_pred`` the probability ``policy`` gives each logged choice.

        The policy tree is walked along the logged ``chosen_idx`` path of every
        session.  Values are appended session by session, so the rows must be
        ordered by (session, slot), which is how the constructor writes them.
        """
        p_pred = []
        for _, e in self.sessions.iterrows():
            state = policy
            for chosen_idx in e['chosen_idx']:
                p_pred.append(state[chosen_idx][0])
                state = state[chosen_idx][1]
        self['p_pred'] = p_pred

    @staticmethod
    def _total_over_slots(result, nslots):
        """Return the row-wise sum of columns ``slot_0`` .. ``slot_{nslots-1}``."""
        total = result['slot_0']
        for i in range(1, nslots):
            total = total + result[f'slot_{i}']
        return total

    def cfe(self, alpha=0.05):
        """Return a table of counterfactual estimates for the predicted policy.

        For every slot the table holds the estimate (``est``) and the lower /
        upper bound (``lb`` / ``ub``) from two sources: ``cb`` (one
        ``snips.Estimator`` and ``clopper_pearson.Interval`` per slot) and
        ``ccb`` (one ``pdis_cressieread`` estimator / interval fed whole
        sessions).  ``all_slots`` is the sum over all slot columns.

        Args:
            alpha: forwarded to the ``get`` method of the interval objects.

        Returns:
            DataFrame indexed by (``name``, ``metric``).

        Raises:
            KeyError: if ``predict`` has not been called yet.
        """
        if 'p_pred' not in self.columns:
            raise KeyError("'p_pred' column is missing: call predict(policy) before cfe()")

        nslots = self['slot'].max() + 1
        cb_est = [snips.Estimator() for _ in range(nslots)]
        cb_int = [clopper_pearson.Interval() for _ in range(nslots)]

        ccb_est = pdis_cressieread.Estimator()
        ccb_int = pdis_cressieread.Interval()

        for _, e in self.sessions.iterrows():
            for i, (p_pred, r, p_log) in enumerate(zip(e['p_pred'], e['r'], e['p_log'])):
                cb_est[i].add_example(p_pred=p_pred, r=r, p_log=p_log)
                cb_int[i].add_example(p_pred=p_pred, r=r, p_log=p_log)

            ccb_int.add_example(p_preds=e['p_pred'], rs=e['r'], p_logs=e['p_log'])
            ccb_est.add_example(p_preds=e['p_pred'], rs=e['r'], p_logs=e['p_log'])

        cb_int_results = [interval.get(alpha) for interval in cb_int]
        ccb_int_result = ccb_int.get(alpha)
        ccb_est_result = ccb_est.get()

        def row(name, metric, per_slot):
            # One output row: identifying labels plus one column per slot.
            return dict({'name': name, 'metric': metric},
                        **{f'slot_{i}': per_slot[i] for i in range(nslots)})

        result = pd.DataFrame([
            row('cb', 'est', [est.get() for est in cb_est]),
            row('cb', 'lb', [bounds[0] for bounds in cb_int_results]),
            row('cb', 'ub', [bounds[1] for bounds in cb_int_results]),
            row('ccb', 'est', ccb_est_result),
            row('ccb', 'lb', [bounds[0] for bounds in ccb_int_result]),
            row('ccb', 'ub', [bounds[1] for bounds in ccb_int_result]),
        ])
        # Sum every slot column instead of assuming exactly two slots.
        result['all_slots'] = self._total_over_slots(result, nslots)
        return result.set_index(['name', 'metric'])

    @property
    def sessions(self):
        """One row per session with each per-slot column collected into a list."""
        agg = {'p_log': list, 'r': list, 'chosen': list, 'chosen_idx': list}
        if 'p_pred' in self.columns:
            agg['p_pred'] = list
        return self.groupby('session').agg(agg)

def generate(n, r00, r01, r02, r10, r11, r12, epsilon):
    """Simulate ``n`` sessions for a reward matrix given cell by cell.

    ``rXY`` becomes the entry at row X, column Y of the 2x3 reward matrix
    handed to ``CcbSimulation``.

    NOTE(review): ``epsilon`` is forwarded unchanged as the ``policy``
    argument of ``CcbSimulation``, which iterates over it as a policy tree.
    Confirm whether callers are expected to pass a tree rather than a number.
    """
    first_row = [r00, r01, r02]
    second_row = [r10, r11, r12]
    return CcbSimulation(n, np.array([first_row, second_row]), epsilon)

## No slot dependencies

In [5]:
# Exploration rate used to build the epsilon-greedy policy below.
epsilon = 0.2

# rewards[slot][action]: probability of a reward of 1.
# Both rows are equal, so an action's reward does not depend on its slot.
rewards = np.array([
    [0.4, 0.8, 0.2],
    [0.4, 0.8, 0.2]])

# Policy tree: each entry is (probability of the action, sub-policy for the
# next slot over the remaining actions). The trailing "#k" comments give the
# action id of each branch.
# The greedy (exploit) branch is action 1 in slot 0 and, in slot 1, the
# remaining action with the highest reward.
eg = EpsilonGreedy(epsilon)
epsilon_greedy = [
    (eg.explore(3), [               #0
        (eg.exploit(2), []),        #1
        (eg.explore(2), []),        #2
    ]),
    (eg.exploit(3), [               #1
        (eg.exploit(2), []),        #0
        (eg.explore(2), []),        #2
    ]),
    (eg.explore(3), [               #2
        (eg.explore(2), []),        #0
        (eg.exploit(2), []),        #1
    ])
]

# Uniform policy: every remaining action is equally likely in each slot.
baseline_random = [
    (1/3, [
        (1/2, []),
        (1/2, []),
    ]),
    (1/3, [
        (1/2, []),
        (1/2, []),
    ]),
    (1/3, [
        (1/2, []),
        (1/2, []),
    ])
]

# Deterministic policy: action 0 in slot 0, then action 1 in slot 1.
# The zero-probability branches are only read when scoring logged actions.
# NOTE(review): the middle sub-policy sums to 0, so it presumably could not be
# sampled from; it is never reached because its parent has probability 0.
baseline_1 = [
    (1, [
        (1, []),
        (0, []),
    ]),
    (0, [
        (0, []),
        (0, []),
    ]),
    (0, [
        (0, []),
        (1, []),
    ])
]

In [6]:
# Number of simulated sessions per logging policy.
n = 10000
# One independent log per logging policy. All three share the same reward matrix
# and draw from the same global random streams, so the order of these calls matters.
sim_eg = CcbSimulation(n, rewards, epsilon_greedy)
sim_b1 = CcbSimulation(n, rewards, baseline_1)
sim_br = CcbSimulation(n, rewards, baseline_random)

In [7]:
# Observed mean reward per slot for each logging policy (the ground truth the
# counterfactual estimates below are compared against).
# Each frame is filtered with its own mask. The original filtered sim_br with a
# mask built from sim_b1, which only worked because both frames share a layout.
simulation_stats = pd.DataFrame([
    {
        'policy': label,
        'slot_0': sim[sim["slot"] == 0]["r"].mean(),
        'slot_1': sim[sim["slot"] == 1]["r"].mean(),
    }
    for label, sim in [
        (f'epsilon-greedy({epsilon})', sim_eg),
        ('baseline_1', sim_b1),
        ('baseline_random', sim_br),
    ]
])
simulation_stats['all_slots'] = simulation_stats['slot_0'] + simulation_stats['slot_1']
simulation_stats

Unnamed: 0,policy,slot_0,slot_1,all_slots
0,epsilon-greedy(0.2),0.7231,0.4347,1.1578
1,baseline_1,0.3938,0.7995,1.1933
2,baseline_random,0.4735,0.4623,0.9358


In [8]:
# Off-policy evaluation: score `epsilon_greedy` on the log produced by `baseline_random`.
# predict() (re)writes the p_pred column; the table returned by cfe() is the cell output.
sim_br.predict(epsilon_greedy)
sim_br.cfe(alpha=0.05)

Unnamed: 0_level_0,Unnamed: 1_level_0,slot_0,slot_1,all_slots
name,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cb,est,0.730676,0.621653,1.352329
cb,lb,0.720036,0.611251,1.331287
cb,ub,0.748207,0.636898,1.385105
ccb,est,0.73017,0.425539,1.155709
ccb,lb,0.714693,0.406913,1.121605
ccb,ub,0.742185,0.44514,1.187325


In [9]:
# Off-policy evaluation: score the deterministic `baseline_1` on the log produced by `baseline_random`.
sim_br.predict(baseline_1)
sim_br.cfe(alpha=0.05)

Unnamed: 0_level_0,Unnamed: 1_level_0,slot_0,slot_1,all_slots
name,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cb,est,0.404058,0.794795,1.198854
cb,lb,0.389466,0.517453,0.906919
cb,ub,0.423099,0.54531,0.968409
ccb,est,0.404058,0.793594,1.197653
ccb,lb,0.387356,0.766814,1.15417
ccb,ub,0.421591,0.813138,1.234729


In [10]:
# On-policy check: the evaluated policy is the one that produced the log, so p_pred equals p_log.
sim_br.predict(baseline_random)
sim_br.cfe(alpha=0.05)

Unnamed: 0_level_0,Unnamed: 1_level_0,slot_0,slot_1,all_slots
name,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cb,est,0.4735,0.4623,0.9358
cb,lb,0.463673,0.45249,0.916163
cb,ub,0.483343,0.472132,0.955474
ccb,est,0.4735,0.4623,0.9358
ccb,lb,0.463714,0.452528,0.916242
ccb,ub,0.483286,0.472072,0.955358


In [11]:
# On-policy check: the evaluated policy is the one that produced the log, so p_pred equals p_log.
sim_eg.predict(epsilon_greedy)
sim_eg.cfe(alpha=0.05)

Unnamed: 0_level_0,Unnamed: 1_level_0,slot_0,slot_1,all_slots
name,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cb,est,0.7231,0.4347,1.1578
cb,lb,0.714216,0.424954,1.13917
cb,ub,0.731854,0.444484,1.176338
ccb,est,0.7231,0.4347,1.1578
ccb,lb,0.71433,0.424984,1.139314
ccb,ub,0.73187,0.444416,1.176286


In [12]:
# Off-policy evaluation: score the deterministic `baseline_1` on the log produced by `epsilon_greedy`.
sim_eg.predict(baseline_1)
sim_eg.cfe(alpha=0.05)

Unnamed: 0_level_0,Unnamed: 1_level_0,slot_0,slot_1,all_slots
name,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cb,est,0.397143,0.823764,1.220907
cb,lb,0.379261,0.108262,0.487523
cb,ub,0.455477,0.121544,0.577021
ccb,est,0.397143,0.813694,1.210837
ccb,lb,0.359082,0.776742,1.135824
ccb,ub,0.435204,0.845568,1.280771


In [13]:
# Off-policy evaluation: score the uniform `baseline_random` on the log produced by `epsilon_greedy`.
sim_eg.predict(baseline_random)
sim_eg.cfe(alpha=0.05)

Unnamed: 0_level_0,Unnamed: 1_level_0,slot_0,slot_1,all_slots
name,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cb,est,0.464995,0.341965,0.80696
cb,lb,0.453566,0.314592,0.768158
cb,ub,0.497813,0.356447,0.854261
ccb,est,0.469457,0.480807,0.950264
ccb,lb,0.452369,0.445497,0.897866
ccb,ub,0.486544,0.530558,1.017102


## With slot dependencies

In [78]:
# Exploration rate used to build the epsilon-greedy policy below.
epsilon = 0.2

# rewards[slot][action]: probability of a reward of 1.
# The rows differ, so the best action depends on the slot
# (action 1 in slot 0, action 2 in slot 1).
rewards = np.array([
    [0.4, 0.8, 0.2],
    [0.2, 0.4, 0.8]])

# Policy tree: each entry is (probability of the action, sub-policy for the
# next slot over the remaining actions). The trailing "#k" comments give the
# action id of each branch.
# The greedy (exploit) branch is action 1 in slot 0 and, in slot 1, the
# remaining action with the highest slot-1 reward.
eg = EpsilonGreedy(epsilon)
epsilon_greedy = [
    (eg.explore(3), [               #0
        (eg.explore(2), []),        #1
        (eg.exploit(2), []),        #2
    ]),
    (eg.exploit(3), [               #1
        (eg.explore(2), []),        #0
        (eg.exploit(2), []),        #2
    ]),
    (eg.explore(3), [               #2
        (eg.explore(2), []),        #0
        (eg.exploit(2), []),        #1
    ])
]

In [79]:
# Number of simulated sessions per logging policy.
n = 10000
# Re-simulate all three logs against the current reward matrix.
# `baseline_1` and `baseline_random` are not redefined in this section and
# come from an earlier cell.
sim_eg = CcbSimulation(n, rewards, epsilon_greedy)
sim_b1 = CcbSimulation(n, rewards, baseline_1)
sim_br = CcbSimulation(n, rewards, baseline_random)

In [80]:
# Observed mean reward per slot for each logging policy (the ground truth the
# counterfactual estimates below are compared against).
# Each frame is filtered with its own mask. The original filtered sim_br with a
# mask built from sim_b1, which only worked because both frames share a layout.
simulation_stats = pd.DataFrame([
    {
        'policy': label,
        'slot_0': sim[sim["slot"] == 0]["r"].mean(),
        'slot_1': sim[sim["slot"] == 1]["r"].mean(),
    }
    for label, sim in [
        (f'epsilon-greedy({epsilon})', sim_eg),
        ('baseline_1', sim_b1),
        ('baseline_random', sim_br),
    ]
])
simulation_stats['all_slots'] = simulation_stats['slot_0'] + simulation_stats['slot_1']
simulation_stats

Unnamed: 0,policy,slot_0,slot_1,all_slots
0,epsilon-greedy(0.2),0.7373,0.72,1.4573
1,baseline_1,0.4002,0.3985,0.7987
2,baseline_random,0.4601,0.4713,0.9314


In [81]:
# Off-policy evaluation: score `epsilon_greedy` on the log produced by `baseline_random`.
# predict() (re)writes the p_pred column; the table returned by cfe() is the cell output.
sim_br.predict(epsilon_greedy)
sim_br.cfe(alpha=0.05)

Unnamed: 0_level_0,Unnamed: 1_level_0,slot_0,slot_1,all_slots
name,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cb,est,0.725579,0.635528,1.361107
cb,lb,0.715363,0.624659,1.340023
cb,ub,0.743687,0.650116,1.393803
ccb,est,0.724979,0.728159,1.453138
ccb,lb,0.709898,0.70625,1.416148
ccb,ub,0.737062,0.743152,1.480214


In [82]:
# Off-policy evaluation: score the deterministic `baseline_1` on the log produced by `baseline_random`.
sim_br.predict(baseline_1)
sim_br.cfe(alpha=0.05)

Unnamed: 0_level_0,Unnamed: 1_level_0,slot_0,slot_1,all_slots
name,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cb,est,0.38032,0.40643,0.78675
cb,lb,0.361502,0.255761,0.617264
cb,ub,0.394712,0.28051,0.675222
ccb,est,0.380807,0.401955,0.782761
ccb,lb,0.358487,0.363905,0.722392
ccb,ub,0.407851,0.446964,0.854815


In [83]:
# On-policy check: the evaluated policy is the one that produced the log, so p_pred equals p_log.
sim_br.predict(baseline_random)
sim_br.cfe(alpha=0.05)

Unnamed: 0_level_0,Unnamed: 1_level_0,slot_0,slot_1,all_slots
name,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cb,est,0.4601,0.4713,0.9314
cb,lb,0.450294,0.461476,0.91177
cb,ub,0.469929,0.481141,0.95107
ccb,est,0.4601,0.4713,0.9314
ccb,lb,0.450331,0.461516,0.911847
ccb,ub,0.469869,0.481084,0.950953


In [84]:
# On-policy check: the evaluated policy is the one that produced the log, so p_pred equals p_log.
sim_eg.predict(epsilon_greedy)
sim_eg.cfe(alpha=0.05)

Unnamed: 0_level_0,Unnamed: 1_level_0,slot_0,slot_1,all_slots
name,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cb,est,0.7373,0.72,1.4573
cb,lb,0.728556,0.711087,1.439642
cb,ub,0.745906,0.728785,1.474691
ccb,est,0.7373,0.72,1.4573
ccb,lb,0.728674,0.7112,1.439874
ccb,ub,0.745926,0.7288,1.474726


In [85]:
# Off-policy evaluation: score the deterministic `baseline_1` on the log produced by `epsilon_greedy`.
sim_eg.predict(baseline_1)
sim_eg.cfe(alpha=0.05)

Unnamed: 0_level_0,Unnamed: 1_level_0,slot_0,slot_1,all_slots
name,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cb,est,0.407307,0.352033,0.75934
cb,lb,0.413261,0.035696,0.448958
cb,ub,0.49017,0.063265,0.553435
ccb,est,0.407307,0.328947,0.736255
ccb,lb,0.368038,0.208528,0.576566
ccb,ub,0.446576,0.449367,0.895943


In [86]:
# Off-policy evaluation: score the uniform `baseline_random` on the log produced by `epsilon_greedy`.
sim_eg.predict(baseline_random)
sim_eg.cfe(alpha=0.05)

Unnamed: 0_level_0,Unnamed: 1_level_0,slot_0,slot_1,all_slots
name,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cb,est,0.474866,0.493034,0.9679
cb,lb,0.455329,0.465431,0.92076
cb,ub,0.499583,0.509716,1.0093
ccb,est,0.475941,0.469529,0.94547
ccb,lb,0.459054,0.440879,0.899934
ccb,ub,0.496702,0.509045,1.005746


## Locked content

In [71]:
# Exploration rate used for the slot-1 choices of the policy below.
epsilon = 0.2

# rewards[slot][action]: probability of a reward of 1.
# Both rows are equal, so an action's reward does not depend on its slot.
rewards = np.array([
    [0.4, 0.8, 0.2],
    [0.4, 0.8, 0.2]])

# Policy tree: each entry is (probability of the action, sub-policy for the
# next slot over the remaining actions). The trailing "#k" comments give the
# action id of each branch.
# Slot 0 is "locked": action 1 is always shown there (probability 1), so the
# log contains no exploration in slot 0. Slot 1 is still epsilon-greedy.
eg = EpsilonGreedy(epsilon)
epsilon_greedy_lock0 = [
    (0, [               #0
        (eg.exploit(2), []),        #1
        (eg.explore(2), []),        #2
    ]),
    (1, [               #1
        (eg.exploit(2), []),        #0
        (eg.explore(2), []),        #2
    ]),
    (0, [               #2
        (eg.explore(2), []),        #0
        (eg.exploit(2), []),        #1
    ])
]

# Uniform policy: every remaining action is equally likely in each slot.
baseline_random = [
    (1/3, [
        (1/2, []),
        (1/2, []),
    ]),
    (1/3, [
        (1/2, []),
        (1/2, []),
    ]),
    (1/3, [
        (1/2, []),
        (1/2, []),
    ])
]

# Deterministic policy: action 0 in slot 0, then action 1 in slot 1.
# The zero-probability branches are only read when scoring logged actions.
baseline_1 = [
    (1, [
        (1, []),
        (0, []),
    ]),
    (0, [
        (0, []),
        (0, []),
    ]),
    (0, [
        (0, []),
        (1, []),
    ])
]

In [72]:
# Number of simulated sessions per logging policy.
n = 10000
# One independent log per logging policy. All three share the same reward matrix
# and draw from the same global random streams, so the order of these calls matters.
sim_eg_lock0 = CcbSimulation(n, rewards, epsilon_greedy_lock0)
sim_b1 = CcbSimulation(n, rewards, baseline_1)
sim_br = CcbSimulation(n, rewards, baseline_random)

In [74]:
# Observed mean reward per slot for each logging policy (the ground truth the
# counterfactual estimates below are compared against).
# Each frame is filtered with its own mask. The original filtered sim_br with a
# mask built from sim_b1, which only worked because both frames share a layout.
simulation_stats = pd.DataFrame([
    {
        'policy': label,
        'slot_0': sim[sim["slot"] == 0]["r"].mean(),
        'slot_1': sim[sim["slot"] == 1]["r"].mean(),
    }
    for label, sim in [
        (f'epsilon-greedy_lock0({epsilon})', sim_eg_lock0),
        ('baseline_1', sim_b1),
        ('baseline_random', sim_br),
    ]
])
simulation_stats['all_slots'] = simulation_stats['slot_0'] + simulation_stats['slot_1']
simulation_stats

Unnamed: 0,policy,slot_0,slot_1,all_slots
0,epsilon-greedy_lock0(0.2),0.8044,0.3715,1.1759
1,baseline_1,0.4031,0.8059,1.209
2,baseline_random,0.4697,0.464,0.9337


In [75]:
# Score `epsilon_greedy` on the log produced by the slot-0-locked policy.
# NOTE(review): `epsilon_greedy` is not defined in this section, so the value
# comes from whichever earlier cell ran last. Confirm that the unlocked policy
# (rather than `epsilon_greedy_lock0`) is the intended target here.
sim_eg_lock0.predict(epsilon_greedy)
sim_eg_lock0.cfe(alpha=0.05)

Unnamed: 0_level_0,Unnamed: 1_level_0,slot_0,slot_1,all_slots
name,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cb,est,0.8044,0.3715,1.1759
cb,lb,0.796487,0.362018,1.158505
cb,ub,0.812135,0.381057,1.193192
ccb,est,0.8044,0.3715,1.1759
ccb,lb,0.796625,0.362029,1.158655
ccb,ub,0.812175,0.380971,1.193145


In [76]:
# Score `baseline_1` on the log produced by the slot-0-locked policy.
# The log only ever shows action 1 in slot 0, and `baseline_1` gives that branch
# (and both of its slot-1 children) probability 0, so p_pred is 0 on every row.
sim_eg_lock0.predict(baseline_1)
sim_eg_lock0.cfe(alpha=0.05)

Unnamed: 0_level_0,Unnamed: 1_level_0,slot_0,slot_1,all_slots
name,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cb,est,0.0,0.0,0.0
cb,lb,0.0,0.0,0.0
cb,ub,0.0,0.0,0.0
ccb,est,0.8044,0.3715,1.1759
ccb,lb,0.0,0.0,0.0
ccb,ub,1.0,1.0,2.0


In [77]:
# Score the uniform `baseline_random` on the log produced by the slot-0-locked policy.
# The log only ever shows action 1 in slot 0 (p_log = 1), so the slot-0 ratio
# p_pred / p_log is the same constant (1/3) on every row.
sim_eg_lock0.predict(baseline_random)
sim_eg_lock0.cfe(alpha=0.05)

Unnamed: 0_level_0,Unnamed: 1_level_0,slot_0,slot_1,all_slots
name,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cb,est,0.8044,0.29273,1.09713
cb,lb,0.263127,0.275783,0.538909
cb,ub,0.273185,0.316262,0.589447
ccb,est,0.8044,0.344975,1.149375
ccb,lb,0.265542,0.093847,0.359389
ccb,ub,0.937392,0.770385,1.707776
