# Simple test

In [None]:
from estimators.ccb import pdis_cressieread
from estimators.bandits import snips
from estimators.bandits import clopper_pearson

# Point estimator and interval taken from `estimators.bandits`.
cb_est, cb_int = snips.Estimator(), clopper_pearson.Interval()

# Point estimator and interval taken from `estimators.ccb`.
ccb_est, ccb_int = pdis_cressieread.Estimator(), pdis_cressieread.Interval()

In [None]:
# Feed one three-element example to the estimator and print its estimate.
# NOTE(review): positional arguments are presumably (p_preds, rs, p_logs),
# matching the keyword order used elsewhere in this notebook — confirm
# against the estimator signature.
ccb_est.add_example([0.1, 0.1, 0.1], [1,2,3], [0.5, 0.5, 0.5])
print(f'estimate: {ccb_est.get()}')

# Feed the same example to the interval object and print its result.
ccb_int.add_example([0.1, 0.1, 0.1], [1,2,3], [0.5, 0.5, 0.5])
print(f'interval: {ccb_int.get()}')

In [None]:
                #action 0           #action 1
# slot 0           1                    0
# slot 1           0                    0.8
# Feed 1000 synthetic two-slot examples into a fresh estimator.
# Odd i:  rewards (1, 0.8), slot-0 prediction probability 1 - epsilon.
# Even i: rewards (0, 0),   slot-0 prediction probability epsilon.
# Slot 0 is always logged with probability 0.5; slot 1 uses probability 1
# for both the prediction and the log.
ccb_est = pdis_cressieread.Estimator()
epsilon = 0.1
for i in range(1000):
    ccb_est.add_example(
        p_preds = [1 - epsilon if i%2 else epsilon, 1],
        rs = [i % 2, 0.8 * (i % 2)],
        p_logs = [0.5, 1])
print(f'estimate: {ccb_est.get()}')

# Simulator

In [None]:
# Actions: a0, a1, a2.
# Slots: s0, s1.

# rewards[i][j] is the probability of observing reward 1 when action j (0, 1, 2) is placed in slot i (0, 1).
# Online (logging) policy: epsilon-greedy.

import numpy as np
import random
import pandas as pd

class EpsilonGreedy:
    """Action probabilities of an epsilon-greedy choice among ``n`` actions.

    With one exploited action and ``n - 1`` explored ones the probabilities
    returned by :meth:`exploit` and :meth:`explore` sum to 1.
    """

    def __init__(self, epsilon):
        # Total probability mass spread uniformly over all candidate actions.
        self.epsilon = epsilon

    def explore(self, n):
        """Return the probability of one non-greedy action among ``n``."""
        uniform_share = self.epsilon / n
        return uniform_share

    def exploit(self, n):
        """Return the probability of the greedy action among ``n``.

        This is the uniform exploration share plus the remaining
        ``1 - epsilon`` mass.
        """
        return self.explore(n) + 1 - self.epsilon

def baseline_random(n):
    """Build the uniform-random policy tree over ``n`` actions.

    The tree is a list with one ``(probability, subtree)`` pair per available
    action. Every action gets probability ``1 / n`` and its subtree is the
    uniform tree over the ``n - 1`` actions that remain. ``n <= 0`` yields an
    empty list.
    """
    tree = []
    for _ in range(n):
        # Each entry gets its own freshly built subtree (no shared objects).
        tree.append((1 / n, baseline_random(n - 1)))
    return tree

def baseline_1(n, excl=None):
    """Build the deterministic "action k goes to slot k" policy tree.

    The tree is a list with one ``(probability, subtree)`` pair per remaining
    action, in ascending action-id order. The current slot index equals the
    number of actions already placed (``len(excl)``). The action whose id
    equals the slot index gets probability 1. If that action was already
    placed in an earlier slot, every remaining action gets probability 0.

    Args:
        n: Number of actions still available at this level.
        excl: Ids of the actions already placed in earlier slots, in slot
            order. ``None`` (the default) means no action has been placed.
            The argument is never modified.

    Returns:
        A list of ``n`` ``(int probability, subtree)`` tuples; an empty list
        when ``n <= 0``.
    """
    if n <= 0:
        return []

    placed = [] if excl is None else list(excl)
    slot = len(placed)
    # Ids of the actions that can still be chosen, computed once per level.
    remaining = sorted(set(range(n + slot)) - set(placed))
    # When the slot's own action is gone, no remaining action gets any mass.
    slot_action_gone = slot in placed

    return [
        (
            0 if slot_action_gone else int(action == slot),
            baseline_1(n - 1, placed + [action]),
        )
        for action in remaining[:n]
    ]

class Traffic:
    """One kind of simulated traffic: a reward matrix, a policy and a weight.

    Attributes are stored exactly as given:
        rewards: per-slot rows of per-action values (indexed ``[slot][action]``).
        policy: policy tree of ``(probability, subtree)`` pairs.
        prob: weight of this traffic kind when several are mixed.
    """

    def __init__(self, rewards, policy, prob=1):
        self.rewards, self.policy, self.prob = rewards, policy, prob

class CcbSimulation(pd.DataFrame):
    """Simulated multi-slot log stored as a DataFrame, one row per (session, slot).

    Columns written by the constructor:
        session: index of the simulated session.
        slot: index of the slot inside the session.
        p_log: probability with which the logging policy picked the action.
        r: observed 0/1 reward.
        chosen: id of the chosen action.
        chosen_idx: position of the chosen action among the remaining ones.

    ``predict`` adds a ``p_pred`` column, and ``cfe`` turns the log into a
    table of estimates and interval bounds.
    """

    def __init__(self, n, traffic: list):
        """Simulate ``n`` sessions.

        Args:
            n: Number of sessions to simulate.
            traffic: Objects exposing ``rewards`` (indexed ``[slot][action]``),
                ``policy`` (tree of ``(probability, subtree)`` pairs) and
                ``prob`` (sampling weight; the weights must form a pmf).
        """
        # Declared explicitly so that an empty simulation (n == 0) still has
        # the full set of log columns.
        log_columns = ['session', 'slot', 'p_log', 'r', 'chosen', 'chosen_idx']
        rows = []
        traffic_pmf = [t.prob for t in traffic]
        for session in range(n):
            # Pick which kind of traffic this session belongs to.
            traffic_idx = np.random.choice(range(len(traffic_pmf)), p=traffic_pmf)
            source = traffic[traffic_idx]
            rewards = source.rewards
            actions = list(range(len(rewards[0])))
            state = source.policy
            for slot in range(len(rewards)):
                # Probabilities of the actions that are still available.
                pmf = [node[0] for node in state]
                chosen_idx = np.random.choice(range(len(pmf)), p=pmf)
                chosen = actions[chosen_idx]
                # Descend into the subtree for the remaining actions.
                state = state[chosen_idx][1]
                # Bernoulli reward with probability rewards[slot][chosen].
                r = int(random.random() < rewards[slot][chosen])
                # An action can be used at most once per session.
                actions = sorted(set(actions) - {chosen})
                rows.append({
                    'session': session,
                    'slot': slot,
                    'p_log': pmf[chosen_idx],
                    'r': r,
                    'chosen': chosen,
                    'chosen_idx': chosen_idx,
                })
        super().__init__(rows, columns=log_columns)

    def predict(self, policy):
        """Store in ``p_pred`` the probability ``policy`` gives each logged choice.

        The policy tree is walked along every session's logged
        ``chosen_idx`` path. Values are assigned in session order, which
        matches the row order produced by the constructor.
        """
        p_pred = []
        for _, session in self.sessions.iterrows():
            state = policy
            for chosen_idx in session['chosen_idx']:
                probability, subtree = state[chosen_idx][0], state[chosen_idx][1]
                p_pred.append(probability)
                state = subtree
        self['p_pred'] = p_pred

    def cfe(self, alpha=0.05):
        """Return a table of estimates and interval bounds for the predicted policy.

        Every session is fed slot by slot to one ``snips`` estimator and one
        ``clopper_pearson`` interval per slot (rows named ``cb``), and as a
        whole to the ``pdis_cressieread`` estimator and interval (rows named
        ``ccb``).

        Args:
            alpha: Passed to the ``get`` method of the interval objects.

        Returns:
            DataFrame indexed by ``(name, metric)`` with one ``slot_<i>``
            column per slot and an ``all_slots`` column holding their sum.

        Raises:
            KeyError: If ``predict`` has not been called yet.
        """
        if 'p_pred' not in self.columns:
            raise KeyError("'p_pred' column is missing; call predict(policy) before cfe()")

        nslots = self['slot'].max() + 1
        cb_est = [snips.Estimator() for _ in range(nslots)]
        cb_int = [clopper_pearson.Interval() for _ in range(nslots)]

        ccb_est = pdis_cressieread.Estimator()
        ccb_int = pdis_cressieread.Interval()

        for _, e in self.sessions.iterrows():
            # Sessions may have fewer slots than nslots; only feed the ones present.
            for i, (p_pred, r, p_log) in enumerate(zip(e['p_pred'], e['r'], e['p_log'])):
                cb_est[i].add_example(p_pred=p_pred, r=r, p_log=p_log)
                cb_int[i].add_example(p_pred=p_pred, r=r, p_log=p_log)

            ccb_int.add_example(p_preds=e['p_pred'], rs=e['r'], p_logs=e['p_log'])
            ccb_est.add_example(p_preds=e['p_pred'], rs=e['r'], p_logs=e['p_log'])

        cb_int_results = [interval.get(alpha) for interval in cb_int]
        ccb_int_result = ccb_int.get(alpha)
        ccb_est_result = ccb_est.get()

        slot_columns = [f'slot_{i}' for i in range(nslots)]

        def make_row(name, metric, values):
            return dict({'name': name, 'metric': metric}, **dict(zip(slot_columns, values)))

        result = pd.DataFrame([
            make_row('cb', 'est', [cb_est[i].get() for i in range(nslots)]),
            make_row('cb', 'lb', [cb_int_results[i][0] for i in range(nslots)]),
            make_row('cb', 'ub', [cb_int_results[i][1] for i in range(nslots)]),
            make_row('ccb', 'est', [ccb_est_result[i] for i in range(nslots)]),
            make_row('ccb', 'lb', [ccb_int_result[i][0] for i in range(nslots)]),
            make_row('ccb', 'ub', [ccb_int_result[i][1] for i in range(nslots)]),
        ])
        # Sum over however many slots the log has, not just slot_0 and slot_1.
        result['all_slots'] = sum(result[column] for column in slot_columns)
        return result.set_index(['name', 'metric'])

    @property
    def sessions(self):
        """Return one row per session with each per-slot column collected into a list."""
        agg = {'p_log': list, 'r': list, 'chosen': list, 'chosen_idx': list}
        if 'p_pred' in self.columns:
            agg['p_pred'] = list
        return self.groupby('session').agg(agg)

## No slot dependencies

In [None]:
epsilon = 0.2

# Reward probability of each action (columns) in each slot (rows). Both rows
# are identical, so the value of an action does not depend on its slot.
rewards = np.array([
    [0.4, 0.8, 0.2],
    [0.4, 0.8, 0.2]])

# Policy tree of (probability, subtree) pairs. The outer list is indexed by
# the action placed in slot 0, each inner list by position among the actions
# that remain for slot 1 (the trailing "#k" comments give the action id).
# At every level the exploit probability goes to the remaining action with
# the highest reward above.
eg = EpsilonGreedy(epsilon)
epsilon_greedy = [
    (eg.explore(3), [               #0
        (eg.exploit(2), []),        #1
        (eg.explore(2), []),        #2
    ]),
    (eg.exploit(3), [               #1
        (eg.exploit(2), []),        #0
        (eg.explore(2), []),        #2
    ]),
    (eg.explore(3), [               #2
        (eg.explore(2), []),        #0
        (eg.exploit(2), []),        #1
    ])
]

In [None]:
# Same slot-independent reward matrix as in the previous cell (rows = slots,
# columns = actions).
rewards = np.array([
    [0.4, 0.8, 0.2],
    [0.4, 0.8, 0.2]])

# Simulate n sessions under each of the three logging policies.
n = 10000
sim_eg = CcbSimulation(n, [Traffic(rewards, epsilon_greedy)])
sim_b1 = CcbSimulation(n, [Traffic(rewards, baseline_1(3))])
sim_br = CcbSimulation(n, [Traffic(rewards, baseline_random(3))])

In [None]:
# Observed mean reward per slot under each logging policy, plus their sum.
simulation_stats = pd.DataFrame([
    {
        'policy': label,
        'slot_0': sim[sim["slot"] == 0]["r"].mean(),
        'slot_1': sim[sim["slot"] == 1]["r"].mean(),
    }
    for label, sim in (
        (f'epsilon-greedy({epsilon})', sim_eg),
        ('baseline_1', sim_b1),
        ('baseline_random', sim_br),
    )
])
simulation_stats['all_slots'] = simulation_stats['slot_0'].add(simulation_stats['slot_1'])
simulation_stats

In [None]:
# Score epsilon_greedy on the log collected under baseline_random and show
# the resulting estimate/interval table.
sim_br.predict(epsilon_greedy)
sim_br.cfe(alpha=0.05)

In [None]:
# Score baseline_1 on the log collected under baseline_random.
sim_br.predict(baseline_1(3))
sim_br.cfe(alpha=0.05)

In [None]:
# Score baseline_random on its own log (evaluated policy = logging policy).
sim_br.predict(baseline_random(3))
sim_br.cfe(alpha=0.05)

In [None]:
# Score epsilon_greedy on its own log (evaluated policy = logging policy).
sim_eg.predict(epsilon_greedy)
sim_eg.cfe(alpha=0.05)

In [None]:
# Score baseline_1 on the log collected under epsilon_greedy.
sim_eg.predict(baseline_1(3))
sim_eg.cfe(alpha=0.05)

In [None]:
# Score baseline_random on the log collected under epsilon_greedy.
sim_eg.predict(baseline_random(3))
sim_eg.cfe(alpha=0.05)

## With slot dependencies

In [None]:
epsilon = 0.2

# Reward probability of each action (columns) in each slot (rows). The rows
# differ, so the best action depends on the slot: action 1 in slot 0 and
# action 2 in slot 1.
rewards = np.array([
    [0.4, 0.8, 0.2],
    [0.2, 0.4, 0.8]])

# Policy tree of (probability, subtree) pairs. The outer list is indexed by
# the action placed in slot 0, each inner list by position among the actions
# that remain for slot 1 (the trailing "#k" comments give the action id).
# Inner exploit probabilities follow the slot-1 reward row.
eg = EpsilonGreedy(epsilon)
epsilon_greedy = [
    (eg.explore(3), [               #0
        (eg.explore(2), []),        #1
        (eg.exploit(2), []),        #2
    ]),
    (eg.exploit(3), [               #1
        (eg.explore(2), []),        #0
        (eg.exploit(2), []),        #2
    ]),
    (eg.explore(3), [               #2
        (eg.explore(2), []),        #0
        (eg.exploit(2), []),        #1
    ])
]

In [None]:
# Simulate n sessions under each logging policy with the slot-dependent rewards.
n = 10000
sim_eg = CcbSimulation(n, [Traffic(rewards, epsilon_greedy)])
sim_b1 = CcbSimulation(n, [Traffic(rewards, baseline_1(3))])
sim_br = CcbSimulation(n, [Traffic(rewards, baseline_random(3))])

In [None]:
# Observed mean reward per slot under each logging policy, plus their sum.
simulation_stats = pd.DataFrame([
    {
        'policy': label,
        'slot_0': sim[sim["slot"] == 0]["r"].mean(),
        'slot_1': sim[sim["slot"] == 1]["r"].mean(),
    }
    for label, sim in (
        (f'epsilon-greedy({epsilon})', sim_eg),
        ('baseline_1', sim_b1),
        ('baseline_random', sim_br),
    )
])
simulation_stats['all_slots'] = simulation_stats['slot_0'].add(simulation_stats['slot_1'])
simulation_stats

In [None]:
# Score epsilon_greedy on the log collected under baseline_random and show
# the resulting estimate/interval table.
sim_br.predict(epsilon_greedy)
sim_br.cfe(alpha=0.05)

In [None]:
# Score baseline_1 on the log collected under baseline_random.
sim_br.predict(baseline_1(3))
sim_br.cfe(alpha=0.05)

In [None]:
# Score baseline_random on its own log (evaluated policy = logging policy).
sim_br.predict(baseline_random(3))
sim_br.cfe(alpha=0.05)

In [None]:
# Score epsilon_greedy on its own log (evaluated policy = logging policy).
sim_eg.predict(epsilon_greedy)
sim_eg.cfe(alpha=0.05)

In [None]:
# Score baseline_1 on the log collected under epsilon_greedy.
sim_eg.predict(baseline_1(3))
sim_eg.cfe(alpha=0.05)

In [None]:
# Score baseline_random on the log collected under epsilon_greedy.
sim_eg.predict(baseline_random(3))
sim_eg.cfe(alpha=0.05)

## Locked content

In [None]:
epsilon = 0.2

# Reward probability of each action (columns) in each slot (rows); identical
# rows, so the value of an action does not depend on its slot.
rewards = np.array([
    [0.4, 0.8, 0.2],
    [0.4, 0.8, 0.2]])

# All three policy trees below are lists of (probability, subtree) pairs and
# lock slot 0: action 1 has probability 1 there, actions 0 and 2 have
# probability 0. They differ only in how slot 1 is filled.

# Slot 1 is epsilon-greedy over the two remaining actions.
eg = EpsilonGreedy(epsilon)
epsilon_greedy_lock0 = [
    (0, [               #0
        (eg.exploit(2), []),        #1
        (eg.explore(2), []),        #2
    ]),
    (1, [               #1
        (eg.exploit(2), []),        #0
        (eg.explore(2), []),        #2
    ]),
    (0, [               #2
        (eg.explore(2), []),        #0
        (eg.exploit(2), []),        #1
    ])
]

# Slot 1 is uniform over the two remaining actions.
baseline_random_lock0 = [
    (0, [
        (1/2, []),
        (1/2, []),
    ]),
    (1, [
        (1/2, []),
        (1/2, []),
    ]),
    (0, [
        (1/2, []),
        (1/2, []),
    ])
]

# Slot 1 is deterministic: the first remaining action gets probability 1,
# except in the first branch (slot-0 action 0), where both entries are 0.
baseline_1_lock0 = [
    (0, [
        (0, []),
        (0, []),
    ]),
    (1, [
        (1, []),
        (0, []),
    ]),
    (0, [
        (1, []),
        (0, []),
    ])
]

In [None]:
# Simulate n sessions under each of the three slot-0-locked logging policies.
n = 10000
sim_eg_lock0 = CcbSimulation(n, [Traffic(rewards, epsilon_greedy_lock0)])
sim_b1 = CcbSimulation(n, [Traffic(rewards, baseline_1_lock0)])
sim_br = CcbSimulation(n, [Traffic(rewards, baseline_random_lock0)])

In [None]:
# Observed mean reward per slot under each locked logging policy, plus their sum.
simulation_stats = pd.DataFrame([
    {
        'policy': label,
        'slot_0': sim[sim["slot"] == 0]["r"].mean(),
        'slot_1': sim[sim["slot"] == 1]["r"].mean(),
    }
    for label, sim in (
        (f'epsilon-greedy_lock0({epsilon})', sim_eg_lock0),
        ('baseline_1', sim_b1),
        ('baseline_random', sim_br),
    )
])
simulation_stats['all_slots'] = simulation_stats['slot_0'].add(simulation_stats['slot_1'])
simulation_stats

In [None]:
# Score epsilon_greedy_lock0 on its own log (evaluated policy = logging policy).
sim_eg_lock0.predict(epsilon_greedy_lock0)
sim_eg_lock0.cfe(alpha=0.05)

In [None]:
# Score baseline_1_lock0 on the log collected under epsilon_greedy_lock0.
sim_eg_lock0.predict(baseline_1_lock0)
sim_eg_lock0.cfe(alpha=0.05)

In [None]:
# Score baseline_random_lock0 on the log collected under epsilon_greedy_lock0.
sim_eg_lock0.predict(baseline_random_lock0)
sim_eg_lock0.cfe(alpha=0.05)

## Variable slots count

In [None]:
epsilon = 0.2

# Two traffic shapes over the same two actions: rewards_1 has a single slot,
# rewards_2 has two slots (rows = slots, columns = actions).
rewards_1 = np.array([
    [0.8, 0.4]])

rewards_2 = np.array([
    [0.8, 0.4],
    [0.2, 0.8]])

eg = EpsilonGreedy(epsilon)

# Policy tree of (probability, subtree) pairs: slot 0 exploits action 0;
# whichever single action remains is placed in slot 1 with probability 1.
epsilon_greedy = [
    (eg.exploit(2), [
        (1, [])
    ]),
    (eg.explore(2), [
        (1, [])
    ])
]

In [None]:
# Simulate n sessions per logging policy; each session is drawn with equal
# weight (0.5 / 0.5) from the one-slot and the two-slot traffic.
n = 10000
sim_eg = CcbSimulation(n, [Traffic(rewards_1, epsilon_greedy, 0.5), Traffic(rewards_2, epsilon_greedy, 0.5)])
sim_b1 = CcbSimulation(n, [Traffic(rewards_1, baseline_1(2), 0.5), Traffic(rewards_2, baseline_1(2), 0.5)])
sim_br = CcbSimulation(n, [Traffic(rewards_1, baseline_random(2), 0.5), Traffic(rewards_2, baseline_random(2), 0.5)])

In [None]:
# Observed mean reward per slot under each logging policy, plus their sum.
# The slot_1 mean only covers the sessions that actually have a second slot.
simulation_stats = pd.DataFrame([
    {
        'policy': label,
        'slot_0': sim[sim["slot"] == 0]["r"].mean(),
        'slot_1': sim[sim["slot"] == 1]["r"].mean(),
    }
    for label, sim in (
        (f'epsilon-greedy({epsilon})', sim_eg),
        ('baseline_1', sim_b1),
        ('baseline_random', sim_br),
    )
])
simulation_stats['all_slots'] = simulation_stats['slot_0'].add(simulation_stats['slot_1'])
simulation_stats

In [None]:
# Score baseline_random on the mixed-slot-count log collected under epsilon_greedy.
sim_eg.predict(baseline_random(2))
sim_eg.cfe(alpha=0.05)

In [None]:
# Score baseline_1 on the mixed-slot-count log collected under epsilon_greedy.
sim_eg.predict(baseline_1(2))
sim_eg.cfe(alpha=0.05)

In [None]:
# Score epsilon_greedy on its own mixed-slot-count log (evaluated policy =
# logging policy).
sim_eg.predict(epsilon_greedy)
sim_eg.cfe(alpha=0.05)