In [1]:
import numpy as np
import pandas as pd
import time
from functools import partial

from scipy.stats import randint, uniform
from sim_lib import simulation

pd.options.mode.chained_assignment = None


In [2]:
def eps_greedy(history: pd.DataFrame, eps: float):
    if uniform.rvs() < eps:
        n = history.shape[0]
        return history.index[randint.rvs(0, n)]

    ctr = history['clicks'] / (history['impressions'] + 10)
    n = np.argmax(ctr)
    return history.index[n]

policy = partial(eps_greedy, eps=0.06)

In [3]:
# seed for homework
seed = 18475
np.random.seed(seed=seed)

start = time.time()
output = simulation(policy, n=200000, seed=seed)
end = time.time()
end - start

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


330.8388783931732

In [4]:
# baseline regret
print(f"Baseline Solution...\nTotal regret: {output['regret']}\nMean regret: {output['regret']/output['rounds']}\nTotal banners: {output['total_banners']}")

Baseline Solution...
Total regret: 1540.7609683932544
Mean regret: 0.007703804841966272
Total banners: 184


In [5]:
output['history']

Unnamed: 0,impressions,clicks,lifetime,p
153,18970.0,4199.0,18003.025431,0.220134
162,228.0,26.0,1537.166719,0.11378
172,163.0,32.0,19648.592394,0.219968
173,170.0,18.0,12771.47499,0.122694
180,26.0,3.0,4655.819793,0.020061
182,6.0,0.0,889.624649,0.004621
183,1.0,0.0,15187.163761,0.073886


Реализуем алгоритм `Thompson Sampling` 

In [6]:
class ThompsonBanditArm:
    # Задаем априорное распределение, по умолчанию ~ Beta(1, 1)
    def __init__(self, arm_id, alpha_0, beta_0, prev_click, prev_impr):
        self.alpha = alpha_0
        self.beta = beta_0
        self.id = arm_id
        self.prev_click = prev_click
        self.prev_impr = prev_impr
        
    def add_success(self, n_times):
        self.alpha += n_times
    
    def add_failure(self, n_times):
        self.beta += n_times
        
    def get_sample(self):
        return np.random.beta(self.alpha, self.beta)

In [7]:
class MultiArmedBanditThompsonSampling:
    def __init__(self, alpha_0=1, beta_0=1):
        self.arms_by_id = {}
        self.alpha = alpha_0
        self.beta = beta_0
        
    def __call__(self, history):
        # Инициализируем состояние
        self.update_current_state(history)
        # Находим лучшую ручку, за которую можем дернуть, из сэмплов
        winner_arm_id = self.get_best_arm_from_samples(history)
        # Обновляем распределения ручек на основе посчитанного reward - кликов
        self.update_arms_reward(history)
        return winner_arm_id
    
    def get_best_arm_from_samples(self, history):
        max_sample = -1
        max_arm_id = -1
        
        for i in self.arms_by_id.keys():
            sample = self.arms_by_id[i].get_sample()
            if sample > max_sample:
                max_sample = sample
                max_arm_id = i
                
        return max_arm_id

    def update_arms_reward(self, history):
        for i in history.index:
            n = history.loc[i, :].clicks - self.arms_by_id[i].prev_click
            k = history.loc[i, :].impressions - self.arms_by_id[i].prev_impr
            if n > 0:
                self.arms_by_id[i].add_success(n)
            elif k > 0:
                self.arms_by_id[i].add_failure(k)
            self.arms_by_id[i].prev_impr = history.loc[i, :].impressions
            self.arms_by_id[i].prev_click = history.loc[i, :].clicks 

    def update_current_state(self, history):        
        history_ids = set(list(history.index))
        curr_ids = set(self.arms_by_id.keys())
        all_ids = set(history_ids | curr_ids)
        
        for i in all_ids:
            if i not in history_ids:
                del self.arms_by_id[i]
            if i not in curr_ids:
                cur_click = history.loc[i, :].clicks
                cur_impr = history.loc[i, :].impressions
                arm = ThompsonBanditArm(i, self.alpha, self.beta, cur_click, cur_impr)
                self.arms_by_id[i] = arm

In [8]:
# seed for homework
seed = 18475
np.random.seed(seed=seed)

thompson_sampling_policy = MultiArmedBanditThompsonSampling()

start = time.time()
output = simulation(thompson_sampling_policy, n=200000, seed=seed)
end = time.time()
end - start

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


705.8096575737

In [9]:
print(f"Thompson Sampling MultiArmed Bandit Solution...\nTotal regret: {output['regret']}\nMean regret: {output['regret']/output['rounds']}\nTotal banners: {output['total_banners']}")

Thompson Sampling MultiArmed Bandit Solution...
Total regret: 1199.4502123393124
Mean regret: 0.005997251061696562
Total banners: 184


In [10]:
output['history']

Unnamed: 0,impressions,clicks,lifetime,p
153,3334.0,685.0,18003.025431,0.220134
162,131.0,14.0,1537.166719,0.11378
172,17538.0,3805.0,19648.592394,0.219968
173,37.0,0.0,12771.47499,0.122694
180,23.0,0.0,4655.819793,0.020061
182,26.0,1.0,889.624649,0.004621
183,36.0,3.0,15187.163761,0.073886


Как видим, используя многорукого бандита на основе `Thompson sampling` нам удалось побить `e-greedy` бандита и получить меньший regard

Также в поисках наименьшего regard попробуем задавать разные априорные распределения - точнее будем менять параметры $\alpha$ и $\beta$ в ~ $Beta(\alpha, \beta)$

In [11]:
alpha = []
beta = []
total_regret = []
mean_regret = []
total_banners = []
times = []

In [12]:
alpha.append(1)
beta.append(1)
total_regret.append(output['regret'])
mean_regret.append(output['regret']/output['rounds'])
total_banners.append(output['total_banners'])
times.append(end - start)

In [13]:
for a in [1, 2]:
    for b in [1, 10, 20, 50]:
        if a == 1 and b == 1:
            continue
        print(f'alpha : {a}, beta {b}')
        alpha.append(a)
        beta.append(b)
        
        # seed for homework
        seed = 18475
        np.random.seed(seed=seed)
        thompson_sampling_policy = MultiArmedBanditThompsonSampling(a, b)

        start = time.time()
        output = simulation(thompson_sampling_policy, n=200000, seed=seed)
        end = time.time()
        end - start
        
        total_regret.append(output['regret'])
        mean_regret.append(output['regret']/output['rounds'])
        total_banners.append(output['total_banners'])
        times.append(end - start)
        
        print(total_regret[-1])
        print(times[-1])

alpha : 1, beta 10
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated
845.0379905762129
715.2607271671295
alpha : 1, beta 20
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impres

In [15]:
a = 1
b = 100
alpha.append(a)
beta.append(b)

# seed for homework
seed = 18475
np.random.seed(seed=seed)
thompson_sampling_policy = MultiArmedBanditThompsonSampling(a, b)

start = time.time()
output = simulation(thompson_sampling_policy, n=200000, seed=seed)
end = time.time()
end - start

total_regret.append(output['regret'])
mean_regret.append(output['regret']/output['rounds'])
total_banners.append(output['total_banners'])
times.append(end - start)

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


In [16]:
results = pd.DataFrame({'Alpha' : alpha,                        
                        'Beta' : beta,
                        'Total Regret' : total_regret,
                        'Mean Regret' : mean_regret,
                        'Total Banners' : total_banners,
                        'Time' : times})
results

Unnamed: 0,Alpha,Beta,Total Regret,Mean Regret,Total Banners,Time
0,1,1,1199.450212,0.005997,184,705.809658
1,1,10,845.037991,0.004225,184,715.260727
2,1,20,505.5406,0.002528,184,720.732769
3,1,50,344.193317,0.001721,184,725.656679
4,2,1,1467.798101,0.007339,184,713.828681
5,2,10,1209.557409,0.006048,184,713.182884
6,2,20,813.675418,0.004068,184,740.001664
7,2,50,226.283319,0.001131,184,743.30812
8,1,100,7313.152318,0.036566,184,742.010126


Как видим, при $\beta=50$ и $\alpha=2$ достигается минимальный результат для regret