In [1]:
import numpy as np
import pandas as pd
import time
from functools import partial

from scipy.stats import randint, uniform
from sim_lib import simulation

pd.options.mode.chained_assignment = None


In [2]:
def eps_greedy(history: pd.DataFrame, eps: float, smoothing: float = 10):
    """Epsilon-greedy banner selection.

    With probability ``eps`` a banner is drawn uniformly at random from
    ``history.index`` (exploration); otherwise the banner with the highest
    smoothed click-through rate ``clicks / (impressions + smoothing)`` is
    returned (exploitation).

    Parameters
    ----------
    history : pd.DataFrame
        One row per candidate banner, indexed by banner id, with numeric
        ``'clicks'`` and ``'impressions'`` columns.
    eps : float
        Exploration probability.
    smoothing : float, default 10
        Constant added to the impression count in the CTR denominator, which
        lowers the estimate for banners that have only a few impressions.

    Returns
    -------
    The index label of the selected banner.
    """
    # Draw order matters for seeded runs: one uniform draw always, followed by
    # one randint draw only on the exploration branch.
    if uniform.rvs() < eps:
        n_banners = history.shape[0]
        return history.index[randint.rvs(0, n_banners)]

    ctr = history['clicks'] / (history['impressions'] + smoothing)
    # idxmax yields the label of the first maximum directly, so the result does
    # not depend on whether argmax on a Series returns a position or a label.
    return ctr.idxmax()


# Baseline policy: epsilon-greedy with a 6% exploration rate.
policy = partial(eps_greedy, eps=0.06)

In [3]:
# Fixed homework seed so the run is reproducible.
seed = 18475
np.random.seed(seed=seed)

# perf_counter is a monotonic clock intended for measuring intervals, whereas
# time.time() follows the wall clock, which can be adjusted during a long run.
start = time.perf_counter()
output = simulation(policy, n=200000, seed=seed)
end = time.perf_counter()
end - start  # elapsed seconds

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


212.85634803771973

In [6]:
# Baseline (eps-greedy) result as a tuple:
# (output['regret'], regret divided by the number of rounds, output['total_banners']).
output['regret'], output['regret']/output['rounds'],  output['total_banners']

(1540.7609683932544, 0.007703804841966272, 184)

In [7]:
# Show the per-banner `history` table (impressions, clicks, lifetime, p)
# returned by the most recent simulation run.
output['history']

Unnamed: 0,impressions,clicks,lifetime,p
153,18970.0,4199.0,18003.025431,0.220134
162,228.0,26.0,1537.166719,0.11378
172,163.0,32.0,19648.592394,0.219968
173,170.0,18.0,12771.47499,0.122694
180,26.0,3.0,4655.819793,0.020061
182,6.0,0.0,889.624649,0.004621
183,1.0,0.0,15187.163761,0.073886


### Thompson sampling policy

In [8]:
from scipy.stats import beta

class TS:
    """Thompson-sampling banner policy with one Beta posterior per banner.

    Every banner present in ``history`` gets its own ``[alpha, beta]`` pair,
    initialised from the prior passed to the constructor. On each call the
    outcome of the previously chosen banner is folded into its pair, one value
    is sampled from every banner's Beta distribution, and the banner with the
    largest sample is returned.

    Parameters
    ----------
    alpha, beta : float
        Parameters of the Beta prior assigned to every newly seen banner.
    sync_policy : callable, optional
        Called as ``sync_policy(history)`` after the global NumPy RNG state has
        been restored, so that this policy advances the RNG the same way the
        reference policy would. When omitted, the module-level ``policy`` is
        used (the original behaviour).
    """

    def __init__(self, alpha: float, beta: float, sync_policy=None):
        # NB: the ``beta`` argument shadows scipy's ``beta`` distribution, but
        # only inside ``__init__``.
        self.a = alpha
        self.b = beta
        self.sync_policy = sync_policy
        self.prev_action = None
        self.prev_history = None
        self.banners_dict = dict()

    def __call__(self, history: pd.DataFrame):
        """Update the posteriors from ``history`` and return the chosen banner label."""
        # Save the global RNG state so it can be restored after sampling (the
        # goal is a simulation identical to the eps-greedy run).
        # NOTE(review): the Beta draws below therefore start from the same
        # global state that is restored afterwards; presumably the simulator
        # draws from that same stream - confirm this coupling is acceptable.
        state = np.random.get_state()

        # Drop banners that are no longer in the history, then register new
        # ones with the prior. Existing entries keep their insertion order.
        self.banners_dict = {
            banner: params
            for banner, params in self.banners_dict.items()
            if banner in history.index
        }
        for banner in history.index:
            if banner not in self.banners_dict:
                self.banners_dict[banner] = [self.a, self.b]

        # Outcome of the previously shown banner.
        reward = self.calc_reward(history)

        # A click adds 1 to that banner's alpha; a miss adds 1 to its beta.
        if reward == 1:
            self.banners_dict[self.prev_action][0] += 1
        elif reward == -1:
            self.banners_dict[self.prev_action][1] += 1

        # Sample the next action.
        action = self.sample_action()

        # Remember what was shown and the history it was chosen from.
        # NOTE(review): only a reference is stored; this assumes the caller
        # does not mutate this frame in place before the next call - confirm.
        self.prev_action = action
        self.prev_history = history

        # Restore the RNG state and let the reference policy consume its usual
        # random numbers, so the simulation sees the same stream as before.
        np.random.set_state(state)
        sync = self.sync_policy if self.sync_policy is not None else policy
        sync(history)

        return action

    def calc_reward(self, history: pd.DataFrame):
        """Return the outcome of the previous action.

        Returns 0 when there is no previous step or the previous banner is not
        in ``history``. Otherwise returns ``2 * (change in that banner's
        'clicks') - 1``, i.e. +1 when the click count grew by one and -1 when
        it did not change.
        """
        if self.prev_history is None:
            return 0
        if self.prev_action not in history.index:
            return 0
        clicks_now = history.loc[self.prev_action, 'clicks']
        clicks_before = self.prev_history.loc[self.prev_action, 'clicks']
        return (clicks_now - clicks_before) * 2 - 1

    def sample_action(self):
        """Draw one Beta sample per banner and return the banner with the largest draw.

        Returns ``None`` only when no banner yields a comparable sample (for
        example when ``banners_dict`` is empty).
        """
        best_banner = None
        # Start from -inf rather than 0 so that a banner is still selected when
        # every draw is exactly 0.0 (possible through underflow for very small
        # alpha).
        max_theta = float('-inf')

        for banner, params in self.banners_dict.items():
            theta = beta.rvs(*params)

            # Strict comparison keeps the first banner on ties.
            if theta > max_theta:
                best_banner = banner
                max_theta = theta

        return best_banner

Возьмем равномерное распределение $\text{Beta}(1, 1)$ в качестве априорного

In [9]:
# Fixed homework seed so the run is reproducible.
seed = 18475
np.random.seed(seed=seed)

# Thompson sampling with Beta(1, 1) as the prior for every banner.
TS_policy = TS(1, 1)

# perf_counter is a monotonic clock intended for measuring intervals, whereas
# time.time() follows the wall clock, which can be adjusted during a long run.
start = time.perf_counter()
output = simulation(TS_policy, n=200000, seed=seed)
end = time.perf_counter()
end - start  # elapsed seconds

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


283.08031487464905

In [12]:
# TS with Beta(1, 1) prior, result as a tuple:
# (output['regret'], regret divided by the number of rounds, output['total_banners']).
output['regret'], output['regret']/output['rounds'],  output['total_banners']

(1142.7725594275578, 0.005713862797137789, 184)

In [13]:
# Show the per-banner `history` table (impressions, clicks, lifetime, p)
# returned by the most recent simulation run.
output['history']

Unnamed: 0,impressions,clicks,lifetime,p
153,632.0,109.0,18003.025431,0.220134
162,220.0,29.0,1537.166719,0.11378
172,20237.0,4506.0,19648.592394,0.219968
173,303.0,44.0,12771.47499,0.122694
180,22.0,0.0,4655.819793,0.020061
182,19.0,0.0,889.624649,0.004621
183,24.0,1.0,15187.163761,0.073886


Из файла `sim_lib.py` видно, что параметр $p$ для каждого баннера семплируется из распределения $\text{Beta}(1, 20)$. Исходя из этого, а также из соображения, что для большинства баннеров вероятность клика низкая, возьмём $\text{Beta}(1, 20)$ в качестве априорного распределения.

In [14]:
# Fixed homework seed so the run is reproducible.
seed = 18475
np.random.seed(seed=seed)

# Thompson sampling with Beta(1, 20) as the prior for every banner.
TS_policy = TS(1, 20)

# perf_counter is a monotonic clock intended for measuring intervals, whereas
# time.time() follows the wall clock, which can be adjusted during a long run.
start = time.perf_counter()
output = simulation(TS_policy, n=200000, seed=seed)
end = time.perf_counter()
end - start  # elapsed seconds

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


283.13370394706726

In [15]:
# TS with Beta(1, 20) prior, result as a tuple:
# (output['regret'], regret divided by the number of rounds, output['total_banners']).
output['regret'], output['regret']/output['rounds'],  output['total_banners']

(416.21060439113535, 0.0020810530219556767, 184)

In [16]:
# Show the per-banner `history` table (impressions, clicks, lifetime, p)
# returned by the most recent simulation run.
output['history']

Unnamed: 0,impressions,clicks,lifetime,p
153,29027.0,6468.0,18003.025431,0.220134
162,26.0,1.0,1537.166719,0.11378
172,629.0,115.0,19648.592394,0.219968
173,63.0,6.0,12771.47499,0.122694
180,4.0,0.0,4655.819793,0.020061
182,0.0,0.0,889.624649,0.004621
183,3.0,0.0,15187.163761,0.073886


Очевидная проблема TS, выявленная в ходе экспериментов, состоит в том, что клик по новому баннеру сильно меняет апостериорное распределение (например, от $\text{Beta}(1, 20)$ к $\text{Beta}(2, 20)$: среднее почти удваивается, с $1/21$ до $2/22$). Из-за этого такой баннер будет выбираться чаще, пока его апостериорное распределение не приблизится к истинному. В связи с этим я попробовал увеличить априорное значение параметра $\beta$, чтобы снизить влияние кликов по новым баннерам.

In [17]:
# Fixed homework seed so the run is reproducible.
seed = 18475
np.random.seed(seed=seed)

# Thompson sampling with Beta(1, 50) as the prior for every banner.
TS_policy = TS(1, 50)

# perf_counter is a monotonic clock intended for measuring intervals, whereas
# time.time() follows the wall clock, which can be adjusted during a long run.
start = time.perf_counter()
output = simulation(TS_policy, n=200000, seed=seed)
end = time.perf_counter()
end - start  # elapsed seconds

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


282.6952168941498

In [19]:
# TS with Beta(1, 50) prior, result as a tuple:
# (output['regret'], regret divided by the number of rounds, output['total_banners']).
output['regret'], output['regret']/output['rounds'],  output['total_banners']

(131.7253439665951, 0.0006586267198329755, 184)

In [20]:
# Show the per-banner `history` table (impressions, clicks, lifetime, p)
# returned by the most recent simulation run.
output['history']

Unnamed: 0,impressions,clicks,lifetime,p
153,21368.0,4736.0,18003.025431,0.220134
162,16.0,1.0,1537.166719,0.11378
172,0.0,0.0,19648.592394,0.219968
173,0.0,0.0,12771.47499,0.122694
180,0.0,0.0,4655.819793,0.020061
182,0.0,0.0,889.624649,0.004621
183,0.0,0.0,15187.163761,0.073886


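Также я попробовал другие параметры априорного бета-распределения (в скобках — суммарный regret, regret на один раунд и общее число баннеров):
- увеличение exploitation (более «сильное» априорное распределение с тем же средним $\alpha / (\alpha + \beta) = 1/21$):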

    - $\text{Beta}(1.5, 30)$: (335.304692002386, 0.0016765234600119299, 184)
    - $\text{Beta}(2, 40)$: (228.69432126251496, 0.0011434716063125749, 184)
    - $\text{Beta}(3, 60)$: (291.4213570945607, 0.0014571067854728034, 184)
    - $\text{Beta}(10, 200)$: (387.12372923473913, 0.0019356186461736956, 184)
    

- влияние параметра $\beta$ (при $\alpha = 1$):

    - $\text{Beta}(1, 10)$: (783.5726370426416, 0.003917863185213208, 184)
    - $\text{Beta}(1, 30)$: (227.11568861560673, 0.0011355784430780336, 184)
    - $\text{Beta}(1, 40)$: (169.11391099978036, 0.0008455695549989018, 184)
    - $\text{Beta}(1, 60)$: (277.42508099730816, 0.0013871254049865407, 184)

### Результаты

Наилучшей TS-политикой оказалась политика с априорным распределением $\text{Beta}(1, 50)$: её regret составил **131.72**, что существенно ниже regret бейзлайна ($\varepsilon$-greedy) — **1540.76**.