In [1]:
from functools import partial
import time
from typing import Callable
import numpy as np
import pandas as pd
from scipy import stats
from sim_lib import simulation

pd.options.mode.chained_assignment = None

In [2]:
def evaluate_policy(policy: Callable, seed: int = 18475) -> dict:
    # seed for homework
    seed = 18475
    np.random.seed(seed=seed)

    start = time.time()
    output = simulation(policy, n=200000, seed=seed)
    end = time.time()

    print(f"Simulation time: {end - start:.3f} s")
    print(f"Total regret: {output['regret']}")
    print(f"Regret / rounds: {output['regret'] / output['rounds']}")
    print(f"Total banners: {output['total_banners']}")
    return output

## Eps-greedy (из примера)

In [3]:
def eps_greedy(history: pd.DataFrame, eps: float):
    if stats.uniform.rvs() < eps:
        n = history.shape[0]
        return history.index[stats.randint.rvs(0, n)]

    ctr = history['clicks'] / (history['impressions'] + 10)
    n = np.argmax(ctr)
    return history.index[n]

In [4]:
output = evaluate_policy(partial(eps_greedy, eps=0.06))
output["history"]

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated
Simulation time: 592.700 s
Total regret: 1540.7609683932544
Regret / rounds: 0.007703804841966272
Total banners: 184


Unnamed: 0,impressions,clicks,lifetime,p
153,18970.0,4199.0,18003.025431,0.220134
162,228.0,26.0,1537.166719,0.11378
172,163.0,32.0,19648.592394,0.219968
173,170.0,18.0,12771.47499,0.122694
180,26.0,3.0,4655.819793,0.020061
182,6.0,0.0,889.624649,0.004621
183,1.0,0.0,15187.163761,0.073886


## Upper Confidence Bound

In [5]:
def ucb(history: pd.DataFrame, ni_min: int = 10, gamma: float = 1.0):
    ni = np.maximum(history["impressions"], ni_min)
    theta = history["clicks"] / ni
    t = ni.sum()
    ci = (2 * np.log(t) / ni)
    i = np.argmax(theta + gamma * ci)
    return history.index[i]

In [6]:
output = evaluate_policy(partial(ucb, ni_min=10, gamma=0.1))
output["history"]

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated
Simulation time: 627.003 s
Total regret: 110.39099426399599
Regret / rounds: 0.00055195497131998
Total banners: 184


Unnamed: 0,impressions,clicks,lifetime,p
153,19848.0,4331.0,18003.025431,0.220134
162,0.0,0.0,1537.166719,0.11378
172,0.0,0.0,19648.592394,0.219968
173,0.0,0.0,12771.47499,0.122694
180,0.0,0.0,4655.819793,0.020061
182,0.0,0.0,889.624649,0.004621
183,0.0,0.0,15187.163761,0.073886


Результаты проведенных тестов (см. файл `bandits.py`)

|   |      regret | rounds | total_banners | ni_min | gamma|
|---|-------------|--------|---------------|--------|------|
|0  | 2985.412955 | 200000 |           184 |      2 | 0.001|
|1  | 1432.269049 | 200000 |           184 |      2 | 0.010|
|2  |  398.318081 | 200000 |           184 |      2 | 0.100|
|3  | 3399.684654 | 200000 |           184 |      2 | 1.000|
|4  | 5923.768694 | 200000 |           184 |     10 | 0.001|
|5  | 5923.768694 | 200000 |           184 |     10 | 0.010|
|6  |  110.390994 | 200000 |           184 |     10 | 0.100|
|7  | 3412.880886 | 200000 |           184 |     10 | 1.000|
|8  | 6880.916714 | 200000 |           184 |     20 | 0.001|
|9  | 6880.916714 | 200000 |           184 |     20 | 0.010|
|10 | 3138.516322 | 200000 |           184 |     20 | 0.100|
|11 | 3417.663378 | 200000 |           184 |     20 | 1.000|
|12 | 8363.522799 | 200000 |           184 |     50 | 0.001|
|13 | 8363.522799 | 200000 |           184 |     50 | 0.010|
|14 | 7487.286281 | 200000 |           184 |     50 | 0.100|
|15 | 3406.256220 | 200000 |           184 |     50 | 1.000|

## Thompson Sampling

In [7]:
def thompson(history: pd.DataFrame, alpha: float = 1., beta: float = 1.):
    k = history["clicks"]
    n = history["impressions"]
    a = alpha + k
    b = beta + n - k
    samples = [stats.beta.rvs(a_i, b_i) for a_i, b_i in zip(a, b)]
    i = np.argmax(samples)
    return history.index[i]

In [8]:
output = evaluate_policy(partial(thompson, alpha=1.0, beta=50.0))
output["history"]

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated
Simulation time: 657.839 s
Total regret: 129.59242326828794
Regret / rounds: 0.0006479621163414397
Total banners: 184


Unnamed: 0,impressions,clicks,lifetime,p
153,16.0,3.0,18003.025431,0.220134
162,1.0,0.0,1537.166719,0.11378
172,19803.0,4382.0,19648.592394,0.219968
173,6.0,1.0,12771.47499,0.122694
180,0.0,0.0,4655.819793,0.020061
182,0.0,0.0,889.624649,0.004621
183,0.0,0.0,15187.163761,0.073886


Результаты проведенных тестов (см. файл `bandits.py`)

|   |      regret | rounds | total_banners | alpha | beta|
|---|-------------|--------|---------------|-------|------
|0  |  898.926128 | 200000 |           184 |   0.1 |  0.1|
|1  |  789.778888 | 200000 |           184 |   0.1 |  1.0|
|2  |  425.738000 | 200000 |           184 |   0.1 | 10.0|
|3  | 6633.268969 | 200000 |           184 |   0.1 | 50.0|
|4  | 1077.248709 | 200000 |           184 |   0.5 |  0.1|
|5  | 1067.258042 | 200000 |           184 |   0.5 |  1.0|
|6  |  649.704942 | 200000 |           184 |   0.5 | 10.0|
|7  |  302.300936 | 200000 |           184 |   0.5 | 50.0|
|8  | 1297.521119 | 200000 |           184 |   1.0 |  0.1|
|9  | 1223.610364 | 200000 |           184 |   1.0 |  1.0|
|10 |  923.393581 | 200000 |           184 |   1.0 | 10.0|
|11 |  129.592423 | 200000 |           184 |   1.0 | 50.0|
|12 | 2167.432744 | 200000 |           184 |   5.0 |  0.1|
|13 | 2125.018108 | 200000 |           184 |   5.0 |  1.0|
|14 | 1820.372268 | 200000 |           184 |   5.0 | 10.0|
|15 |  509.913556 | 200000 |           184 |   5.0 | 50.0|

Видно, что оба бандита оказались достаточно жадными -- при наличии достаточно удачного действия они даже не пробуют новые.