In [1]:
import numpy as np
import pandas as pd
import time
import json
from functools import partial
from typing import Dict, Callable, Union
from tqdm.notebook import tqdm
from scipy.stats import randint, uniform
from sim_lib import simulation

pd.options.mode.chained_assignment = None


In [2]:
def eps_greedy(history: pd.DataFrame, eps: float):
    """Pick a banner with an epsilon-greedy rule.

    One uniform draw is compared against ``eps``. If the draw is below
    ``eps`` a uniformly random row label of ``history`` is returned;
    otherwise the label of the row with the highest smoothed CTR,
    ``clicks / (impressions + 10)``, is returned.

    Args:
        history: Table with ``clicks`` and ``impressions`` columns; its
            index labels are the values this policy returns.
        eps: Threshold for the exploration draw.

    Returns:
        An index label taken from ``history``.
    """
    explore = uniform.rvs() < eps
    if explore:
        random_pos = randint.rvs(0, len(history))
        return history.index[random_pos]

    # The +10 in the denominator lowers the estimate for rows with few
    # impressions and keeps the division defined when impressions == 0.
    smoothed_ctr = history['clicks'] / (history['impressions'] + 10)
    return history.index[np.argmax(smoothed_ctr)]

Вынесем код оценки полиси в отдельную функцию. В ней как раз соберутся параметры из условия домашки (seed, количество итераций в симуляции).

In [3]:
def eval_policy(policy: Callable) -> Dict[str, Union[int, float]]:
    """Run the homework simulation for ``policy`` and summarize the result.

    The NumPy global RNG is seeded with the fixed homework seed, then
    ``simulation`` is called with ``n=200000`` and the same seed.

    Args:
        policy: Callable handed to ``simulation`` unchanged.

    Returns:
        A dict with ``regret`` and ``total_banners`` copied from the
        simulation output, plus ``mean_regret`` computed as
        ``regret / rounds``.
    """
    # seed for homework
    homework_seed = 18475
    np.random.seed(seed=homework_seed)

    sim_output = simulation(policy, n=200000, seed=homework_seed)
    total_regret = sim_output['regret']
    return dict(
        regret=total_regret,
        mean_regret=total_regret / sim_output['rounds'],
        total_banners=sim_output['total_banners'],
    )

def pretty_print(data: Dict):
    """Print ``data`` to stdout as JSON indented by four spaces."""
    rendered = json.dumps(data, indent=4)
    print(rendered)

Узнаем метрики бейзлайна.

In [8]:
# Baseline regret: eps-greedy policy with the exploration rate fixed at 0.06.
eps_greedy_policy = partial(eps_greedy, eps=0.06)

# time.perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() is wall-clock time and can jump if the system clock is adjusted
# during the run. Only the difference `end - start` (seconds) is meaningful.
start = time.perf_counter()
baseline_result = eval_policy(eps_greedy_policy)
end = time.perf_counter()

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


In [9]:
# Report how long the baseline evaluation took (end - start, in seconds)
# followed by the metrics it produced.
print(f"Execution time: {end - start}")
print("Baseline metrics:")
pretty_print(baseline_result)

Execution time: 159.29981470108032
Baseline metrics:
{
    "regret": 1540.7609683932544,
    "mean_regret": 0.007703804841966272,
    "total_banners": 184
}


# Реализация своей policy

Будем использовать метод `UCB`, так как в нём exploration можно регулировать всего одной константой

In [6]:
class UCB:
    """Upper-confidence-bound selection policy with a tunable exploration weight.

    Each call scores every row of ``history`` as
    ``clicks / (impressions + 1) + C * sqrt(2 * ln(t) / (impressions + 1))``,
    where ``t`` is the number of times this instance has been called, and
    returns the index label of the highest-scoring row.
    """

    def __init__(self, C):
        # Call counter; used as the time step inside the logarithm.
        self._total_calls = 0
        # Multiplier applied to the exploration bonus.
        self._C = C

    def __call__(self, history: pd.DataFrame):
        """Return the index label of the row with the highest UCB score.

        Args:
            history: Table with ``clicks`` and ``impressions`` columns.
        """
        self._total_calls += 1

        # The +1 keeps both divisions defined when a row has zero impressions.
        shown = history['impressions'] + 1
        mean_reward = history['clicks'] / shown
        bonus = np.sqrt(2 * np.log(self._total_calls) / shown)
        scores = mean_reward + self._C * bonus

        return history.index[np.argmax(scores)]

# Оптимизация `exploration`/`exploitation`

Определимся с порядком коэффициента при `exploration`.

На лекции обсуждалось, что в случае mortal-бандита обычно требуется меньше `exploration`, чем в классической постановке; поэтому среди перебираемых значений преимущественно рассмотрим величины меньше `1` (стандартный коэффициент при `exploration` в `UCB`).

In [7]:
# Coarse sweep over the exploration coefficient C: evaluate a fresh UCB
# policy for each value and keep only its total regret, keyed by C.
search_result = {}
for C in tqdm([5, 1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001]):
    search_result[C] = eval_policy(UCB(C))['regret']

  0%|          | 0/8 [00:00<?, ?it/s]

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impr

In [10]:
# Print the coarse sweep as a JSON mapping from coefficient to regret.
print("UCB <exploration coeff>: <regret>")
pretty_print(search_result)

UCB <exploration coeff>: <regret>
{
    "5": 26394.284284349633,
    "1": 11185.91629936366,
    "0.5": 4013.0258661502003,
    "0.1": 258.3158253051061,
    "0.05": 2424.8962079893777,
    "0.01": 974.1013870333624,
    "0.005": 7570.250349143192,
    "0.001": 7570.250349143192
}


По результатам видно, что из рассмотренных значений лучший коэффициент при `exploration`-части — это `0.1`, а следующий за ним — `0.01`. Попробуем дополнительно поискать лучший коэффициент вокруг данных значений.

In [11]:
# Finer sweep over C from 0.02 to 0.12; 0.05 and 0.1 are omitted here
# because they are already part of the coarse sweep's value list.
search_result_fine_grained = {}
for C in tqdm([0.02, 0.03, 0.04, 0.06, 0.07, 0.08, 0.09, 0.11, 0.12]):
    search_result_fine_grained[C] = eval_policy(UCB(C))['regret']

  0%|          | 0/9 [00:00<?, ?it/s]

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impr

In [12]:
# Print the finer sweep as a JSON mapping from coefficient to regret.
print("UCB <exploration coeff>: <regret>")
pretty_print(search_result_fine_grained)

UCB <exploration coeff>: <regret>
{
    "0.02": 1395.674034831114,
    "0.03": 499.6321605018916,
    "0.04": 242.03395440174864,
    "0.06": 224.59694995443272,
    "0.07": 152.6182167021829,
    "0.08": 197.72686196531987,
    "0.09": 228.79558491386356,
    "0.11": 260.841092152149,
    "0.12": 324.9506848229653
}


Лучший коэффициент, который удалось найти: `0.07`.

Чтобы подытожить, оценим лучшую полиси ещё раз:

In [13]:
# Re-evaluate UCB with C = 0.07 to obtain the full metric dict returned by
# eval_policy (the sweeps above stored only the 'regret' entry).
best_ucb_policy = UCB(0.07)

best_ucb_result = eval_policy(best_ucb_policy)

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


In [14]:
# Print the metrics of the UCB(0.07) evaluation as indented JSON.
print("Best UCB metrics:")
pretty_print(best_ucb_result)

Best UCB metrics:
{
    "regret": 152.6182167021829,
    "mean_regret": 0.0007630910835109146,
    "total_banners": 184
}


Бейзлайн обошли, баланс `exploitation`/`exploration` пооптимизировали)