In [1]:
import numpy as np
import pandas as pd
import time
from functools import partial

from scipy.stats import randint, uniform
from sim_lib import simulation

pd.options.mode.chained_assignment = None


In [2]:
def eps_greedy(history: pd.DataFrame, eps: float, smoothing: float = 10):
    """Epsilon-greedy selection over the rows of ``history``.

    With probability ``eps`` a uniformly random row is chosen; otherwise the
    row with the highest smoothed click-through rate
    ``clicks / (impressions + smoothing)`` wins.

    Args:
        history: frame with ``clicks`` and ``impressions`` columns.
        eps: probability of taking the random (exploration) branch.
        smoothing: constant added to the impression count in the denominator.
            Defaults to 10, the value that was previously hard-coded.

    Returns:
        The index label of the selected row.
    """
    if uniform.rvs() < eps:
        # Exploration: any row, uniformly at random (upper bound is exclusive).
        n = history.shape[0]
        return history.index[randint.rvs(0, n)]

    # Exploitation: the additive term damps the rate of rarely shown rows.
    ctr = history['clicks'] / (history['impressions'] + smoothing)
    n = np.argmax(ctr)
    return history.index[n]


# Baseline policy used by the first simulation run: random branch with probability 0.06.
policy = partial(eps_greedy, eps=0.06)

In [3]:
# Seed fixed by the homework assignment; every simulation below reuses it.
seed = 18475
np.random.seed(seed=seed)

# perf_counter is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments (unlike time.time()).
start = time.perf_counter()
output = simulation(policy, n=200000, seed=seed)
end = time.perf_counter()
end - start

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


251.7458233833313

In [4]:
# Baseline (eps-greedy) summary: regret, regret per round, total_banners.
(
    output['regret'],
    output['regret'] / output['rounds'],
    output['total_banners'],
)

(1540.7609683932544, 0.007703804841966272, 184)

In [5]:
# Per-row impressions/clicks table returned by the baseline run.
output["history"]

Unnamed: 0,impressions,clicks,lifetime,p
153,18970.0,4199.0,18003.025431,0.220134
162,228.0,26.0,1537.166719,0.11378
172,163.0,32.0,19648.592394,0.219968
173,170.0,18.0,12771.47499,0.122694
180,26.0,3.0,4655.819793,0.020061
182,6.0,0.0,889.624649,0.004621
183,1.0,0.0,15187.163761,0.073886


# Upper confidence bound

In [8]:
class UpperConfidenceBound:
    """Upper-confidence-bound selection policy.

    Each row of the history is scored by its smoothed click rate plus a bonus
    that grows with the number of calls made so far and shrinks with the
    row's impression count. The index label of the top-scoring row is returned.
    """

    def __init__(self, ee_tradeoff=0.1):
        """Store the bonus multiplier and reset the call counter."""
        # Weight of the confidence bonus relative to the click-rate estimate.
        self.ee_tradeoff = ee_tradeoff
        # Number of times the policy has been called.
        self.t = 0

    def __call__(self, history: pd.DataFrame):
        """Return the index label of the row with the highest UCB score.

        Args:
            history: frame with ``clicks`` and ``impressions`` columns.
        """
        clicks = history['clicks']
        # +1 keeps the denominator non-zero for rows with zero impressions.
        pulls = history['impressions'] + 1
        click_rate = clicks / pulls

        self.t += 1
        # On the first call log(1) == 0, so the bonus vanishes.
        bonus = np.sqrt(2 * np.log(self.t) / pulls)
        scores = click_rate + self.ee_tradeoff * bonus

        best_position = np.argmax(scores)
        return history.index[best_position]

Дополнительно добавил в **simulation** параметр **verbose** — для удобства и более чистого вывода.

In [9]:
%%time

# Re-seed the global NumPy RNG, as the grid-search cells do, so this run does
# not depend on which cells were executed before it.
np.random.seed(seed=seed)

# UCB with the default ee_tradeoff, same horizon as the eps-greedy baseline.
output_ucb = simulation(UpperConfidenceBound(), n=200000, seed=seed, verbose=False)
print(output_ucb['regret'], output_ucb['regret']/output_ucb['rounds'],  output_ucb['total_banners'])

217.60194678688532 0.0010880097339344267 184
CPU times: total: 4min 54s
Wall time: 4min 54s


Regret (≈ 218) существенно меньше, чем у ε-greedy (≈ 1541).

Теперь подберём параметр exploration–exploitation trade-off (`ee_tradeoff`).

Используем grid search, постепенно уменьшая шаг сетки (для скорости — на 50 000 показов вместо 200 000):

## Big grid

In [11]:
%%time

# Coarse sweep over ee_tradeoff.
# Each row of `result`: (ee_tradeoff, regret, regret per round, total_banners).
result = []

for p in [0.01, 0.05, 0.25, 0.5, 0.75, 1.0]:
    # Reset the global NumPy RNG before every run.
    np.random.seed(seed=seed)
    # Dedicated name: reusing `output` would overwrite the eps-greedy baseline
    # result that the earlier summary cells display.
    grid_output = simulation(UpperConfidenceBound(ee_tradeoff=p), n=50000, seed=seed, verbose=False)
    result.append((
        p,
        grid_output['regret'],
        grid_output['regret'] / grid_output['rounds'],
        grid_output['total_banners']
    ))
    print(f"EE_tradeoff = {p} done. Regret = {grid_output['regret']}")

EE_tradeoff = 0.01 done. Regret = 941.0714982541851
EE_tradeoff = 0.05 done. Regret = 35.716164414616635
EE_tradeoff = 0.25 done. Regret = 333.889095409225
EE_tradeoff = 0.5 done. Regret = 917.8386885157926
EE_tradeoff = 0.75 done. Regret = 1617.3568859684008
EE_tradeoff = 1.0 done. Regret = 2393.717146557163
CPU times: total: 7min 22s
Wall time: 7min 23s


## Grid поменьше

In [12]:
%%time

# Medium sweep around the best coarse values; rows are appended to `result`.
grid = [0.03, 0.07, 0.11, 0.16, 0.2]
# Drop rows left by an earlier execution of this cell so that re-running it
# does not duplicate entries in `result`.
result = [row for row in result if row[0] not in grid]

for p in grid:
    # Reset the global NumPy RNG before every run.
    np.random.seed(seed=seed)
    # Dedicated name: reusing `output` would overwrite the eps-greedy baseline
    # result that the earlier summary cells display.
    grid_output = simulation(UpperConfidenceBound(ee_tradeoff=p), n=50000, seed=seed, verbose=False)
    result.append((
        p,
        grid_output['regret'],
        grid_output['regret'] / grid_output['rounds'],
        grid_output['total_banners']
    ))
    print(f"EE_tradeoff = {p} done. Regret = {grid_output['regret']}")

EE_tradeoff = 0.03 done. Regret = 429.1319619278395
EE_tradeoff = 0.07 done. Regret = 76.69553168349854
EE_tradeoff = 0.11 done. Regret = 80.02693105532725
EE_tradeoff = 0.16 done. Regret = 159.79794532913036
EE_tradeoff = 0.2 done. Regret = 225.69549883550025
CPU times: total: 6min 23s
Wall time: 6min 26s


## Grid совсем маленький, жесть

In [13]:
%%time

# Fine sweep around the current best values; rows are appended to `result`.
grid = [0.045, 0.055, 0.060, 0.065, 0.075, 0.08]
# Drop rows left by an earlier execution of this cell so that re-running it
# does not duplicate entries in `result`.
result = [row for row in result if row[0] not in grid]

for p in grid:
    # Reset the global NumPy RNG before every run.
    np.random.seed(seed=seed)
    # Dedicated name: reusing `output` would overwrite the eps-greedy baseline
    # result that the earlier summary cells display.
    grid_output = simulation(UpperConfidenceBound(ee_tradeoff=p), n=50000, seed=seed, verbose=False)
    result.append((
        p,
        grid_output['regret'],
        grid_output['regret'] / grid_output['rounds'],
        grid_output['total_banners']
    ))
    print(f"EE_tradeoff = {p} done. Regret = {grid_output['regret']}")

EE_tradeoff = 0.045 done. Regret = 69.87816523407503
EE_tradeoff = 0.055 done. Regret = 173.8650584237794
EE_tradeoff = 0.06 done. Regret = 165.4904655668379
EE_tradeoff = 0.065 done. Regret = 48.115513622587045
EE_tradeoff = 0.075 done. Regret = 82.21118781700768
EE_tradeoff = 0.08 done. Regret = 89.829143452295
CPU times: total: 7min 26s
Wall time: 7min 26s


# Результаты

In [16]:
# Tabulate every grid-search run, lowest regret first.
result_df = (
    pd.DataFrame(result, columns=["p", "regret", "regret_m", "tb"])
    .sort_values(by="regret")
)
# Save the full table to disk, then show the seven best rows.
result_df.to_csv("gs_results.csv")
result_df.head(7)

Unnamed: 0,p,regret,regret_m,tb
1,0.05,35.716164,0.000714,52
14,0.065,48.115514,0.000962,52
11,0.045,69.878165,0.001398,52
7,0.07,76.695532,0.001534,52
8,0.11,80.026931,0.001601,52
15,0.075,82.211188,0.001644,52
16,0.08,89.829143,0.001797,52


In [17]:
# Best ee_tradeoff found by the grid search (top row of the results table).
best_p = 0.05

# Re-seed the global NumPy RNG, as the grid-search cells do, so this run does
# not depend on which cells were executed before it.
np.random.seed(seed=seed)

# Full-horizon run with the selected parameter.
output_best = simulation(UpperConfidenceBound(ee_tradeoff=best_p), n=200000, seed=seed, verbose=False)
print(output_best['regret'], output_best['regret']/output_best['rounds'],  output_best['total_banners'])

265.116843147788 0.00132558421573894 184


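Итого: у UpperConfidenceBound при ee_tradeoff = 0.05 на 200 000 показов regret ≈ 265.  
Это намного лучше, чем у ε-greedy (≈ 1541), но хуже, чем у UCB с параметром по умолчанию 0.1 (≈ 218): значение, подобранное на 50 000 показов, не оказалось лучшим на полном прогоне.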