In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import time
from functools import partial

from scipy.stats import randint, uniform
from sim_lib import simulation

pd.options.mode.chained_assignment = None


In [3]:
def ucb(history: pd.DataFrame, a: float):
    """Choose a banner by an upper-confidence-bound score.

    The score of each row is ``clicks / (impressions + 1)`` plus an
    exploration bonus ``a * sqrt(2 * ln(sum(impressions + 1)) / (impressions + 1))``.
    Adding 1 to the impression counts keeps both terms finite for banners
    that have not been shown yet.

    Args:
        history: one row per banner with at least the columns
            ``'impressions'`` and ``'clicks'``; the index holds banner ids.
        a: multiplier applied to the exploration bonus (0 gives a purely
            greedy choice on the smoothed CTR).

    Returns:
        The index label of the row with the highest score (first one on ties).
    """
    pulls = history['impressions'] + 1
    mean_ctr = history['clicks'] / pulls
    bonus = a * np.sqrt(2 * np.log(np.sum(pulls)) / pulls)

    # idxmax returns the index *label* directly. np.argmax on a Series has
    # returned either a label or a position depending on the pandas version,
    # which is unsafe when the index is not 0..n-1.
    return (mean_ctr + bonus).idxmax()

In [4]:
# Sweep the UCB exploration multiplier; every run is re-seeded with the same
# value so the runs differ only in `a`.
for a in [0.05, 0.25, 1, 2]:
    print(f'UCB with exploration ratio {a}')
    policy_ucb = partial(ucb, a=a)

    # seed for homework
    seed = 18475
    np.random.seed(seed=seed)

    # perf_counter is a monotonic clock intended for measuring elapsed time;
    # time.time() follows the wall clock and can jump during a long run.
    start = time.perf_counter()
    output = simulation(policy_ucb, n=200000, seed=seed)
    end = time.perf_counter()
    print(f'Time: {end - start}')

    # ucb regret
    print(f"Regret {output['regret']}, avg_regret_per_round {output['regret']/output['rounds']}, total banners {output['total_banners']}")

    # ucb history
    print("History:")
    print(output['history'])
    print()

UCB with exploration ratio 0.05
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated
Time: 894.7389967441559
Regret 2393.889846978175, avg_regret_per_round 0.011969449234890874, total banners 184
History:
     impressions  clicks      lifetime         p  alpha   beta
153          4.0   

In [5]:
from scipy.stats import beta

def ts(history: pd.DataFrame):
    """Thompson sampling: draw one Beta sample per banner and take the best.

    Args:
        history: one row per banner with the Beta parameters in the columns
            ``'alpha'`` and ``'beta'``; the index holds banner ids.

    Returns:
        The index label of the row whose sampled value is the largest.
    """
    # One draw per row from Beta(alpha, beta); the result is a NumPy array,
    # so argmax below is positional.
    sampled_ctr = beta.rvs(history['alpha'], history['beta'])
    best_position = sampled_ctr.argmax()
    return history.index[best_position]

In [6]:
# The policy itself does not depend on the swept value, so bind it once.
policy_ts = ts

# Sweep the `ts_explore` argument forwarded to `simulation`; every run is
# re-seeded with the same value so the runs differ only in that argument.
# NOTE(review): presumably `ts_explore` scales how much is added to a banner's
# beta on a miss -- confirm against sim_lib.simulation.
for ts_explore in [0.25, 1, 2, 4, 8]:
    print(f'TS with exploration ratio {ts_explore}')
    # seed for homework
    seed = 18475
    np.random.seed(seed=seed)

    # perf_counter is a monotonic clock intended for measuring elapsed time;
    # time.time() follows the wall clock and can jump during a long run.
    start = time.perf_counter()
    output = simulation(policy_ts, n=200000, seed=seed, ts_explore=ts_explore)
    end = time.perf_counter()
    print(f'Time: {end - start}')

    # ts regret
    print(f"Regret {output['regret']}, avg_regret_per_round {output['regret']/output['rounds']}, total banners {output['total_banners']}")

    # ts history
    print("History:")
    print(output['history'])
    print()

TS with exploration ratio 0.25
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated
Time: 542.3793306350708
Regret 1455.5218973879773, avg_regret_per_round 0.007277609486939887, total banners 184
History:
     impressions  clicks      lifetime         p  alpha     beta
153      18835.0 

In [7]:
def eps_greedy(history: pd.DataFrame, eps: float):
    """Epsilon-greedy banner choice.

    With probability ``eps`` a banner is picked uniformly at random;
    otherwise the banner with the highest smoothed click-through rate
    ``clicks / (impressions + 10)`` is picked.

    Args:
        history: one row per banner with at least the columns
            ``'impressions'`` and ``'clicks'``; the index holds banner ids.
        eps: probability of exploring on this call.

    Returns:
        The index label of the chosen row.
    """
    if uniform.rvs() < eps:
        # randint's upper bound is exclusive, so this is a valid row position.
        return history.index[randint.rvs(0, history.shape[0])]

    # The +10 in the denominator damps the CTR of rarely shown banners and
    # avoids division by zero for banners with no impressions.
    smoothed_ctr = history['clicks'] / (history['impressions'] + 10)

    # idxmax returns the index *label* directly. np.argmax on a Series has
    # returned either a label or a position depending on the pandas version,
    # which is unsafe when the index is not 0..n-1.
    return smoothed_ctr.idxmax()


In [8]:
# Baseline run: epsilon-greedy with a single exploration probability.
for eps in [0.06]:
    print(f"e-greedy with exploration ratio {eps}")
    policy_egreedy = partial(eps_greedy, eps=eps)
    # seed for homework
    seed = 18475
    np.random.seed(seed=seed)

    # perf_counter is a monotonic clock intended for measuring elapsed time;
    # time.time() follows the wall clock and can jump during a long run.
    start = time.perf_counter()
    output = simulation(policy_egreedy, n=200000, seed=seed)
    end = time.perf_counter()
    print(f'Time: {end - start}')

    # e-greedy regret
    print(f"Regret {output['regret']}, avg_regret_per_round {output['regret']/output['rounds']}, total banners {output['total_banners']}")

    # e-greedy history
    print("History:")
    print(output['history'])
    print()

e-greedy with exploration ratio 0.06
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated
Time: 599.8631372451782
Regret 1540.7609683932544, avg_regret_per_round 0.007703804841966272, total banners 184
History:
     impressions  clicks      lifetime         p  alpha   beta
153      1897

Итог: реализованы две политики — UCB и TS. В UCB я добавил множитель a к исследовательской добавке; наилучший результат получился при a=0.25. Для TS я изменил simulation: в случае победы к alpha баннера добавляется 1, а в случае поражения к beta баннера добавляется ts_explore. Наилучшие значения ts_explore — 2 и 4, результаты при них близки. Итоговый наилучший regret у UCB и TS получился похожим, примерно 1050–1080, и это лучше бейзлайна (e-greedy), регрет которого равен примерно 1540.