# Multi-armed bandit homework

## Setup

In [2]:
import numpy as np
import pandas as pd

import time
from functools import partial
from scipy.stats import randint, uniform

from sim_lib import simulation

# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
# NOTE(review): presumably needed because sim_lib triggers the warning — confirm.
pd.options.mode.chained_assignment = None

In [3]:
# Fixed homework seed: reused for NumPy's global RNG and for every simulation call
seed = 18475
np.random.seed(seed)

## Baseline

__Epsilon-greedy policy__

In [4]:
def eps_greedy(history: pd.DataFrame, eps: float):
    """Pick a banner with the epsilon-greedy rule.

    A uniform draw decides the branch: if it falls below ``eps`` a banner is
    chosen uniformly at random from ``history.index`` (exploration);
    otherwise the banner with the highest smoothed click-through rate is
    returned (exploitation).

    history – per-banner log with "clicks" and "impressions" columns
    eps – probability of taking the random (exploration) branch
    """
    explore = uniform.rvs() < eps
    if not explore:
        # +10 in the denominator damps the CTR of banners with few impressions
        smoothed_ctr = history["clicks"] / (history["impressions"] + 10)
        best_pos = np.argmax(smoothed_ctr)
        return history.index[best_pos]

    # Exploration: positional draw from [0, number of banners)
    n_banners = history.shape[0]
    return history.index[randint.rvs(0, n_banners)]

In [5]:
# Baseline: epsilon-greedy with eps fixed at 0.06
policy = partial(eps_greedy, eps=0.06)
# NOTE(review): simulation comes from sim_lib; presumably n is the number of
# simulated impressions — confirm against sim_lib.
output = simulation(policy, n=200000, seed=seed)

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


In [15]:
# Render the per-banner history table stored in the simulation output
display(output["history"])

# Summary values read from the simulation output: regret, rounds, banner count
print(f"Regret score: {output['regret']} ({output['rounds']} rounds)")
print(f"Total banners: {output['total_banners']}")

Unnamed: 0,impressions,clicks,lifetime,p
153,18970.0,4199.0,18003.025431,0.220134
162,228.0,26.0,1537.166719,0.11378
172,163.0,32.0,19648.592394,0.219968
173,170.0,18.0,12771.47499,0.122694
180,26.0,3.0,4655.819793,0.020061
182,6.0,0.0,889.624649,0.004621
183,1.0,0.0,15187.163761,0.073886


Regret score: 1540.7609683932544 (200000 rounds)
Total banners: 184


## Custom solution

__Upper confidence bound (UCB)__

In [19]:
def UCB(history, C):
    """
    Choose the banner with the largest upper confidence bound.

    history – history log with per-banner "clicks" and "impressions" columns
    C – exploration part coefficient (0 means pure exploitation)

    Returns the label from history.index of the selected banner.
    """
    # +1 keeps the ratios finite for banners that have never been shown
    shows = history["impressions"] + 1
    total_shows = history["impressions"].sum()

    # Mean-reward estimate plus a bonus that shrinks as a banner is shown more
    mean_ctr = history["clicks"] / shows
    bonus = np.sqrt(2 * np.log(total_shows + 1) / shows)
    upper_bound = mean_ctr + C * bonus

    return history.index[np.argmax(upper_bound)]

In [None]:
# Sweep the UCB exploration coefficient across several orders of magnitude
exploration_coeffs = [1e-3, 1e-2, 1e-1, 1, 10]

# Running best: the lowest regret seen so far and the coefficient that produced it
best_coeff = None
best_regret = np.inf

for i, ec in enumerate(exploration_coeffs):
    print("=" * 107)
    print(f"Experiment {i+1}: C = {ec}")

    # Bind the coefficient into the policy and simulate with the shared seed
    policy = partial(UCB, C=ec)
    output = simulation(policy, n=200000, seed=seed)

    # Remember this coefficient if it beats every earlier run
    regret = output["regret"]
    if regret < best_regret:
        best_regret, best_coeff = regret, ec

    # Per-experiment report
    print(f"\nExperiment {i + 1} results:")
    display(output["history"])

    print(f"Regret score: {output['regret']} ({output['rounds']} rounds)")
    print(f"Total banners: {output['total_banners']}\n")

Experiment 1: C = 0.001
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated

Experiment 1 results:


Unnamed: 0,impressions,clicks,lifetime,p
153,3.0,0.0,18003.025431,0.220134
162,3.0,0.0,1537.166719,0.11378
172,11587.0,2536.0,19648.592394,0.219968
173,2.0,0.0,12771.47499,0.122694
180,0.0,0.0,4655.819793,0.020061
182,0.0,0.0,889.624649,0.004621
183,0.0,0.0,15187.163761,0.073886


Regret score: 7019.4392434711635 (200000 rounds)
Total banners: 184

Experiment 2: C = 0.01
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated

Experiment 2 results:


Unnamed: 0,impressions,clicks,lifetime,p
153,19858.0,4383.0,18003.025431,0.220134
162,0.0,0.0,1537.166719,0.11378
172,0.0,0.0,19648.592394,0.219968
173,0.0,0.0,12771.47499,0.122694
180,0.0,0.0,4655.819793,0.020061
182,0.0,0.0,889.624649,0.004621
183,0.0,0.0,15187.163761,0.073886


Regret score: 9261.075181316619 (200000 rounds)
Total banners: 184

Experiment 3: C = 0.1
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated

Experiment 3 results:


Unnamed: 0,impressions,clicks,lifetime,p
153,4.0,0.0,18003.025431,0.220134
162,14.0,1.0,1537.166719,0.11378
172,19837.0,4508.0,19648.592394,0.219968
173,3.0,0.0,12771.47499,0.122694
180,10.0,1.0,4655.819793,0.020061
182,3.0,0.0,889.624649,0.004621
183,3.0,0.0,15187.163761,0.073886


Regret score: 197.75165655667817 (200000 rounds)
Total banners: 184

Experiment 4: C = 1
1 impressions have been simulated
10001 impressions have been simulated


__Small conclusion__  
  
_Not all experiments finished running, but there are already good results._  
Best regret score with the UCB policy: `regret_score = 197.752`, achieved with `exploration_coefficient = 0.1`  
The baseline with `regret_score = 1540.761` is defeated!

**_Next policy: Thompson sampling (coming in a future release)_**