In [51]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import norm
from scipy.special import logit

In [52]:
# Read the full CSV, then keep a reproducible 1% random sample of its rows.
data = (
    pd.read_csv('../data/data.csv')
    .sample(frac=0.01, random_state=1)
)

In [53]:
# Columns kept for the analysis: the timestamp, the four categorical features,
# the click label, and the 0/1-suffixed banner_id / g / coeff_sum columns.
columns = [
    "date_time", "zone_id", "banner_id", "os_id", "country_id", "clicks",
    "banner_id0", "banner_id1", "g0", "g1", "coeff_sum0", "coeff_sum1",
]
data = data[columns]

In [54]:
# Convert date_time to pandas datetimes so the .dt accessor can be used on it.
data = data.assign(date_time=pd.to_datetime(data['date_time']))

In [55]:
# Required by the task statement: keep rows where banner_id equals banner_id0.
same_banner = data['banner_id'] == data['banner_id0']
# Keep only sane values: both g0 and g1 must exceed 1e-6.
g_is_sane = (data['g0'] > 1e-6) & (data['g1'] > 1e-6)
data = data[same_banner & g_is_sane]

Выделим в valid_raw последний день, в train_raw — все дни, кроме последнего.
train_categ — категориальные признаки из train_raw, valid_categ — категориальные признаки из valid_raw.

In [56]:
# Calendar date of every row; the latest date present is held out for validation.
event_day = data['date_time'].dt.date
last_day = event_day.max()

train_raw = data[event_day != last_day]
valid_raw = data[event_day == last_day]

# Categorical feature columns, taken from both splits.
categ_columns = ["zone_id", "banner_id", "os_id", "country_id"]
train_categ = train_raw[categ_columns]
valid_categ = valid_raw[categ_columns]

In [57]:
# Same rows as valid_categ, with banner_id replaced by the values of banner_id1.
valid_2_categ = valid_categ.assign(banner_id=valid_raw['banner_id1'])

Закодируем категории с помощью One-Hot-Encoding

In [58]:
# Build the encoder, then fit it on the training categories only.
# Categories not seen during fit are ignored at transform time, and each
# input feature is limited to at most 10**3 output categories.
ohe = OneHotEncoder(handle_unknown='ignore', max_categories=10**3)
ohe.fit(train_categ)

In [59]:
# Encode all three categorical frames with the same fitted encoder.
train_categ_ohe_matrix, valid_categ_ohe_matrix, valid_2_categ_ohe_matrix = (
    ohe.transform(frame)
    for frame in (train_categ, valid_categ, valid_2_categ)
)

In [60]:
# Synthetic column names f_0, f_1, ... — one per encoded feature column.
n_encoded = train_categ_ohe_matrix.shape[1]
col_names = [f'f_{i}' for i in range(n_encoded)]

# Wrap each encoded sparse matrix in a sparse DataFrame sharing those column names.
train_c_ohe, valid_c_ohe, valid_2_c_ohe = (
    pd.DataFrame.sparse.from_spmatrix(matrix, columns=col_names)
    for matrix in (
        train_categ_ohe_matrix,
        valid_categ_ohe_matrix,
        valid_2_categ_ohe_matrix,
    )
)

In [61]:
# Features are the one-hot frames; the target is the click label of each split.
x_train, y_train = train_c_ohe, train_raw['clicks']
x_valid, y_valid = valid_c_ohe, valid_raw['clicks']

# The second validation set reuses the click labels of valid_raw.
x_valid_2, y_valid_2 = valid_2_c_ohe, valid_raw['clicks']

Возьмем логистическую регрессию с солвером liblinear (не SGD).

In [62]:
def get_model(c):
    """Build an unfitted liblinear logistic regression.

    Parameters
    ----------
    c : float
        Value passed as ``C`` to ``LogisticRegression``.

    Returns
    -------
    LogisticRegression
        A new estimator with ``solver='liblinear'`` and ``random_state=0``.
    """
    params = {'solver': 'liblinear', 'C': c, 'random_state': 0}
    return LogisticRegression(**params)

Проверим модель на valid части

In [63]:
# Fit the model with C=1 on the training split.
model = get_model(1)
model.fit(x_train, y_train)

# Hard class labels for the validation split (not used by the metrics below).
y_pred = model.predict(x_valid)

# Score the validation split once and reuse the probabilities for both metrics,
# instead of calling predict_proba twice on the same input.
valid_proba = model.predict_proba(x_valid)
auc_model = roc_auc_score(y_valid, valid_proba[:, 1])  # AUC uses the second probability column
ll_model = log_loss(y_valid, valid_proba)              # log-loss uses the full probability matrix
print(f'auc = {auc_model}')
print(f'log_loss = {ll_model}')

auc = 0.7938738169726371
log_loss = 0.13456146021263932


In [64]:
def get_pi(m0, g0, m1, g1):
    """Return P(X > 0) for X ~ Normal(m0 - m1, sqrt(g0**2 + g1**2)).

    Computed as the normal survival function evaluated at 0. Inputs may be
    scalars or array-likes of matching length; they are combined elementwise.

    NOTE(review): presumably m* are per-banner score means and g* their
    standard deviations, so the result would be the probability that
    option 0 outscores option 1 — confirm against the data description.

    Side effect: prints the computed value before returning it.
    """
    mean_gap = m0 - m1
    gap_std = (g0 ** 2 + g1 ** 2) ** 0.5
    ans = norm.sf(0, loc=mean_gap, scale=gap_std)
    print(f'ans = {ans}')
    return ans

In [65]:
# Evaluate get_pi on the coeff_sum / g columns already present in valid_raw.
pi_0 = get_pi(
    m0=valid_raw['coeff_sum0'],
    g0=valid_raw['g0'],
    m1=valid_raw['coeff_sum1'],
    g1=valid_raw['g1'],
)

ans = [0.99531359 0.99995273 0.50358735 ... 0.34792342 0.21687514 0.31317917]


In [66]:
# decision_function returns the model's raw linear score, which for this binary
# liblinear logistic regression is the log-odds of the positive class. Reading it
# directly avoids the sigmoid -> logit round trip through predict_proba, which
# loses precision and yields +/-inf once a probability saturates to 0 or 1.
coeff_sum0_new = model.decision_function(x_valid)
coeff_sum1_new = model.decision_function(x_valid_2)

# Same computation as for pi_0, with the model's scores in place of the
# coeff_sum columns; g0 / g1 are still taken from valid_raw.
pi_1 = get_pi(coeff_sum0_new, valid_raw['g0'], coeff_sum1_new, valid_raw['g1'])

ans = [4.25125685e-01 1.00000000e+00 1.00000000e+00 ... 6.18218558e-40
 3.60026346e-14 8.63698523e-03]


In [67]:
# Clipped importance-weighted click estimate: the ratio pi_1 / pi_0 uses a
# denominator floored at 1e-10 and is capped at 10, then multiplied by the
# validation clicks and averaged.
propensity_floor = 1e-10
weight_cap = 10
clipped_weight = np.minimum(pi_1 / np.maximum(pi_0, propensity_floor), weight_cap)
cips = np.mean(y_valid * clipped_weight)
print(cips)

0.056865539852275174
