In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack

In [3]:
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,date_time,zone_id,banner_id,oaid_hash,campaign_clicks,os_id,country_id,banner_id0,rate0,g0,coeff_sum0,banner_id1,rate1,g1,coeff_sum1,impressions,clicks
0,2021-09-27 00:01:30.000000,0,0,5664530014561852622,0,0,0,1240,0.067,0.035016,-7.268846,0,0.01,0.049516,-5.369901,1,1
1,2021-09-26 22:54:49.000000,1,1,5186611064559013950,0,0,1,1,0.002,0.054298,-2.657477,269,0.004,0.031942,-4.44922,1,1
2,2021-09-26 23:57:20.000000,2,2,2215519569292448030,3,0,0,2,0.014,0.014096,-3.824875,21,0.014,0.014906,-3.939309,1,1
3,2021-09-27 00:04:30.000000,3,3,6262169206735077204,0,1,1,3,0.012,0.015232,-3.461357,99,0.006,0.050671,-3.418403,1,1
4,2021-09-27 00:06:21.000000,4,4,4778985830203613115,0,1,0,4,0.019,0.051265,-4.009026,11464230,6.79,0.032005,-2.828797,1,1


In [4]:
print('values coeff_sum0: \n', data['coeff_sum0'].value_counts())
print('min, max: ', data['coeff_sum0'].min(), data['coeff_sum0'].max())

print('values coeff_sum1: \n', data['coeff_sum1'].value_counts())
print('min, max: ', data['coeff_sum1'].min(), data['coeff_sum1'].max())

values coeff_sum0: 
  0.000000    313087
-3.862115      3476
-3.947828      3468
-4.191956      3313
-4.191673      3089
              ...  
-5.324318         1
-3.504508         1
-3.400683         1
-3.170966         1
-3.025758         1
Name: coeff_sum0, Length: 5262825, dtype: int64
min, max:  -8.58897042950208 0.314998119166813
values coeff_sum1: 
  0.000000    357723
-4.518763      3356
-3.886009      2931
-3.778777      2804
-3.390108      2768
              ...  
-4.799067         1
-5.950338         1
-4.111814         1
-4.028938         1
-3.063858         1
Name: coeff_sum1, Length: 5660517, dtype: int64
min, max:  -9.56218775991817 0.475618061292297


In [5]:
print('values g0: \n', data['g0'].value_counts())
print('min, max: ', data['g0'].min(), data['g0'].max())
print('rate non-zero values: ', (data['g0'] > 0).sum() / len(data['g0']))

print('values g1: \n', data['g1'].value_counts())
print('min, max: ', data['g0'].min(), data['g0'].max())
print('rate non-zero values: ', (data['g0'] > 0).sum() / len(data['g0']))

values g0: 
 0.000000    313087
0.013990        25
0.020647        18
0.022180        18
0.454084        18
             ...  
0.025541         1
0.041918         1
0.024818         1
0.027239         1
0.058530         1
Name: g0, Length: 15147522, dtype: int64
min, max:  -0.0176373981227117 691.088787242959
rate non-zero values:  0.9802065825480714
values g1: 
 0.000000    357723
0.052409        25
0.014947        18
3.313204        18
0.047231        18
             ...  
0.044497         1
0.028938         1
0.516622         1
0.016753         1
0.078144         1
Name: g1, Length: 15169168, dtype: int64
min, max:  -0.0176373981227117 691.088787242959
rate non-zero values:  0.9802065825480714


In [3]:
print('Is nan in data: ', data.isnull().values.any())

Is nan in data:  True


* impressions традиционно выкидываем
* в g0, g1 большая часть данных что-то полезное (не ноль), но есть отрицательные значения. Оставим признаки, но отбросим строчки с отрицательным значением (как будто их быть не должно)
* date_time тоже брать не будем: по опыту предыдущих домашек этот признак не сильно помогает, но размер датасета увеличивает
* удалим строчки с nan

In [4]:
def feature_engineering(data_: pd.DataFrame):
    data = data_.copy()

    # drop features
    data = data.drop(["impressions", "campaign_clicks"], axis=1)
    data = data.dropna()

    data['date_time'] = pd.to_datetime(data['date_time'])
    data = data[data['banner_id'] == data['banner_id0']]
    data = data[data['g0'] >= 0]
    data = data[data['g1'] >= 0]

    train = data[data['date_time'].dt.date <= pd.to_datetime('2021-10-01')]
    test = data[data['date_time'].dt.date == pd.to_datetime('2021-10-02')]

    features = ["g0", "g1", "rate0", "rate1", "banner_id0", "banner_id1", "coeff_sum0", "coeff_sum1", "os_id", "zone_id", "banner_id", "country_id", "oaid_hash"]

    X_train = train[features].drop(["banner_id0", "banner_id1"], axis=1)

    X_test = test[features]
    X_test_banner1 = X_test.copy()
    X_test_banner1['banner_id'] = X_test_banner1['banner_id1']
    X_test_banner1 = X_test_banner1.drop(["banner_id0", "banner_id1"], axis=1)

    X_test_banner0 = X_test.drop(["banner_id0", "banner_id1"], axis=1)

    y_train, y_test = train['clicks'], test['clicks']

    enc = OneHotEncoder(handle_unknown='ignore')
    cat = ["os_id", "zone_id", "banner_id", "country_id", "oaid_hash"]
    num = ["g0", "g1", "rate0", "rate1", "coeff_sum0", "coeff_sum1"]
    enc = enc.fit(data[cat])
    X_train = enc.transform(X_train[cat])
    X_test_banner1 = enc.transform(X_test_banner1[cat])
    X_test_banner0 = enc.transform(X_test_banner0[cat])

    return X_train, X_test_banner0, X_test_banner1, X_test[num], y_train, y_test

X_train, X_test_banner0, X_test_banner1, X_test_numeric, y_train, y_test = feature_engineering(data)

  result = libops.scalar_compare(x.ravel(), y, op)


In [7]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='liblinear', random_state=42, C=1)
model.fit(X_train, y_train)

LogisticRegression(C=1, random_state=42, solver='liblinear')

In [8]:
from sklearn.metrics import log_loss, roc_auc_score

baseline = np.full(y_test.shape, np.mean(y_test))
results_roc = round(roc_auc_score(y_test, model.predict_proba(X_test_banner0)[:, 1]), 3)
results_loss = round(log_loss(y_test, model.predict_proba(X_test_banner0)), 3)
print(f'Linear model: roc {results_roc}, loss {results_loss}')

Linear model: roc 0.816, loss 0.126


In [9]:
from scipy.special import logit

X_test_numeric['pred_coeff_sum0'] = logit(model.predict_proba(X_test_banner0)[:, 1])
X_test_numeric['pred_coeff_sum1'] = logit(model.predict_proba(X_test_banner1)[:, 1])

Вероятность того, что одна нормальная величина больше другой: $P(X > Y) = P(Y - X < 0) = Ф(\frac{-\mu_{Y-X}}{\sigma_{Y-X}})$

Разность двух нормальных величин будет распределена как: $N(\mu_Y - \mu_X, \sigma_X^2 + \sigma_Y^2)$

Поэтому подставляем: $\mu_{Y-X} = \mu_Y - \mu_X$, $\sigma_{Y-X} = \sqrt{\sigma_X^2 + \sigma_Y^2}$

In [10]:
from scipy.stats import norm

def calc_pi(coeff_sum0, coeff_sum1, g0, g1):
    return norm.cdf( (coeff_sum0 - coeff_sum1) / (np.sqrt(g0 ** 2 + g1 ** 2) + 1e-6) )

X_test_numeric['pi_0'] = X_test_numeric.apply(lambda x: calc_pi(x.coeff_sum0, x.coeff_sum1, x.g0, x.g1), axis=1)
X_test_numeric['pi_1'] = X_test_numeric.apply(lambda x: calc_pi(x.pred_coeff_sum0, x.pred_coeff_sum1, x.g0, x.g1), axis=1)

In [11]:
print('cips: ', np.sum(y_test * np.clip(X_test_numeric['pi_1'] / X_test_numeric['pi_0'], a_min=None, a_max=10)) / len(y_test))

cips:  0.07543042300848891
