In [1]:
import matplotlib.pyplot as plt
import pandas as pd

### Данные
Данные - это реальный лог реальных рекламных событий ad network Propeller Ads, тот же, что и в первой домашней работе.

Датасет состоит из показов рекламы.

- date_time - время показа рекламы
- zone_id - id зоны, где зона - место на сайте для размещения рекламы
- banner_id - id баннера
- os_id - id операционной системы
- country_id - id страны


- oaid_hash - хэш юзера
- banner_id0 - нулевой баннер в “стакане” баннеров
- banner_id1 - перый баннер в “стакане” баннеров
- rate0 - стоимость 1 клика установленная рекламодателем для banner_id0
- rate1 - стоимость 1 клика установленная рекламодателем для banner_id1
- g0 - стандартное отклонение суммы коэффициентов с banner_id0
- g1 - стандартное отклонение суммы коэффициентов с banner_id1
- coeff_sum0 - сумма коэффициентов для banner_id0
- coeff_sum1 - сумма коэффициентов для banner_id1

- clicks - был ли клик

Для Домашней работы 4, колонка campaign_clicks не нужна.


In [38]:
df = pd.read_csv('./data/data.csv', parse_dates=["date_time"], date_parser=pd.to_datetime)
not_needed = ["campaign_clicks", "impressions"]
df.drop(labels=not_needed, axis=1, inplace=True)
df.head()

Unnamed: 0,date_time,zone_id,banner_id,oaid_hash,os_id,country_id,banner_id0,rate0,g0,coeff_sum0,banner_id1,rate1,g1,coeff_sum1,clicks
0,2021-09-27 00:01:30,0,0,5664530014561852622,0,0,1240,0.067,0.035016,-7.268846,0,0.01,0.049516,-5.369901,1
1,2021-09-26 22:54:49,1,1,5186611064559013950,0,1,1,0.002,0.054298,-2.657477,269,0.004,0.031942,-4.44922,1
2,2021-09-26 23:57:20,2,2,2215519569292448030,0,0,2,0.014,0.014096,-3.824875,21,0.014,0.014906,-3.939309,1
3,2021-09-27 00:04:30,3,3,6262169206735077204,1,1,3,0.012,0.015232,-3.461357,99,0.006,0.050671,-3.418403,1
4,2021-09-27 00:06:21,4,4,4778985830203613115,1,0,4,0.019,0.051265,-4.009026,11464230,6.79,0.032005,-2.828797,1


# Подготовим трейн и тест

In [39]:
import numpy as np
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)


def feature_engineering(df: pd.DataFrame) -> tuple[np.array, np.array, np.array, np.array, np.array, np.array, np.array]:
    # Delete
    index = df[(df.date_time.dt.day == 1) & (df.date_time.dt.month == 9)].index
    df.drop(index, inplace=True)

    # Negative std
    index = df[(df.g0 < 0) | (df.g1 < 0)].index
    df.drop(index, inplace=True)

    # Delete raws with banner_id != banner_id_0
    index = df[df.banner_id != df.banner_id0].index
    df.drop(index, inplace=True)
    df.drop(['banner_id0'], axis=1, inplace=True)

    # Date Time - leave only "day of the week" variable
    df["day"] = df["date_time"].dt.dayofweek

    # Drop NA
    df.dropna(inplace=True)

    # Train test split for logistic regression
    train_set = df[df["date_time"].dt.date < pd.to_datetime('2021-10-02')]
    test_set = df[df["date_time"].dt.date == pd.to_datetime('2021-10-02')]


    # Prepare features
    not_needed = ["clicks", "date_time", "banner_id1", "g0", "g1", "coeff_sum0",  "coeff_sum1"]
    x_train = train_set.drop(not_needed, axis=1, inplace=False)
    x_test = test_set.drop(not_needed, axis=1, inplace=False)

    # ONE HOT encoding
    from sklearn.preprocessing import OneHotEncoder
    enc = OneHotEncoder(handle_unknown='ignore', sparse=True, drop='first')
    x_train = enc.fit_transform(x_train)
    x_test = enc.transform(x_test)

    # Prepare labels
    y_train = train_set['clicks']
    y_test = test_set['clicks']


    # distribution
    dist = test_set[["g0", "g1", "coeff_sum0", "coeff_sum1"]]

    # Prepare test
    not_needed = ["clicks", "date_time", "g0", "g1", "coeff_sum0",  "coeff_sum1"]
    test_set.drop(not_needed, axis=1, inplace=True)

    # For p0
    x_test_0 = test_set.drop(["banner_id1"], axis=1, inplace=False)
    x_test_0 = enc.transform(x_test_0)

    # For p1
    test_set["banner_id"] = test_set["banner_id1"]
    x_test_1 = test_set.drop(["banner_id1"], axis=1)
    x_test_1 = enc.transform(x_test_1)

    return x_train, y_train, x_test, y_test, x_test_0, x_test_1, dist

In [40]:
x_train, y_train, x_test, y_test, x_test_0, x_test_1, dist = feature_engineering(df)

In [41]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape,  x_test_0.shape, x_test_1.shape, dist.shape

((12041809, 5029322),
 (12041809,),
 (1885668, 5029322),
 (1885668,),
 (1885668, 5029322),
 (1885668, 5029322),
 (1885668, 4))

# Модель и крос валидация

In [46]:
from sklearn.linear_model import LogisticRegression

def create_model():
    log_reg = LogisticRegression(solver='liblinear', random_state=42, C=0.001)
    return log_reg

In [48]:
%%time
model = create_model()
model.fit(x_train, y_train)

CPU times: user 1min 23s, sys: 1min 20s, total: 2min 43s
Wall time: 54.9 s


In [49]:
from sklearn.metrics import log_loss, roc_auc_score
import numpy as np


baseline = np.full(y_test.shape, np.mean(y_test))
results_roc = round(roc_auc_score(y_test, model.predict_proba(x_test)[:, 1]), 3)
results_loss = round(log_loss(y_test, model.predict_proba(x_test)), 3)
print(f'Best model : {results_roc}, {results_loss}')
print(f'Baseline: {round(roc_auc_score(y_test, baseline), 3)}, {round(log_loss(y_test, baseline), 3)}')

Best model : 0.784, 0.136
Baseline: 0.5, 0.155


In [51]:
from scipy.special import logit
from scipy.stats import norm


coeff_sum0_new = logit(model.predict_proba(x_test_0)[:, 1])
coeff_sum1_new = logit(model.predict_proba(x_test_1)[:, 1])


In [52]:
pi_0 = norm.cdf((dist['coeff_sum1'] - dist['coeff_sum0'])
                / np.sqrt(dist['g0'] ** 2 + dist['g1'] ** 2))

pi_1 = norm.cdf((coeff_sum1_new - coeff_sum0_new)
                / np.sqrt(dist['g0'] ** 2 + dist['g1'] ** 2))

In [74]:
cips = np.mean(y_test * np.clip(pi_1/(pi_0 + 1e-10), None, 10))
cips

0.13302891600917813