# Imports

In [1]:
from pathlib import Path
from tqdm.notebook import tqdm 

from sklearn.metrics import log_loss, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import OneHotEncoder

from category_encoders import TargetEncoder
from scipy.sparse import hstack
from scipy.special import logit

import scipy.stats as ss
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Global random seed; passed as `random_state` to the model built in create_model().
SEED = 0

# Data

Считываем данные, дропаем запрещенные столбцы. 

In [3]:
%%time

data_raw = pd.read_csv('../data.csv').drop([
    "oaid_hash",
    "campaign_clicks"
], axis=1)
data_raw = data_raw.dropna() 
data_raw = data_raw.sort_values("date_time").reset_index().drop("index", axis=1)

data = data_raw.copy()
data.head()

CPU times: total: 57.7 s
Wall time: 57.8 s


Unnamed: 0,date_time,zone_id,banner_id,os_id,country_id,banner_id0,rate0,g0,coeff_sum0,banner_id1,rate1,g1,coeff_sum1,impressions,clicks
0,2021-09-01 00:02:49.000000,30,596,0,7,596,0.05,0.06456,-4.312062,603,0.05,0.05651,-4.370191,1,0
1,2021-09-26 00:00:00.000000,243,21,0,0,21,0.014,0.040261,-3.638218,2,0.014,0.044512,-3.66321,1,0
2,2021-09-26 00:00:00.000000,0,7,4,1,7,0.01,0.016867,-4.164291,89,0.01,0.063781,-4.252917,1,0
3,2021-09-26 00:00:00.000000,47,73,4,13,73,0.008,0.120974,-2.382508,1040,0.008,0.157515,-3.037939,1,0
4,2021-09-26 00:00:00.000000,11,28,3,5,28,0.003,0.011322,-2.60517,18,0.004,0.050958,-2.946735,1,0


# Preprocessing

Использую тот же препроцессинг, что и в первой домашке

In [5]:
def preprocessing(data: pd.DataFrame):
    """Turn the raw impression log into train/test matrices for the click model.

    The input frame is not modified. Rows dated 2021-10-02 form the test set
    and all other rows the train set; in both, only rows whose shown banner
    (``banner_id``) equals ``banner_id0`` are kept. Every fitted statistic
    (top-k sets, one-hot vocabulary, target encodings) is computed on train
    rows only, to avoid leaking test information.

    Parameters
    ----------
    data : pd.DataFrame
        Raw log. Must contain at least ``date_time``, ``zone_id``,
        ``banner_id``, ``os_id``, ``country_id``, ``banner_id0``,
        ``banner_id1``, ``g0``, ``g1``, ``coeff_sum0``, ``coeff_sum1``,
        ``impressions`` and ``clicks``.

    Returns
    -------
    X_train : scipy.sparse matrix
        One-hot block followed by the remaining (numeric / target-encoded)
        columns for the train rows.
    X_test_id0 : scipy.sparse matrix
        Test features with ``banner_id0`` in the one-hot banner slot.
    X_test_id1 : scipy.sparse matrix
        Test features with ``banner_id1`` in the one-hot banner slot; the
        dense part is identical to ``X_test_id0``.
    y_train, y_test : np.ndarray
        ``clicks`` for the train / test rows.
    weights : dict
        ``{class_label: number_of_train_rows}``.
    data_test : pd.DataFrame
        Test rows after target encoding (still holds ``g0``, ``g1``,
        ``coeff_sum0``, ``coeff_sum1``).
    """
    # `impressions` is dropped as uninformative. drop() returns a new frame,
    # so the column assignments below never touch the caller's DataFrame.
    data = data.drop(columns=["impressions"])

    # Parse timestamps into a proper datetime dtype.
    data["date_time"] = pd.to_datetime(data["date_time"])

    # Calendar features. The *_t duplicates are target-encoded further down;
    # the plain `hour` / `day_of_week` columns are one-hot encoded (and also
    # stay in the dense block as raw numbers).
    data["month"] = data["date_time"].dt.month
    data["day_of_week"] = data["date_time"].dt.dayofweek
    data["day_of_week_t"] = data["date_time"].dt.dayofweek
    data["is_weekend"] = data["day_of_week"] >= 5
    data["hour"] = data["date_time"].dt.hour
    data["hour_t"] = data["date_time"].dt.hour

    # Rows dated `last_day` are held out as the test set. normalize() keeps
    # the comparison vectorised instead of building one Python date per row.
    last_day = datetime.date(2021, 10, 2)
    is_last_day_mask = data["date_time"].dt.normalize() == pd.Timestamp(last_day)

    # "Is among the k most frequent zones / banners" flags. Frequencies are
    # counted on train rows only so that nothing leaks from the test day.
    for column, suffix in (("zone_id", "zone"), ("banner_id", "banner")):
        train_counts = data.loc[~is_last_day_mask, column].value_counts()
        for k in (3, 5, 20):
            data[f"is_top{k}_{suffix}"] = data[column].isin(train_counts.index[:k])

    # Train / test split, restricted to rows where the shown banner is banner_id0.
    shown_is_banner0 = data["banner_id"] == data["banner_id0"]
    data_train = data[~is_last_day_mask & shown_is_banner0]
    # Test rows need both banners' statistics for the propensity computation.
    # (The original listed 'coeff_sum0' twice and never checked 'coeff_sum1'.)
    data_test = data[is_last_day_mask & shown_is_banner0].dropna(
        subset=["g0", "g1", "coeff_sum0", "coeff_sum1"]
    )

    # One-hot encoding of the categorical features, fitted on train only.
    # The output is sparse by default; the explicit `sparse=` keyword is not
    # passed because it was renamed to `sparse_output` in newer scikit-learn.
    cat_features = ["zone_id", "month", "os_id", "country_id", "hour", "day_of_week", "banner_id"]
    ohe_encoder = OneHotEncoder(handle_unknown="ignore", drop="first")
    X_train_cat = ohe_encoder.fit_transform(data_train[cat_features])

    # Same test rows encoded twice: once with banner_id0 and once with
    # banner_id1 substituted into the `banner_id` slot.
    X_test_cat_id0 = ohe_encoder.transform(
        data_test[cat_features].assign(banner_id=data_test["banner_id0"])
    )
    X_test_cat_id1 = ohe_encoder.transform(
        data_test[cat_features].assign(banner_id=data_test["banner_id1"])
    )

    # Target encoding, fitted on train only and applied to both splits.
    target_encoder = TargetEncoder(cols=["hour_t", "day_of_week_t", "month", "os_id", "country_id"])
    target_encoder.fit(data_train, data_train["clicks"])
    data_train = target_encoder.transform(data_train)
    data_test = target_encoder.transform(data_test)

    y_train, y_test = data_train["clicks"], data_test["clicks"]

    # Dense block: everything except the target, the timestamp and the ids
    # that are represented only through the one-hot block.
    # NOTE(review): banner_id0/1, rate*, g* and coeff_sum* remain here as
    # numeric features, so the dense part is the same for X_test_id0 and
    # X_test_id1 - confirm this is intended for the counterfactual banner.
    non_feature_columns = ["clicks", "date_time", "zone_id", "banner_id"]
    dense_train = data_train.drop(columns=non_feature_columns).to_numpy(dtype=float)
    dense_test = data_test.drop(columns=non_feature_columns).to_numpy(dtype=float)

    X_train = hstack((X_train_cat.astype(float), dense_train))
    X_test_id0 = hstack((X_test_cat_id0.astype(float), dense_test))
    X_test_id1 = hstack((X_test_cat_id1.astype(float), dense_test))

    # NOTE(review): these are raw class counts, so when passed as
    # `class_weight` the more frequent class gets the larger weight - verify
    # this is intended rather than inverse-frequency weighting.
    weights = dict(y_train.value_counts())
    return X_train, X_test_id0, X_test_id1, y_train.values, y_test.values, weights, data_test

# Create model

In [6]:
def create_model(**kwargs):
    """Build an unfitted logistic regression with the notebook-wide defaults.

    The solver is fixed to ``liblinear`` and ``random_state`` to ``SEED``;
    every keyword argument is forwarded to ``LogisticRegression`` unchanged.
    """
    model = LogisticRegression(solver='liblinear', random_state=SEED, **kwargs)
    return model

# Train

In [7]:
%%time

data = data_raw.copy()# .head(1000).copy()

X_train, X_test_id0, X_test_id1, y_train, y_test, weights, data_test = preprocessing(data)

print(X_train.shape)
print(X_test_id0.shape)
print(X_test_id1.shape)
print(y_train.shape)
print(y_test.shape)



(12041815, 4813)
(1885670, 4813)
(1885670, 4813)
(12041815,)
(1885670,)
CPU times: total: 2min 19s
Wall time: 2min 20s


In [8]:
%%time

logreg = create_model(class_weight=weights, C=0.01, penalty="l2")
logreg.fit(X_train, y_train)

CPU times: total: 1min 55s
Wall time: 1min 19s


In [9]:
# Predicted click probability (positive-class column) for the same test rows
# with banner_id0 and with banner_id1 in the banner slot, respectively.
p0, p1 = (
    logreg.predict_proba(features)[:, 1]
    for features in (X_test_id0, X_test_id1)
)

In [10]:
# ROC AUC of the banner_id0 predictions against the observed clicks.
rocauc = roc_auc_score(y_test, p0)
print(f'rocauc: {rocauc}')

rocauc: 0.6462675354627002


# CIPS

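Находим вероятность того, что одна нормальная случайная величина больше другой.  
Считаем скоры баннеров независимыми: $s_i \sim \mathcal{N}(\mu_i, g_i^2)$. Тогда $s_0 - s_1 \sim \mathcal{N}(\mu_0 - \mu_1,\ g_0^2 + g_1^2)$ и

$$P(s_0 > s_1) = \Phi\left(\frac{\mu_0 - \mu_1}{\sqrt{g_0^2 + g_1^2}}\right).$$

Для старой политики ($\pi_0$) берём $\mu_i$ = `coeff_sum{i}`, для новой ($\pi_1$) — $\mu_i = \mathrm{logit}(p_i)$.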

In [12]:
# Denominator shared by both policies: sqrt(g0^2 + g1^2), i.e. g0 / g1 are
# treated as the standard deviations of the two banners' scores.
score_diff_std = np.sqrt(data_test['g0'] ** 2 + data_test['g1'] ** 2)

# pi0: normal CDF of the standardised gap between the logged scores.
logged_score_gap = data_test['coeff_sum0'] - data_test['coeff_sum1']
pi0 = ss.norm.cdf(logged_score_gap / score_diff_std)

# pi1: the same quantity with the model's logits used as the scores.
model_score_gap = logit(p0) - logit(p1)
pi1 = ss.norm.cdf(model_score_gap / score_diff_std)

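Наконец, находим CIPS: $\mathrm{CIPS} = \frac{1}{n}\sum_i \min\left(\frac{\pi_1^{(i)}}{\pi_0^{(i)}},\ \lambda\right) \cdot \mathrm{click}_i$.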

In [13]:
# Keep only the rows where the importance weight pi1 / pi0 is well defined:
# both propensities are finite and pi0 (the denominator) is not ~0.
only_goods_mask = (~np.isclose(pi0, 0.0, atol=1e-4)) & np.isfinite(pi0) & np.isfinite(pi1)

# Filter into new names instead of overwriting pi0 / pi1: the cell then stays
# idempotent (re-running it previously mismatched the mask against y_test).
pi0_valid = pi0[only_goods_mask]
pi1_valid = pi1[only_goods_mask]
clicks_valid = y_test[only_goods_mask]

# Clipping threshold lambda = 10, as given in the task statement.
lmbd = 10
cips = np.mean(np.clip(pi1_valid / pi0_valid, a_min=None, a_max=lmbd) * clicks_valid)
cips

0.0432459992614343