In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import OneHotEncoder


In [2]:
# Read the whole CSV, then keep a 1% random sample of the rows.
# The fixed random_state makes the sample identical on every run.
data = (
    pd.read_csv('../data/data.csv')
    .sample(frac=0.01, random_state=1)
)
data.head(n=10)

Unnamed: 0,date_time,zone_id,banner_id,oaid_hash,campaign_clicks,os_id,country_id,banner_id0,rate0,g0,coeff_sum0,banner_id1,rate1,g1,coeff_sum1,impressions,clicks
8346547,2021-09-27 14:40:08.000000,17,622,3893226935960107276,0,4,7,622,0.05,0.083065,-3.741721,650,0.05,0.075785,-3.726563,1,0
9394680,2021-09-26 18:46:37.000000,19,1235,7846425171453325314,0,5,0,1235,0.067,0.017585,-6.087682,1240,0.067,0.017566,-6.090274,1,0
13024376,2021-10-02 23:09:07.000000,315,116,9121473753705184179,0,0,12,116,0.001,0.247624,-4.572262,594,0.003,0.239873,-5.467989,1,0
15806667,2021-09-29 14:27:36.000000,14,671,1660532342692348167,0,1,0,671,0.009,0.011121,-4.191673,14620708,1.4,0.00598,-3.556881,1,0
14459047,2021-09-29 19:41:17.000000,14,63,722574788548885921,2,2,0,63,0.03,0.033995,-3.893601,40,0.03,0.039212,-3.97383,1,0
3825241,2021-09-28 18:40:11.000000,17,52,5334760109874209075,0,2,5,52,0.008,0.009826,-3.960855,12231176,0.137,0.0,0.0,1,0
10129887,2021-09-26 15:14:34.000000,113,16,3542127356391604262,0,1,12,16,0.004,0.035155,-4.533533,22,0.004,0.034615,-4.563424,1,0
8479671,2021-09-27 23:32:36.000000,76,29,4064698097986638679,0,3,0,29,0.002,0.018906,-3.129625,6,0.002,0.01813,-3.140339,1,0
13549374,2021-10-02 20:54:51.000000,143,89,8281971653706806920,0,4,1,89,0.01,0.037451,-2.536086,14171614,8.0,0.045791,-2.76704,1,0
6952612,2021-09-26 04:16:46.000000,19,3,6600244079662587812,32,2,1,3,0.012,0.009809,-5.032157,58,0.003,0.049213,-3.978318,1,0


Сразу удалим признаки, которые по условию задачи не нужны.
Преобразуем дату в тип datetime.

In [3]:
# Columns removed up front (the notebook narrative says the task statement
# marks them as unneeded).
columns_to_drop = [
    "oaid_hash",
    "banner_id0", "banner_id1",
    "rate0", "rate1",
    "g0", "g1",
    "coeff_sum0", "coeff_sum1",
]
data = (
    data
    .drop(columns=columns_to_drop)
    # Parse the timestamp strings so the .dt accessor can be used later.
    .assign(date_time=lambda frame: pd.to_datetime(frame['date_time']))
)

Посмотрим, сколько уникальных значений у каждого признака.
Заметим, что impressions — константа. Удалим этот признак.

In [4]:
# Number of distinct values per column; a count of 1 marks a constant column.
unique_counts = data.nunique()
print(unique_counts)

date_time          136324
zone_id              1086
banner_id            1153
campaign_clicks       202
os_id                   9
country_id             17
impressions             1
clicks                  2
dtype: int64


In [5]:
# `impressions` has a single unique value in this sample, so it carries no signal.
data = data.drop("impressions", axis="columns")

Выделим последний день в valid_raw, а все остальные дни — в train_raw.
train_c — категориальные признаки из train_raw, valid_c — категориальные признаки из valid_raw.

In [6]:
# Time-based hold-out: rows from the latest calendar day form the validation
# set, rows from every other day form the training set.
event_day = data['date_time'].dt.date
is_last_day = event_day == event_day.max()
train_raw = data[~is_last_day]
valid_raw = data[is_last_day]

# Drop the target, `campaign_clicks` and the timestamp; the remaining columns
# are the ones one-hot encoded as categorical features.
non_categorical = ['clicks', 'campaign_clicks', 'date_time']
train_c = train_raw.drop(columns=non_categorical)
valid_c = valid_raw.drop(columns=non_categorical)

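Закодируем категории с помощью One-Hot-Encoding. Кодировщик обучаем только на train; категории, которых не было в train, кодируются нулями (handle_unknown='ignore').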

In [7]:
# Learn the category vocabulary from the training rows only.
# handle_unknown='ignore': categories unseen during fit do not raise at
# transform time.
encoder = OneHotEncoder(handle_unknown='ignore')
ohe = encoder.fit(train_c)

In [8]:
# Encode both splits with the encoder fitted on the training split.
train_c_ohe_matrix = ohe.transform(train_c)
valid_c_ohe_matrix = ohe.transform(valid_c)

In [9]:
# Wrap the encoded matrices in sparse DataFrames with synthetic column names
# f_0 .. f_{n-1}, so that columns can later be selected by name.
n_encoded = train_c_ohe_matrix.shape[1]
col_names = ['f_' + str(position) for position in range(n_encoded)]
train_c_ohe, valid_c_ohe = (
    pd.DataFrame.sparse.from_spmatrix(matrix, columns=col_names)
    for matrix in (train_c_ohe_matrix, valid_c_ohe_matrix)
)

Чтобы признаков не было слишком много, отфильтруем их с помощью статистического теста (SelectKBest, по умолчанию — F-тест ANOVA, f_classif) и оставим 1000 признаков.

Также я пробовал перед этим этапом добавить попарные комбинации признаков (полиномиальные признаки), но на результат это почти не повлияло, поэтому здесь они не используются.

In [10]:
# Score every one-hot column against the click target on the training split
# and keep the 1000 best-scoring columns (SelectKBest with its default
# score function).
selector = SelectKBest(k=1000)
selector.fit(train_c_ohe, train_raw['clicks'])

# Apply the same column subset, by name, to both splits.
cols = selector.get_feature_names_out().tolist()
train_c_filtered = train_c_ohe.loc[:, cols]
valid_c_filtered = valid_c_ohe.loc[:, cols]

Пробовал добавить к категориальным признакам 'campaign_clicks' и 'date_time', но на результат это почти не повлияло, поэтому здесь они не используются.

In [11]:
# Final design matrices and click targets for the two splits.
x_train, y_train = train_c_filtered, train_raw['clicks']
x_valid, y_valid = valid_c_filtered, valid_raw['clicks']

# Sanity check: row counts of features and target must match within a split.
for features, target in ((x_train, y_train), (x_valid, y_valid)):
    print(f'{features.shape}  {target.shape}')

(137221, 1000)  (137221,)
(20994, 1000)  (20994,)


Возьмем логистическую регрессию с солвером liblinear (не SGD).

In [12]:
def get_model(c):
    """Build an unfitted liblinear logistic regression.

    Args:
        c: Value passed to ``LogisticRegression`` as ``C``.

    Returns:
        A new ``LogisticRegression`` with ``solver='liblinear'`` and
        ``random_state=0``.
    """
    settings = {'solver': 'liblinear', 'C': c, 'random_state': 0}
    return LogisticRegression(**settings)

Посчитаем метрики 'neg_log_loss', 'roc_auc', 'accuracy' для разных значений C (C — величина, обратная силе регуляризации).
Для каждого значения делаем кросс-валидацию, смотрим на средние значения метрик и выбираем C с наибольшим средним ROC AUC.

In [14]:
# Candidate C values: 0.01, 0.1, 1, 10, 100.
cs = [10 ** exponent for exponent in range(-2, 3)]
metric_names = ['neg_log_loss', 'roc_auc', 'accuracy']

# Track the C with the highest mean cross-validated ROC AUC.
max_roc, best_c = -1, -1
for c in cs:
    model = get_model(c)
    scores = cross_validate(model, x_train, y_train, scoring=metric_names, cv=4, n_jobs=6)

    # Average each metric over the folds; the scorer returns *negative*
    # log loss, so flip the sign for reporting.
    mean_ll = -np.mean(scores['test_neg_log_loss'])
    mean_roc = np.mean(scores['test_roc_auc'])
    mean_acc = np.mean(scores['test_accuracy'])

    # Strict comparison: on a tie the earlier (smaller) C is kept.
    if mean_roc > max_roc:
        max_roc, best_c = mean_roc, c

    print(f"c = {c}")
    print(f"ll = {mean_ll}")
    print(f"roc = {mean_roc}")
    print(f"acc = {mean_acc}")
    print("--------------------------------------")

print(f'the best C = {best_c}')

c = 0.01
ll = 0.11249175986058639
roc = 0.6948644536317352
acc = 0.9745155626420524
--------------------------------------
c = 0.1
ll = 0.10873682250213333
roc = 0.7311772064193853
acc = 0.9745155626420524
--------------------------------------
c = 1
ll = 0.10714499107872215
roc = 0.7460277966515938
acc = 0.9746030130261156
--------------------------------------
c = 10
ll = 0.10848929232803671
roc = 0.7415974510066349
acc = 0.9745155634917659
--------------------------------------
c = 100
ll = 0.11132769858205033
roc = 0.7338563606524351
acc = 0.9744208257527209
--------------------------------------
the best C = 1


Проверим модель на valid-части.

In [15]:
# Refit on the full training split with the C chosen by cross-validation.
model = get_model(best_c)
model.fit(x_train, y_train)

# Hard class labels; not used by the metrics below.
y_pred = model.predict(x_valid)

# Compute the class probabilities once and reuse them for both metrics,
# instead of running predict_proba over the validation set twice.
valid_proba = model.predict_proba(x_valid)
auc_model = roc_auc_score(y_valid, valid_proba[:, 1])  # column 1 is used as the score
ll_model = log_loss(y_valid, valid_proba)
print(f'auc = {auc_model}')
print(f'log_loss = {ll_model}')

auc = 0.7753422095428824
log_loss = 0.1327882488762714


Посчитаем метрики для бейзлайна: константное предсказание, равное средней доле кликов в train.

In [16]:
# Constant baseline: predict the mean of the training target for every
# validation row.
train_click_rate = np.mean(y_train)
y_pred_base = np.full(shape=y_valid.shape, fill_value=train_click_rate)

auc_base = roc_auc_score(y_valid, y_pred_base)
ll_base = log_loss(y_valid, y_pred_base)
print(f'auc = {auc_base}')
print(f'log_loss = {ll_base}')

auc = 0.5
log_loss = 0.1533867776563465


Мы победили бейзлайн: на valid-части log loss 0.133 против 0.153, ROC AUC 0.775 против 0.5.