In [1]:
import pandas as pd
from scipy import sparse
from sklearn.preprocessing import OneHotEncoder

In [2]:
data = pd.read_csv('../data/data.csv')
data.head(10)

Unnamed: 0,date_time,zone_id,banner_id,oaid_hash,campaign_clicks,os_id,country_id,banner_id0,rate0,g0,coeff_sum0,banner_id1,rate1,g1,coeff_sum1,impressions,clicks
0,2021-09-27 00:01:30.000000,0,0,5664530014561852622,0,0,0,1240,0.067,0.035016,-7.268846,0,0.01,0.049516,-5.369901,1,1
1,2021-09-26 22:54:49.000000,1,1,5186611064559013950,0,0,1,1,0.002,0.054298,-2.657477,269,0.004,0.031942,-4.44922,1,1
2,2021-09-26 23:57:20.000000,2,2,2215519569292448030,3,0,0,2,0.014,0.014096,-3.824875,21,0.014,0.014906,-3.939309,1,1
3,2021-09-27 00:04:30.000000,3,3,6262169206735077204,0,1,1,3,0.012,0.015232,-3.461357,99,0.006,0.050671,-3.418403,1,1
4,2021-09-27 00:06:21.000000,4,4,4778985830203613115,0,1,0,4,0.019,0.051265,-4.009026,11464230,6.79,0.032005,-2.828797,1,1
5,2021-09-27 00:06:50.000000,5,5,2377014068362699676,0,2,2,5,0.004,0.337634,-3.222757,37,0.004,0.338195,-3.221755,1,1
6,2021-09-27 00:07:34.000000,6,6,6863358899511896876,0,3,0,6,0.002,0.033805,-3.063872,29,0.002,0.037688,-3.111623,1,1
7,2021-09-27 00:08:49.000000,7,7,2876502170484631685,0,4,1,7,0.01,0.026041,-2.50906,11464231,0.45,0.020563,-2.753571,1,1
8,2021-09-27 00:09:08.000000,8,8,5839858970958967275,0,4,3,8,0.02,0.033933,-3.888843,243,0.02,0.03167,-3.923608,1,1
9,2021-09-27 00:09:16.000000,1,9,4868455078459394303,0,4,4,9,0.01,0.079909,-2.997711,11464232,0.67,0.022754,-1.840219,1,1


In [3]:
# удалим признаки, которые не нужны по условию
columns_to_drop = ['oaid_hash', 'banner_id0', 'banner_id1', 'rate0', 'rate1', 'g0', 'g1', 'coeff_sum0', 'coeff_sum1']
data = data.drop(columns=columns_to_drop)

data['date_time'] = pd.to_datetime(data['date_time'])
data.head()

Unnamed: 0,date_time,zone_id,banner_id,campaign_clicks,os_id,country_id,impressions,clicks
0,2021-09-27 00:01:30,0,0,0,0,0,1,1
1,2021-09-26 22:54:49,1,1,0,0,1,1,1
2,2021-09-26 23:57:20,2,2,3,0,0,1,1
3,2021-09-27 00:04:30,3,3,0,1,1,1,1
4,2021-09-27 00:06:21,4,4,0,1,0,1,1


In [4]:
def analysis(data: pd.DataFrame):
    # проверим вначале на наличие NaN-значений
    null_values = data.isnull().sum().sum()
    print(f"Count of NaN {null_values}")
    
    # посмотрим краткое описание данных, заметим, что impressions бесполезен, потому что константа
    print(data.describe())
    print()
    
    # посмотрим на временной интервал: 
    print(data['date_time'].dt.date.value_counts())
    print()
    # Заметим, что 2021-09-01 скорее всего лишний, потому что он в единственном экземпляре и выбивается из интервала
    
    # Посмотрим на категориальные фичи, непопулярные будем объединять в одну группу, совсем редкие будем удалять
    print(f"Unique {data['zone_id'].nunique()}")
    print(data['zone_id'].value_counts())
    print()
    # некоторых зон совсем мало, можно почистить
    
    print(f"Unique {data['banner_id'].nunique()}")
    print(data['banner_id'].value_counts())
    print()
    # некоторых баннеров совсем мало, можно почистить
    
    print(f"Unique {data['os_id'].nunique()}")
    print(data['os_id'].value_counts())
    print()
    # с 7 по 10 можно объединить
    
    print(f"Unique {data['country_id'].nunique()}")
    print(data['country_id'].value_counts())
    print()
    # со странами все хорошо
    

In [5]:
analysis(data)

Count of NaN 0
            zone_id     banner_id  campaign_clicks         os_id  \
count  1.582147e+07  1.582147e+07     1.582147e+07  1.582147e+07   
mean   8.152679e+01  3.816483e+02     6.238540e-01  1.840605e+00   
std    1.632448e+02  3.959386e+02     9.249152e+00  1.530005e+00   
min    0.000000e+00  0.000000e+00     0.000000e+00  0.000000e+00   
25%    1.400000e+01  5.200000e+01     0.000000e+00  1.000000e+00   
50%    1.900000e+01  2.170000e+02     0.000000e+00  2.000000e+00   
75%    6.000000e+01  6.110000e+02     0.000000e+00  3.000000e+00   
max    3.443000e+03  1.632000e+03     8.290000e+02  1.000000e+01   

         country_id  impressions        clicks  
count  1.582147e+07   15821472.0  1.582147e+07  
mean   4.346986e+00          1.0  2.668835e-02  
std    4.317701e+00          0.0  1.611710e-01  
min    0.000000e+00          1.0  0.000000e+00  
25%    0.000000e+00          1.0  0.000000e+00  
50%    4.000000e+00          1.0  0.000000e+00  
75%    7.000000e+00          

In [6]:
categorical_features = ["zone_id", "banner_id", "os_id", "country_id", "hour"]

In [7]:
def feature_engineering(data: pd.DataFrame) -> pd.DataFrame:
    # производим чистку данных
    data = data.drop(columns=["impressions"])
    data = data[data['date_time'] > '2021-09-01']
    # добавим часы как признак
    data['hour'] = data['date_time'].dt.hour
    
    # То что ниже порога зануляем, объединяем некоторые признаки
    threshold = 0.01
    for field in categorical_features:
        if field == 'os_id':
            data.loc[data['os_id'] < 7, 'os_id'] = 7
        else:
            column = data[field].value_counts()
            total = column.sum()
            condition = column < total * 0.01  
            mask_obs = column[condition].index
            mask = data[field].isin(mask_obs)
            data.loc[mask, field] = 0
        
    # разделение на train - test, оставляем последний день в test
    X_train = data[data['date_time'] < '2021-10-02']
    X_test = data[data['date_time'] >= '2021-10-02']
    y_train = X_train['clicks']
    y_test = X_test['clicks']
    X_train = X_train.drop(columns=['date_time', "clicks"])
    X_test = X_test.drop(columns=['date_time', "clicks"])

    # делаем ван хот энкодинг категориальных фичей, с ними и обучаемся
    ohe = OneHotEncoder(handle_unknown='ignore')
    X_train = ohe.fit_transform(X_train[categorical_features])
    X_test = ohe.transform(X_test[categorical_features])
    return X_train, y_train, X_test, y_test

In [8]:
X_train, y_train, X_test, y_test = feature_engineering(data)

In [9]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
X_train

(13692494, 83) (13692494,)
(2128978, 83) (2128978,)


<13692494x83 sparse matrix of type '<class 'numpy.float64'>'
	with 68462470 stored elements in Compressed Sparse Row format>

In [10]:
from sklearn.linear_model import LogisticRegression

# Используем логистическую регрессию с 'liblinear', упомянутую на лекции, он не является SGD, 
# так же взяли l2 регуляризацию
def create_model(C=0.01):
    return LogisticRegression(solver='liblinear', C=C, penalty="l2")

In [15]:
from sklearn.model_selection import cross_validate
import numpy as np

# Кросс-валидация, параметр C перебираем
def cv(X_train, y_train):
    for C in [0.01, 0.1, 1]:
        model = create_model(C=C)
        scores = cross_validate(model, X_train, y_train, scoring=['neg_log_loss', 'roc_auc'], cv=5, n_jobs=6)
        print(f"C = {C}, neg_los_loss = {np.mean(scores['test_neg_log_loss'])}, roc_auc = {np.mean(scores['test_roc_auc'])}")


In [16]:
cv(X_train, y_train)

C = 0.01, neg_los_loss = -0.1236870709315907, roc_auc = 0.5539889975675472
C = 0.1, neg_los_loss = -0.12525274885119128, roc_auc = 0.5542056681512906
C = 1, neg_los_loss = -0.12562948167915905, roc_auc = 0.5546731219737013


In [19]:
# Сравниваем 2 модели, получаем, что наша стала лучше базового решения
from sklearn.metrics import roc_auc_score, log_loss

model = create_model(C=0.01).fit(X_train, y_train)
y_pred_proba = model.predict_proba(X_test)[:, 1]
roc_auc_metric = roc_auc_score(y_test, y_pred_proba)
log_loss_metric = log_loss(y_test, y_pred_proba)
print(f"roc_auc = {roc_auc_metric}, log_loss = {log_loss_metric}")

roc_auc = 0.7231544152919693, log_loss = 0.14187426057111957


In [20]:
# Бейзлайн (берем среднее)

y_pred_baseline = np.full(y_pred_proba.shape, np.mean(y_train))
roc_auc_metric_baseline = roc_auc_score(y_test, y_pred_baseline)
log_loss_metric_baseline = log_loss(y_test, y_pred_baseline)
print(f"roc_auc = {roc_auc_metric_baseline}, log_loss = {log_loss_metric_baseline}")

roc_auc = 0.5, log_loss = 0.15486198009919555
