In [23]:
import pandas as pd
from scipy import sparse
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm

import os
os.environ['USER'] = 'xlearn'

In [149]:
%pip install xlearn

Note: you may need to restart the kernel to use updated packages.


In [24]:
data = pd.read_csv('../data/data.csv')
data.head(10)

Unnamed: 0,date_time,zone_id,banner_id,oaid_hash,campaign_clicks,os_id,country_id,banner_id0,rate0,g0,coeff_sum0,banner_id1,rate1,g1,coeff_sum1,impressions,clicks
0,2021-09-27 00:01:30.000000,0,0,5664530014561852622,0,0,0,1240,0.067,0.035016,-7.268846,0,0.01,0.049516,-5.369901,1,1
1,2021-09-26 22:54:49.000000,1,1,5186611064559013950,0,0,1,1,0.002,0.054298,-2.657477,269,0.004,0.031942,-4.44922,1,1
2,2021-09-26 23:57:20.000000,2,2,2215519569292448030,3,0,0,2,0.014,0.014096,-3.824875,21,0.014,0.014906,-3.939309,1,1
3,2021-09-27 00:04:30.000000,3,3,6262169206735077204,0,1,1,3,0.012,0.015232,-3.461357,99,0.006,0.050671,-3.418403,1,1
4,2021-09-27 00:06:21.000000,4,4,4778985830203613115,0,1,0,4,0.019,0.051265,-4.009026,11464230,6.79,0.032005,-2.828797,1,1
5,2021-09-27 00:06:50.000000,5,5,2377014068362699676,0,2,2,5,0.004,0.337634,-3.222757,37,0.004,0.338195,-3.221755,1,1
6,2021-09-27 00:07:34.000000,6,6,6863358899511896876,0,3,0,6,0.002,0.033805,-3.063872,29,0.002,0.037688,-3.111623,1,1
7,2021-09-27 00:08:49.000000,7,7,2876502170484631685,0,4,1,7,0.01,0.026041,-2.50906,11464231,0.45,0.020563,-2.753571,1,1
8,2021-09-27 00:09:08.000000,8,8,5839858970958967275,0,4,3,8,0.02,0.033933,-3.888843,243,0.02,0.03167,-3.923608,1,1
9,2021-09-27 00:09:16.000000,1,9,4868455078459394303,0,4,4,9,0.01,0.079909,-2.997711,11464232,0.67,0.022754,-1.840219,1,1


In [25]:
# удалим признаки, которые не нужны по условию
columns_to_drop = ['banner_id0', 'banner_id1', 'rate0', 'rate1', 'g0', 'g1', 'coeff_sum0', 'coeff_sum1']
data = data.drop(columns=columns_to_drop)

data['date_time'] = pd.to_datetime(data['date_time'])
data.head()

Unnamed: 0,date_time,zone_id,banner_id,oaid_hash,campaign_clicks,os_id,country_id,impressions,clicks
0,2021-09-27 00:01:30,0,0,5664530014561852622,0,0,0,1,1
1,2021-09-26 22:54:49,1,1,5186611064559013950,0,0,1,1,1
2,2021-09-26 23:57:20,2,2,2215519569292448030,3,0,0,1,1
3,2021-09-27 00:04:30,3,3,6262169206735077204,0,1,1,1,1
4,2021-09-27 00:06:21,4,4,4778985830203613115,0,1,0,1,1


In [26]:
def analysis(data: pd.DataFrame):
    # проверим вначале на наличие NaN-значений
    null_values = data.isnull().sum().sum()
    print(f"Count of NaN {null_values}")
    
    # посмотрим краткое описание данных, заметим, что impressions бесполезен, потому что константа
    print(data.describe())
    print()
    
    # посмотрим на временной интервал: 
    print(data['date_time'].dt.date.value_counts())
    print()
    # Заметим, что 2021-09-01 скорее всего лишний, потому что он в единственном экземпляре и выбивается из интервала
    
    # Посмотрим на категориальные фичи, непопулярные будем объединять в одну группу, совсем редкие будем удалять
    print(f"Unique {data['zone_id'].nunique()}")
    print(data['zone_id'].value_counts())
    print()
    # некоторых зон совсем мало, можно почистить
    
    print(f"Unique {data['oaid_hash'].nunique()}")
    print(data['oaid_hash'].value_counts())
    print()
    # oaid_hash трогать не будем TODO
    
    print(f"Unique {data['banner_id'].nunique()}")
    print(data['banner_id'].value_counts())
    print()
    # некоторых баннеров совсем мало, можно почистить
    
    print(f"Unique {data['os_id'].nunique()}")
    print(data['os_id'].value_counts())
    print()
    # с 7 по 10 можно объединить
    
    print(f"Unique {data['country_id'].nunique()}")
    print(data['country_id'].value_counts())
    print()
    # со странами все хорошо
    

In [27]:
analysis(data)

Count of NaN 0
            zone_id     banner_id     oaid_hash  campaign_clicks  \
count  1.582147e+07  1.582147e+07  1.582147e+07     1.582147e+07   
mean   8.152679e+01  3.816483e+02  4.610505e+18     6.238540e-01   
std    1.632448e+02  3.959386e+02  2.663858e+18     9.249152e+00   
min    0.000000e+00  0.000000e+00  1.116911e+12     0.000000e+00   
25%    1.400000e+01  5.200000e+01  2.297977e+18     0.000000e+00   
50%    1.900000e+01  2.170000e+02  4.614236e+18     0.000000e+00   
75%    6.000000e+01  6.110000e+02  6.914243e+18     0.000000e+00   
max    3.443000e+03  1.632000e+03  9.223371e+18     8.290000e+02   

              os_id    country_id  impressions        clicks  
count  1.582147e+07  1.582147e+07   15821472.0  1.582147e+07  
mean   1.840605e+00  4.346986e+00          1.0  2.668835e-02  
std    1.530005e+00  4.317701e+00          0.0  1.611710e-01  
min    0.000000e+00  0.000000e+00          1.0  0.000000e+00  
25%    1.000000e+00  0.000000e+00          1.0  0.000000e

In [28]:
categorical_features = ["zone_id", "banner_id", "oaid_hash" ,"os_id", "country_id", "hour"]

In [29]:
def feature_engineering(data: pd.DataFrame) -> pd.DataFrame:
    # производим чистку данных
    data = data.drop(columns=["impressions"])
    data = data[data['date_time'] > '2021-09-02']
    # добавим часы как признак
    data['hour'] = data['date_time'].dt.hour
    
    # То что ниже порога зануляем, объединяем некоторые признаки
    threshold = 0.01
    for field in categorical_features:
        if field == 'os_id':
            data.loc[data['os_id'] > 7, 'os_id'] = 7
        elif field != "oaid_hash":
            column = data[field].value_counts()
            total = column.sum()
            condition = column < total * 0.0001 
            mask_obs = column[condition].index
            mask = data[field].isin(mask_obs)
            data.loc[mask, field] = 0
            
    
    for field in categorical_features:
        data[field], _ = pd.factorize(data[field])
    return data

In [30]:
data = feature_engineering(data)
data

Unnamed: 0,date_time,zone_id,banner_id,oaid_hash,campaign_clicks,os_id,country_id,clicks,hour
0,2021-09-27 00:01:30,0,0,0,0,0,0,1,0
1,2021-09-26 22:54:49,1,1,1,0,0,1,1,1
2,2021-09-26 23:57:20,2,2,2,3,0,0,1,2
3,2021-09-27 00:04:30,3,3,3,0,1,1,1,0
4,2021-09-27 00:06:21,4,4,4,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...
15821467,2021-10-02 15:51:35,124,424,884244,0,2,9,0,8
15821468,2021-09-27 22:03:14,12,21,6510314,0,1,6,0,1
15821469,2021-10-02 17:41:10,12,713,1343205,0,2,0,0,19
15821470,2021-09-29 00:39:32,0,20,1138088,0,0,0,0,0


Делим выборку на тестовую, обучение и валидацию

In [31]:
data = data.sort_values("date_time")
max_date = data["date_time"].iloc[-1].date()
data_test = data[(data['date_time'].dt.date == max_date)]
data = data[(data['date_time'].dt.date < max_date)]
max_date

datetime.date(2021, 10, 2)

In [32]:
data = data.sort_values("date_time")
max_date = data["date_time"].iloc[-1].date()
data_val = data[(data['date_time'].dt.date == max_date)]
data = data[(data['date_time'].dt.date < max_date)]
max_date

datetime.date(2021, 10, 1)

In [33]:
data

Unnamed: 0,date_time,zone_id,banner_id,oaid_hash,campaign_clicks,os_id,country_id,clicks,hour
8423230,2021-09-26 00:00:00,0,143,4262597,0,1,12,0,0
14840335,2021-09-26 00:00:00,80,94,6246829,0,1,1,0,0
12701328,2021-09-26 00:00:00,180,20,5640288,0,0,0,0,0
13054066,2021-09-26 00:00:00,0,7,3473140,0,4,1,0,0
10652366,2021-09-26 00:00:00,2,178,5012265,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
11159519,2021-09-30 23:59:59,139,147,638058,0,0,1,0,2
3842833,2021-09-30 23:59:59,71,207,417101,0,3,0,0,2
6198955,2021-09-30 23:59:59,15,183,3422951,0,0,15,0,2
12346,2021-09-30 23:59:59,64,20,10899,0,0,0,1,2


In [34]:
data_val

Unnamed: 0,date_time,zone_id,banner_id,oaid_hash,campaign_clicks,os_id,country_id,clicks,hour
7465011,2021-10-01 00:00:00,14,0,890281,0,0,0,0,0
52924,2021-10-01 00:00:00,14,79,40552,0,2,3,1,0
4676591,2021-10-01 00:00:00,14,324,2774924,0,1,3,0,0
6811089,2021-10-01 00:00:00,139,46,2829244,0,0,0,0,0
11268423,2021-10-01 00:00:00,1,452,5206714,0,0,8,0,0
...,...,...,...,...,...,...,...,...,...
6809215,2021-10-01 23:59:59,197,96,3664278,0,4,1,0,2
9789489,2021-10-01 23:59:59,12,157,3479550,0,2,7,0,2
14917859,2021-10-01 23:59:59,17,72,2470551,0,1,3,0,2
10913431,2021-10-01 23:59:59,17,49,343914,1,2,5,0,2


In [35]:
data_test

Unnamed: 0,date_time,zone_id,banner_id,oaid_hash,campaign_clicks,os_id,country_id,clicks,hour
9767447,2021-10-02 00:00:00,0,72,3407921,0,1,3,0,0
13846765,2021-10-02 00:00:00,14,37,5970363,0,1,0,0,0
3091651,2021-10-02 00:00:00,70,7,2011699,0,4,1,0,0
10045990,2021-10-02 00:00:00,17,124,248745,0,2,0,0,0
9054327,2021-10-02 00:00:00,23,354,4482692,0,1,10,0,0
...,...,...,...,...,...,...,...,...,...
1745969,2021-10-02 23:59:59,0,717,1250821,0,6,0,0,2
13959634,2021-10-02 23:59:59,70,200,590233,2,3,0,0,2
13319080,2021-10-02 23:59:59,20,717,28576,0,6,0,0,2
3336944,2021-10-02 23:59:59,23,440,2137280,0,0,12,0,2


In [36]:
# приведем данные к формату libffm 
# https://github.com/ycjuan/libffm/blob/master/README
def transform_to_libffm(filename, data, target, dict_field, current_code=0):
    with open(filename, "w") as f:
        for index, row in tqdm(data.iterrows()):
            result_row = str(row[target])
            for i, x in enumerate(categorical_features):
                if (x not in dict_field):
                    dict_field[x] = {}
                    current_code += 1
                    dict_field[x][row[x]] = current_code
                elif (row[x] not in dict_field[x]):
                    current_code += 1
                    dict_field[x][row[x]] = current_code
                result_row += f' {i}:{dict_field[x][row[x]]}:1'
            result_row += '\n'
            f.write(result_row)
    return current_code, dict_field

Теперь с помощью нашей функции приведем данные к формату для библиотеки

In [37]:
current_code = 0
dict_field = {}
current_code, dict_field = transform_to_libffm("train.txt", data, 'clicks', dict_field, current_code)

12049045it [21:28, 9353.27it/s] 


In [38]:
current_code, dict_field = transform_to_libffm("test.txt", data_test, 'clicks', dict_field, current_code)

2128978it [03:39, 9687.32it/s] 


In [39]:
current_code, dict_field = transform_to_libffm("val.txt", data_val, 'clicks', dict_field, current_code)

1643448it [02:56, 9295.78it/s]


In [40]:
y_test = data_test['clicks']
y_val = data_val['clicks']

Добавим y_test, y_val, чтобы посчитать метрики у итоговой модели

Дальше сделаем кросс-валидацию по размеру эмбеддинга и регуляризации

In [43]:
import xlearn as xl
import numpy as np
from sklearn.metrics import roc_auc_score, log_loss

for k in [2, 4, 8]:
    for l in [0.00001, 0.0001, 0.001]:
        print(f'k = {k}, lambda = {l}')
        ffm_model = xl.create_ffm()
        ffm_model.setTrain("train.txt")
        ffm_model.setTest("val.txt")
        param = {'task':'binary', 'lr': 0.1, 'lambda': l, 'k': k, 'metric': 'auc'}

        ffm_model.fit(param, './model.out')
        ffm_model.setSigmoid()
        ffm_model.predict('./model.out', './output.txt')

        with open('output.txt', 'r') as f:
            y_pred_proba = np.array(list(map(float, filter(lambda s: len(s) > 0, f.read().split('\n')))))
        roc_auc_metric = roc_auc_score(y_val, y_pred_proba)
        log_loss_metric = log_loss(y_val, y_pred_proba)
        print(f'roc_auc = {roc_auc_metric}, log_loss = {log_loss_metric}')


k = 2, lambda = 1e-05
roc_auc = 0.7229546202966786, log_loss = 0.1978726188503258
k = 2, lambda = 0.0001
roc_auc = 0.7786576853301896, log_loss = 0.16060868457494457
k = 2, lambda = 0.001
roc_auc = 0.7981328098433704, log_loss = 0.15307253862701306
k = 4, lambda = 1e-05
roc_auc = 0.7173408718200777, log_loss = 0.20513695607106175
k = 4, lambda = 0.0001
roc_auc = 0.7892462227054867, log_loss = 0.15986964446827562
k = 4, lambda = 0.001
roc_auc = 0.7990586168741912, log_loss = 0.15302002313741653
k = 8, lambda = 1e-05
roc_auc = 0.7529677354906678, log_loss = 0.2008761226840055
k = 8, lambda = 0.0001
roc_auc = 0.7735381330182368, log_loss = 0.16521077575074392
k = 8, lambda = 0.001
roc_auc = 0.7996903902527646, log_loss = 0.152736369526566


In [44]:
best_k = 4
best_lambda = 0.001

Возьмем лучшие значения. Удалось сильно улучшиться
Было:

Linear: roc_auc = 0.7231544152919693, log_loss = 0.14187426057111957

Baseline: roc_auc = 0.5, log_loss = 0.15486198009919555

In [46]:
ffm_model = xl.create_ffm()
ffm_model.setTrain("train.txt")
ffm_model.setTest("test.txt")
param = {'task':'binary', 'lr': 0.1, 'lambda': best_lambda, 'k': best_k, 'metric': 'auc'}

ffm_model.fit(param, './model.out')
ffm_model.setSigmoid()
ffm_model.predict('./model.out', './output.txt')

with open('output.txt', 'r') as f:
    y_pred_proba = np.array(list(map(float, filter(lambda s: len(s) > 0, f.read().split('\n')))))
roc_auc_metric = roc_auc_score(y_test, y_pred_proba)
log_loss_metric = log_loss(y_test, y_pred_proba)
print(f'roc_auc = {roc_auc_metric}, log_loss = {log_loss_metric}')

roc_auc = 0.7827924877047462, log_loss = 0.13691093308613123
