подключим библиотеки, загрузим датасет

In [1]:
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [2]:
!pip3 install xlearn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import sys
import os
import numpy as np
import pandas as pd
import xlearn as xl
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import train_test_split
from tempfile import TemporaryDirectory
from sklearn.metrics import roc_auc_score, log_loss
os.environ['USER'] = 'xlearn'

In [4]:
tmpdir = TemporaryDirectory()
data_path = tmpdir.name
train_file = os.path.join(data_path, "train.txt")
valid_file = os.path.join(data_path, "valid.txt")
test_file = os.path.join(data_path, "test.txt")
model_file = os.path.join(data_path, "model.out")
output_file = os.path.join(data_path, "output.txt")


In [5]:
data = pd.read_csv('/gdrive/MyDrive/data/data.csv').sample(frac=0.1, random_state=1)

Сразу удалим ненужные по условию фичи. А так же  "impressions", "campaign_clicks" .
Преобразуем дату в удобный тип.

In [6]:
data = data.drop(columns=["banner_id0", "banner_id1", "rate0", "rate1", "g0", "g1", "coeff_sum0", "coeff_sum1", "impressions"])
#, "campaign_clicks"
data['date_time'] = pd.to_datetime(data['date_time'])

Выделим в test_raw часть последний день, в train_valid_raw - все кроме последнего. 

разобьем train_valid_raw на train, valid .

Сделаем чтобы 'clicks' был первой колонкой. Удалим "date_time"

In [7]:
train_valid_raw = data[data['date_time'].dt.date != data['date_time'].dt.date.max()]
test_raw = data[data['date_time'].dt.date == data['date_time'].dt.date.max()]

train_valid = pd.concat([train_valid_raw['clicks'], train_valid_raw.drop(columns=['date_time', 'clicks'])], axis=1)
train, valid = train_test_split(train_valid, test_size=0.25, random_state=42)


test = pd.concat([test_raw['clicks'], test_raw.drop(columns=['date_time', 'clicks'])], axis=1)

Переведем наши dataframe в libffm формат

In [8]:
class LibFFMEncoder(object):
    def __init__(self):
        self.en = {}

    def encode(self, row):
        new_row = f"{row[0]}"
        for i, r in enumerate(row[1:]):
            if i not in self.en:
              self.en[i] = {}
            if r not in self.en[i]:
              self.en[i][r] = len(self.en[i])
            new_row += f' {i+1}:{self.en[i][r]}:1'
        return new_row

In [9]:
encoder = LibFFMEncoder()
train_ffm = train.apply(lambda row: encoder.encode(row), raw=True, axis=1)
valid_ffm = valid.apply(lambda row: encoder.encode(row), raw=True, axis=1)
test_ffm = test.apply(lambda row: encoder.encode(row), raw=True, axis=1)

сохраним все в файлы. (нужно для xlearn)

In [10]:
train_ffm.to_csv(train_file, index=False, header=False)
valid_ffm.to_csv(valid_file, index=False, header=False)
test_ffm.to_csv(test_file, index=False, header=False)

попробуем перебрать параметры. 
к - длина вектора

In [11]:
param_dict = {
    "lr": [0.1, 0.2],
    "lambda": [0.0001, 0.001],
    "k" : [3, 5, 7, 16]
}
param_grid = list(ParameterGrid(param_dict))
#param_grid

In [12]:
max_auc = -1
max_ll = -1
max_param = None
for param in param_grid:
    param.update({'task':'binary', 'metric': 'auc'})
    ffm_model = xl.create_ffm()       
    ffm_model.setTrain(train_file)     
    ffm_model.setValidate(valid_file)
    ffm_model.fit(param, model_file)

    ffm_model.setTest(valid_file)  
    ffm_model.setSigmoid()        
    ffm_model.predict(model_file, output_file)

    with open(output_file, 'r') as f:
      y_pred_proba = np.array([float(prediction) for prediction in f.readlines()])

    roc_auc_metric = roc_auc_score(valid['clicks'], y_pred_proba)
    log_loss_metric = log_loss(valid['clicks'], y_pred_proba)
    print(f"{param} ### roc_auc={roc_auc_metric}, log_loss={log_loss_metric}")
    if max_auc < roc_auc_metric:
      max_auc = roc_auc_metric
      max_ll = log_loss_metric
      max_param = param

{'k': 3, 'lambda': 0.0001, 'lr': 0.1, 'task': 'binary', 'metric': 'auc'} ### roc_auc=0.7767756273892892, log_loss=0.10256103334432431
{'k': 3, 'lambda': 0.0001, 'lr': 0.2, 'task': 'binary', 'metric': 'auc'} ### roc_auc=0.780439920697833, log_loss=0.10090312457540034
{'k': 3, 'lambda': 0.001, 'lr': 0.1, 'task': 'binary', 'metric': 'auc'} ### roc_auc=0.7430963112800254, log_loss=0.10771592133084171
{'k': 3, 'lambda': 0.001, 'lr': 0.2, 'task': 'binary', 'metric': 'auc'} ### roc_auc=0.7476108978469318, log_loss=0.10670746055333855
{'k': 5, 'lambda': 0.0001, 'lr': 0.1, 'task': 'binary', 'metric': 'auc'} ### roc_auc=0.7790240891264972, log_loss=0.10229413681426787
{'k': 5, 'lambda': 0.0001, 'lr': 0.2, 'task': 'binary', 'metric': 'auc'} ### roc_auc=0.7850039159774473, log_loss=0.10057416415564484
{'k': 5, 'lambda': 0.001, 'lr': 0.1, 'task': 'binary', 'metric': 'auc'} ### roc_auc=0.7434351373229462, log_loss=0.10768652511814017
{'k': 5, 'lambda': 0.001, 'lr': 0.2, 'task': 'binary', 'metric': '

In [13]:
print(f"{max_param} ### roc_auc={max_auc}, log_loss={max_ll}")

{'k': 16, 'lambda': 0.0001, 'lr': 0.2, 'task': 'binary', 'metric': 'auc'} ### roc_auc=0.7878045334807533, log_loss=0.10050004081505186


In [None]:
pp = {'k': 5, 'lambda': 0.0001, 'lr': 0.1, 'task': 'binary', 'metric': 'auc'}

In [16]:
ffm_model = xl.create_ffm()       
ffm_model.setTrain(train_file)     
ffm_model.setValidate(valid_file)
ffm_model.fit(max_param, model_file)

ffm_model.setTest(test_file)  
ffm_model.setSigmoid()        
ffm_model.predict(model_file, output_file)

with open(output_file, 'r') as f:
  y_pred_proba = np.array([float(prediction) for prediction in f.readlines()])

roc_auc_metric = roc_auc_score(test['clicks'], y_pred_proba)
log_loss_metric = log_loss(test['clicks'], y_pred_proba)
print(f"{param} \n roc_auc={roc_auc_metric}, log_loss={log_loss_metric}")

{'k': 16, 'lambda': 0.001, 'lr': 0.2, 'task': 'binary', 'metric': 'auc'} 
 roc_auc=0.7789462529724076, log_loss=0.13695412614481403


# результат выше оказался лучше, чем бейзлайн и чем первая домашка

In [15]:
y_pred_proba = np.full(y_pred_proba.shape, np.mean(train['clicks']))
roc_auc_metric = roc_auc_score(test['clicks'], y_pred_proba)
log_loss_metric = log_loss(test['clicks'], y_pred_proba)
print(f"{param} \n roc_auc={roc_auc_metric}, log_loss={log_loss_metric}")

{'k': 16, 'lambda': 0.001, 'lr': 0.2, 'task': 'binary', 'metric': 'auc'} 
 roc_auc=0.5, log_loss=0.15665385637598783
