# Предсказание вероятности клика на баннер с помощью FFM


Лучше всего показало себя обучение на одном дне. В противном случае модель очень сильно переобучается (это может быть следствием разных причин — например, того, что дни сильно отличаются друг от друга, а данные у нас только за одну неделю).

* Модель logreg с фичами взаимодействий и target encoding
    * best_params = {'params': {'C': 0.5, 'max_iter': 300, 'random_state': 1}}
    * test log_loss with best params =  0.13998072945658324
* Модель FFM (x-learn):
    * best_params = {'epoch': 9, 'k': 10, 'lambda': 0.01, 'lr': 0.01, 'metric': 'auc', 'task': 'binary'}
    * test log_loss with best params =  0.148402
    
В итоге видно, что использование FFM из xLearn не дало улучшения метрики log-loss по сравнению с logreg (0.1484 против 0.1400).


Есть несколько основных причин: 
1) слишком мало данных

2) неоптимальные гиперпараметры

3) нужен более тщательный feature engineering

In [1]:
import pandas as pd
import xlearn as xl
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import log_loss

# Load data

In [8]:
# Only the columns listed here are read from the CSV.
cols_to_use = [
    'date_time',
    'zone_id',
    'banner_id',
    'oaid_hash',
    'campaign_clicks',
    'os_id',
    'country_id',
    'clicks',
]

# date_time is parsed into datetimes at load time so the .dt accessor works later.
data = pd.read_csv(
    '../../data/data.csv',
    usecols=cols_to_use,
    parse_dates=['date_time'],
)
data.head()

Unnamed: 0,date_time,zone_id,banner_id,oaid_hash,campaign_clicks,os_id,country_id,clicks
0,2021-09-27 00:01:30,0,0,5664530014561852622,0,0,0,1
1,2021-09-26 22:54:49,1,1,5186611064559013950,0,0,1,1
2,2021-09-26 23:57:20,2,2,2215519569292448030,3,0,0,1
3,2021-09-27 00:04:30,3,3,6262169206735077204,0,1,1,1
4,2021-09-27 00:06:21,4,4,4778985830203613115,0,1,0,1


In [9]:
data.sort_values('date_time', inplace=True)

In [10]:
data.date_time.dt.date.value_counts()

2021-09-26    3102610
2021-09-29    2420588
2021-09-27    2367303
2021-09-28    2307355
2021-10-02    2128978
2021-09-30    1851189
2021-10-01    1643448
2021-09-01          1
Name: date_time, dtype: int64

# Train on all previous days

## Process data and Feature Engineering

### Feature Engineering

In [11]:
# Drop the lone 2021-09-01 row (outlier: it is the only sample with that date).
# Filtering by date instead of slicing off the first row keeps this cell
# idempotent on re-run and independent of the current row order.
data = data[data['date_time'].dt.normalize() != pd.Timestamp('2021-09-01')]

In [12]:
def get_time_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add an ``hour`` column taken from the ``date_time`` column.

    The frame is modified in place and the same object is returned.
    """
    timestamps = df['date_time']
    df['hour'] = timestamps.dt.hour
    return df


def featurize_pipeline(df: pd.DataFrame) -> pd.DataFrame:
    """Return a featurized copy of ``df``.

    The input frame is copied first, so the features are added to the copy
    and the caller's frame is left untouched.
    """
    return get_time_features(df.copy())



In [13]:
data = featurize_pipeline(data)

In [14]:
data['date'] = data['date_time'].dt.date

In [15]:
data.reset_index(inplace=True, drop=True)

In [16]:
def get_indexes_timebased_validation(df: pd.DataFrame, dates: list):
    """Return the last index labels of the train and validation parts.

    The validation day is the second-to-last entry of ``dates``; every row
    with an earlier ``date`` belongs to train.

    Returns:
        Tuple ``(last_index_train, last_index_val)``: the largest index label
        among the train rows and among the validation-day rows.
    """
    validation_day = dates[-2]
    row_dates = df['date']
    train_labels = df.index[(row_dates < validation_day).to_numpy()]
    val_labels = df.index[(row_dates == validation_day).to_numpy()]
    return train_labels.max(), val_labels.max()


In [11]:
last_index_train, last_index_val = get_indexes_timebased_validation(data, dates=sorted(data.date.unique()))

### Prepare data to model

In [12]:
import math


def _convert_to_ffm(path, df, type, target, numerics, categories, encoder):
    """Write ``df`` as libffm-style text and return the updated encoder.

    Every row becomes one line ``<target> <field>:<feature>:<value> ...`` in
    the file ``path + str(type) + "_ffm.txt"``.

    Args:
        path: Prefix prepended verbatim to the file name (include the
            trailing separator, e.g. ``'data/'``).
        df: Frame holding the target column and every feature column.
        type: Split label used in the file name. It shadows the builtin, but
            the name is kept so existing callers keep working.
        target: Name of the target column; written as ``int``.
        numerics: Numerical columns, written as ``i:i:<value>`` and skipped
            when the value is NaN.
        categories: Categorical columns, written as ``i:<code>:1``.
        encoder: Dict with keys ``currentcode``, ``catdict`` and
            ``catcodes``. It is mutated in place, so codes assigned in one
            call are reused by later calls with the same encoder.

    Returns:
        The same ``encoder`` object that was passed in.
    """
    print('convert_to_ffm - START')

    # Flag fields: 0 = numerical, 1 = categorical (first registration wins).
    field_kinds = encoder['catdict']
    for name in numerics:
        field_kinds.setdefault(name, 0)
    for name in categories:
        field_kinds.setdefault(name, 1)

    # The field index is the position of the column inside ``catdict``.
    fields = [
        (idx, name, kind == 0)
        for idx, (name, kind) in enumerate(field_kinds.items())
    ]
    columns = [target] + [name for _, name, _ in fields]
    catcodes = encoder['catcodes']

    with open(path + str(type) + "_ffm.txt", "w") as text_file:
        # Columns are iterated one by one so each value keeps its own dtype.
        # Building a Series per row (``df.iloc[r]``) is far slower and
        # upcasts the whole row to float64 as soon as one column is float,
        # which makes large integer ids lose precision and collide.
        for label, *values in zip(*(df[name] for name in columns)):
            tokens = [str(int(label))]
            for (idx, name, is_numeric), value in zip(fields, values):
                if is_numeric:
                    # Numerical fields use the field index as feature index.
                    if not math.isnan(value):
                        tokens.append(f"{idx}:{idx}:{value}")
                    continue
                codes = catcodes.setdefault(name, {})
                if value not in codes:
                    # First occurrence of this value: assign the next code.
                    encoder['currentcode'] += 1
                    codes[value] = encoder['currentcode']
                tokens.append(f"{idx}:{int(codes[value])}:1")
            text_file.write(" ".join(tokens) + "\n")

    return encoder

In [13]:
data.columns

Index(['date_time', 'zone_id', 'banner_id', 'oaid_hash', 'campaign_clicks',
       'os_id', 'country_id', 'clicks', 'hour', 'date'],
      dtype='object')

In [14]:
# Column roles used when the frame is converted for the model.
TARGET = ['clicks']
NUMERICAL_FEATURES = ['campaign_clicks']
CATEGORICAL_FEATURES = [
    'zone_id',
    'banner_id',
    'oaid_hash',
    'os_id',
    'country_id',
    'hour',
]
NUM_THREADS = 8

In [15]:
encoder = {"currentcode": len(NUMERICAL_FEATURES),
           "catdict": {},
           "catcodes": {}}

In [16]:
FEATURES = TARGET + NUMERICAL_FEATURES + CATEGORICAL_FEATURES

In [19]:
# The last index label of each part is used as a row position here, so this
# assumes index labels equal row positions (RangeIndex) — TODO confirm.
train_stop, val_stop = last_index_train + 1, last_index_val + 1
train_df = data.iloc[:train_stop][FEATURES]
val_df = data.iloc[train_stop:val_stop][FEATURES]
test_df = data.iloc[val_stop:][FEATURES]

In [20]:
train_df.head()

Unnamed: 0,clicks,campaign_clicks,zone_id,banner_id,oaid_hash,os_id,country_id,hour
0,0,1,41,29,1834033519797437404,3,0,0
1,0,2,1,188,7416450538971744701,2,15,0
2,0,2,17,52,1832228443297591417,2,5,0
3,0,1,47,73,4180077124914749282,4,13,0
4,0,1,48,266,1459689388363839798,0,1,0


In [47]:
# Convert the data to the xlearn text format.
# The same encoder is threaded through train -> val -> test, so a categorical
# value gets the same code in every file.
for split_name, split_df in (('train', train_df), ('val', val_df), ('test', test_df)):
    encoder = _convert_to_ffm(
        'data/',
        split_df,
        split_name,
        TARGET[0],
        NUMERICAL_FEATURES,
        CATEGORICAL_FEATURES,
        encoder,
    )

convert_to_ffm - START
convert_to_ffm - START
convert_to_ffm - START


## Train Model

### fit model

In [37]:
# Baseline FFM fit on all previous days, validated on the validation file.
param = {
    'epoch': 30,
    'k': 2,
    'task': 'binary',
    'lr': 0.1,
    'lambda': 0.5,
}
print(f"current param = {param}")

ffm_model = xl.create_ffm()
ffm_model.setTrain("data/train_ffm.txt")
ffm_model.setValidate("data/val_ffm.txt")
ffm_model.fit(param, "data/model_all_days.xlearn")

print(['*'] * 20)

current param = {'epoch': 30, 'k': 2, 'task': 'binary', 'lr': 0.1, 'lambda': 0.5}
[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[39m[0m[32m[------------] [0mxLearn uses 12 threads for training task.
[32m[1m[ ACTION     ] Read Problem ...[0m
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (data/train_ffm.txt.bin) found. Skip converting text to binary.
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (data/val_ffm.txt.bin) found. Skip convertin

# Train on previous day

## Process data and Feature Engineering

### Feature Engineering

In [17]:
def get_time_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add an ``hour`` column taken from the ``date_time`` column.

    The frame is modified in place and the same object is returned.
    """
    timestamps = df['date_time']
    df['hour'] = timestamps.dt.hour
    return df


def featurize_pipeline(df: pd.DataFrame) -> pd.DataFrame:
    """Return a featurized copy of ``df``.

    The input frame is copied first, so the features are added to the copy
    and the caller's frame is left untouched.
    """
    return get_time_features(df.copy())



In [18]:
data = featurize_pipeline(data)

In [19]:
dates = sorted(data.date.unique())

### Prepare data to model

In [20]:
import math


def _convert_to_ffm(path, df, type, target, numerics, categories, encoder):
    """Write ``df`` as libffm-style text and return the updated encoder.

    Every row becomes one line ``<target> <field>:<feature>:<value> ...`` in
    the file ``path + str(type) + "_ffm.txt"``.

    Args:
        path: Prefix prepended verbatim to the file name (include the
            trailing separator, e.g. ``'data/'``).
        df: Frame holding the target column and every feature column.
        type: Split label used in the file name. It shadows the builtin, but
            the name is kept so existing callers keep working.
        target: Name of the target column; written as ``int``.
        numerics: Numerical columns, written as ``i:i:<value>`` and skipped
            when the value is NaN.
        categories: Categorical columns, written as ``i:<code>:1``.
        encoder: Dict with keys ``currentcode``, ``catdict`` and
            ``catcodes``. It is mutated in place, so codes assigned in one
            call are reused by later calls with the same encoder.

    Returns:
        The same ``encoder`` object that was passed in.
    """
    print('convert_to_ffm - START')

    # Flag fields: 0 = numerical, 1 = categorical (first registration wins).
    field_kinds = encoder['catdict']
    for name in numerics:
        field_kinds.setdefault(name, 0)
    for name in categories:
        field_kinds.setdefault(name, 1)

    # The field index is the position of the column inside ``catdict``.
    fields = [
        (idx, name, kind == 0)
        for idx, (name, kind) in enumerate(field_kinds.items())
    ]
    columns = [target] + [name for _, name, _ in fields]
    catcodes = encoder['catcodes']

    with open(path + str(type) + "_ffm.txt", "w") as text_file:
        # Columns are iterated one by one so each value keeps its own dtype.
        # Building a Series per row (``df.iloc[r]``) is far slower and
        # upcasts the whole row to float64 as soon as one column is float,
        # which makes large integer ids lose precision and collide.
        for label, *values in zip(*(df[name] for name in columns)):
            tokens = [str(int(label))]
            for (idx, name, is_numeric), value in zip(fields, values):
                if is_numeric:
                    # Numerical fields use the field index as feature index.
                    if not math.isnan(value):
                        tokens.append(f"{idx}:{idx}:{value}")
                    continue
                codes = catcodes.setdefault(name, {})
                if value not in codes:
                    # First occurrence of this value: assign the next code.
                    encoder['currentcode'] += 1
                    codes[value] = encoder['currentcode']
                tokens.append(f"{idx}:{int(codes[value])}:1")
            text_file.write(" ".join(tokens) + "\n")

    return encoder

In [21]:
data.columns

Index(['date_time', 'zone_id', 'banner_id', 'oaid_hash', 'campaign_clicks',
       'os_id', 'country_id', 'clicks', 'hour', 'date'],
      dtype='object')

In [22]:
# Column roles used when the frame is converted for the model.
TARGET = ['clicks']
NUMERICAL_FEATURES = ['campaign_clicks']
CATEGORICAL_FEATURES = [
    'zone_id',
    'banner_id',
    'oaid_hash',
    'os_id',
    'country_id',
    'hour',
]
NUM_THREADS = 8

In [23]:
encoder = {"currentcode": len(NUMERICAL_FEATURES),
           "catdict": {},
           "catcodes": {}}

In [24]:
FEATURES = TARGET + NUMERICAL_FEATURES + CATEGORICAL_FEATURES

In [25]:
# Per-day validation with averaged results is too computationally expensive,
# so a single split is used: the last three days are train / validation / test.
train_day, val_day, test_day = dates[-3], dates[-2], dates[-1]
train_df = data[data.date == train_day][FEATURES]
val_df = data[data.date == val_day][FEATURES]
test_df = data[data.date == test_day][FEATURES]

In [26]:
train_df.head()

Unnamed: 0,clicks,campaign_clicks,zone_id,banner_id,oaid_hash,os_id,country_id,hour
10197856,0,0,17,52,1995762422249821197,2,5,0
10197857,0,0,14,24,845498275380046131,4,1,0
10197858,0,1,43,115,1439329130559143244,0,14,0
10197859,0,3,27,2,1248052718144148612,0,0,0
10197860,0,0,11,28,8031738334595892994,2,5,0


In [27]:
# Convert the data to the xlearn text format.
# The same encoder is threaded through train -> val -> test, so a categorical
# value gets the same code in every file.
for split_name, split_df in (
    ('prev_train', train_df),
    ('prev_val', val_df),
    ('prev_test', test_df),
):
    encoder = _convert_to_ffm(
        'data/',
        split_df,
        split_name,
        TARGET[0],
        NUMERICAL_FEATURES,
        CATEGORICAL_FEATURES,
        encoder,
    )

convert_to_ffm - START
convert_to_ffm - START
convert_to_ffm - START


## Train Model

### fit model

In [28]:
# The results look much more reasonable than when training on all days,
# so a hyperparameter search is worth trying.
param = {
    'epoch': 30,
    'k': 2,
    'task': 'binary',
    'lr': 0.1,
    'lambda': 0.5,
}
print(f"current param = {param}")

ffm_model = xl.create_ffm()
ffm_model.setTrain("data/prev_train_ffm.txt")
ffm_model.setValidate("data/prev_val_ffm.txt")
ffm_model.fit(param, "data/model_prev.xlearn")

print(['*'] * 20)

current param = {'epoch': 30, 'k': 2, 'task': 'binary', 'lr': 0.1, 'lambda': 0.5}
[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[39m[0m[32m[------------] [0mxLearn uses 12 threads for training task.
[32m[1m[ ACTION     ] Read Problem ...[0m
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (data/prev_train_ffm.txt.bin) found. Skip converting text to binary.
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (data/prev_val_ffm.txt.bin) found. Skip

In [29]:
# Only a small grid is searched because each fit takes too long.
params = dict(
    [
        ('epoch', [40]),  # early stopping by default
        ('k', [5, 10]),
        ('task', ['binary']),
        ('lr', [0.001, 0.01]),
        ('lambda', [0.05]),
    ]
)

In [30]:
# Fit one FFM model per grid point. Each model is saved under its grid index.
# The loop variable is called run_id so the builtin ``id`` is not rebound at
# notebook scope; the printed text and file names are unchanged.
for run_id, param in enumerate(ParameterGrid(params)):
    print(f"current param = {param} and id={run_id}")
    ffm_model = xl.create_ffm()
    ffm_model.setTrain("data/prev_train_ffm.txt")
    ffm_model.setValidate("data/prev_val_ffm.txt")
    ffm_model.fit(param, f"data/model_prev_{run_id}.xlearn")
    print(['*'] * 20)

current param = {'epoch': 40, 'k': 5, 'lambda': 0.05, 'lr': 0.001, 'task': 'binary'} and id=0
[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[39m[0m[32m[------------] [0mxLearn uses 12 threads for training task.
[32m[1m[ ACTION     ] Read Problem ...[0m
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (data/prev_train_ffm.txt.bin) found. Skip converting text to binary.
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (data/prev_val_ffm.txt.bin)

[32m[------------] [0mModel file: data/model_prev_2.xlearn
[32m[------------] [0mTime cost for saving model: 2.11 (sec)
[32m[1m[ ACTION     ] Finish training[0m
[32m[1m[ ACTION     ] Clear the xLearn environment ...[0m
['*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*'][32m[1m[------------] Total time cost: 15.67 (sec)[0m

current param = {'epoch': 40, 'k': 10, 'lambda': 0.05, 'lr': 0.01, 'task': 'binary'} and id=3
[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[39m[0m[32m[------------] [0mxLearn uses 12 threads for training task.
[32m[1m[ ACTION     

# Refit model with the best params

In [2]:
# Hard-coded parameters for the final fit.
# NOTE(review): epoch=9 is presumably where early stopping ended during the
# search — confirm against the training logs.
best_params = {
    'epoch': 9,
    'k': 10,
    'lambda': 0.05,
    'lr': 0.01,
    'task': 'binary',
}

ffm_model = xl.create_ffm()
# The final model is trained on the validation-day file; no validation set is
# attached for this fit.
ffm_model.setTrain("data/prev_val_ffm.txt")
ffm_model.fit(best_params, 'data/model.xlearn')

[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[32m[------------] [0mxLearn uses 12 threads for training task.
[32m[1m[ ACTION     ] Read Problem ...[0m
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (data/prev_val_ffm.txt.bin) found. Skip converting text to binary.
[32m[------------] [0mNumber of Feature: 1706179
[32m[------------] [0mNumber of Field: 7
[32m[------------] [0mTime cost for reading problem: 1.12 (sec)
[32m[1m[ ACTION     ] Initialize model ...[0m
[32m[------------] [0mModel size: 1.08 GB
[32m[-------

In [3]:
ffm_model.setTest("data/prev_test_ffm.txt")

In [4]:
ffm_model.predict("data/model.xlearn", "predict.txt")

[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[39m[0m[32m[------------] [0mxLearn uses 12 threads for prediction task.
[32m[1m[ ACTION     ] Load model ...[0m
[32m[------------] [0mLoad model from data/model.xlearn
[32m[------------] [0mLoss function: cross-entropy
[32m[------------] [0mScore function: ffm
[32m[------------] [0mNumber of Feature: 1706179
[32m[------------] [0mNumber of K: 10
[32m[------------] [0mNumber of field: 7
[32m[------------] [0mTime cost for loading model: 0.57 (sec)
[32m[1m[ ACTION     ] Read Problem ...[0m
[32m[------------] [0mFirst check if the text file has bee