In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('../data/data.csv')
data.head()

drop_columns = ['banner_id0', 'banner_id1', 'rate0', 'rate1', 'g0', 'g1', 'coeff_sum0', 'coeff_sum1']
data = data.drop(columns=drop_columns)

In [3]:
data.describe()

Unnamed: 0,zone_id,banner_id,oaid_hash,campaign_clicks,os_id,country_id,impressions,clicks
count,15821470.0,15821470.0,15821470.0,15821470.0,15821470.0,15821470.0,15821472.0,15821470.0
mean,81.52679,381.6483,4.610505e+18,0.623854,1.840605,4.346986,1.0,0.02668835
std,163.2448,395.9386,2.663858e+18,9.249152,1.530005,4.317701,0.0,0.161171
min,0.0,0.0,1116911000000.0,0.0,0.0,0.0,1.0,0.0
25%,14.0,52.0,2.297977e+18,0.0,1.0,0.0,1.0,0.0
50%,19.0,217.0,4.614236e+18,0.0,2.0,4.0,1.0,0.0
75%,60.0,611.0,6.914243e+18,0.0,3.0,7.0,1.0,0.0
max,3443.0,1632.0,9.223371e+18,829.0,10.0,16.0,1.0,1.0


In [4]:
def analysis(data: pd.DataFrame):
    # По data.describe заметим, что все значения impressions = 1, значит колонка бесполезна
    # Посчитаем в каких временных интервалах лежат наши данные
    print("Dates:", sorted(set(data['date_time'].map(lambda s: s[:10]))))
    # Заметим лишний день в нашей неделе - 1 сентября
    
    # Посмотрим на пропущенные значения
    print("Count of null data: ", data.isnull().sum().sum())
    # Пропущенных данных нет
    
    # Проверим, что все категориальные фичи представленны в достаточном количестве
    print(data['os_id'].value_counts())
    # Заметим, что некоторые os-и не популярны: 8, 7, 9, 10. Объединим их в одну группу
    print(data['country_id'].value_counts())
    # Страны представлены в достаточном количестве
    print(data['zone_id'].value_counts())
    # Среди мест баннера есть непопулярные, закодируем их в одну группу
    print(data['banner_id'].value_counts())
    # Среди банеров есть как очень популярные, по которым много информации, также много тех, что встречаются 1 раз во всем датасете, то есть не несут в себе полезной информации
    val_cnts = data['oaid_hash'].value_counts()
    print(val_cnts)
    print(val_cnts[val_cnts >= 10])
    print(val_cnts[val_cnts < 10])
    # Есть множесто хешей, которые редко встречаются в датасете


analysis(data)

Dates: ['2021-09-01', '2021-09-26', '2021-09-27', '2021-09-28', '2021-09-29', '2021-09-30', '2021-10-01', '2021-10-02']
Count of null data:  0
2     4589979
0     3856798
1     3178693
4     3012541
3      759767
6      310346
5      111946
8        1273
7          99
9          25
10          5
Name: os_id, dtype: int64
0     4956393
5     1910054
6     1582705
7     1525569
1     1346321
3     1337392
15     603301
12     478038
9      439004
4      388801
10     356490
11     212245
8      210211
13     190855
14     138385
16      79166
2       66542
Name: country_id, dtype: int64
17      2280422
14      1638642
12       736352
0        708379
19       693292
         ...   
2186          1
2188          1
2518          1
2962          1
3443          1
Name: zone_id, Length: 3444, dtype: int64
22      613367
361     387563
3       286999
18      262946
21      246378
         ...  
1501         1
1503         1
1504         1
1507         1
1632         1
Name: banner_id, Length: 

In [5]:
from sklearn.preprocessing import OneHotEncoder

def feature_engineering(data: pd.DataFrame) -> pd.DataFrame:
    # Удаляем 1е сентября
    data.drop(data[data['date_time'].map(lambda s: s[:10]) == '2021-09-01'].index, inplace=True, axis=0)
    # Преобразуем дату в datetime для дальнейших манипуляций
    data['datetime'] = pd.to_datetime(data['date_time'])
    # Добавим фичи, говорящие о дне недели (поведение пользователей в рабочие дни могут отличаться от выходных)
    data["day_of_week"] = data["datetime"].dt.dayofweek
    # Добавим фичи, времени суток
    data['hour'] = data["datetime"].dt.hour
    # Удаляем непопулярные значения
    zone_mask = data['zone_id'].isin(data.groupby('zone_id')['date_time'].count().sort_values(key=lambda x: -x).index[250:])
    data.loc[zone_mask, 'zone_id'] = 0
    os_mask = data['os_id'].isin(data.groupby('os_id')['date_time'].count().sort_values(key=lambda x: -x).index[7:])
    data.loc[os_mask, 'os_id'] = 0
    banner_mask = data['banner_id'].isin(data.groupby('banner_id')['date_time'].count().sort_values(key=lambda x: -x).index[600:])
    data.loc[banner_mask, 'banner_id'] = 0
    categorical_cols = ["zone_id", "os_id", "day_of_week", "hour", "country_id", "banner_id", "oaid_hash"]
    return categorical_cols, data

categorical_cols, df = feature_engineering(data)

In [6]:
def train_test_split(data: pd.DataFrame):
    train_condition = data['date_time'].apply(lambda s: s.split()[0]) < '2021-10-01'
    test_condition = data['date_time'].apply(lambda s: s.split()[0]) == '2021-10-01'
    validate_condition = data['date_time'].apply(lambda s: s.split()[0]) == '2021-10-02'

    train_data = data[train_condition]
    test_data = data[test_condition]
    validate_data = data[validate_condition]

    train_data = train_data.drop(['date_time', 'datetime'], axis=1)
    test_data = test_data.drop(['date_time', 'datetime'], axis=1)
    validate_data = validate_data.drop(['date_time', 'datetime'], axis=1)

    return train_data, test_data, validate_data

train_data, test_data, validate_data = train_test_split(df)

In [8]:
train_data, test_data, validate_data

(          zone_id  banner_id            oaid_hash  campaign_clicks  os_id  \
 0               0          0  5664530014561852622                0      0   
 1               1          1  5186611064559013950                0      0   
 2               2          2  2215519569292448030                3      0   
 3               3          3  6262169206735077204                0      1   
 4               4          4  4778985830203613115                0      1   
 ...           ...        ...                  ...              ...    ...   
 15821464       30         28  2600917920049590018                0      3   
 15821465       19        159  8924237622541523222                0      0   
 15821468       12         22   453968700792456599                0      1   
 15821470        0         21  6968514095695555037                0      0   
 15821471       19        635  8754492963501134426                0      0   
 
           country_id  impressions  clicks  day_of_week  hour 

In [21]:
from collections import defaultdict
from sklearn.model_selection import GroupShuffleSplit
import math
target = "clicks"
NUMERICAL_FEATURES = ["campaign_clicks"]
CATEGORICAL_FEATURES = categorical_cols

encoder = {
    "currentcode": len(
        NUMERICAL_FEATURES
    ),
    "catdict": {},  
    "catcodes": {},
}  


# Скопированно из https://github.com/wngaw/blog/blob/master/xlearn_example/src/utils.py

def convert_to_ffm(path, df, type, target, numerics, categories, encoder):
    # Flagging categorical and numerical fields
    #print('convert_to_ffm - START')
    for x in numerics:
        if(x not in encoder['catdict']):
            #print(f'UPDATING CATDICT: numeric field - {x}')
            encoder['catdict'][x] = 0
    for x in categories:
        if(x not in encoder['catdict']):
            #print(f'UPDATING CATDICT: categorical field - {x}')
            encoder['catdict'][x] = 1

    nrows = df.shape[0]
    with open(path + str(type) + "_ffm.txt", "w") as text_file:

        # Looping over rows to convert each row to libffm format
        for n, r in enumerate(range(nrows)):
            datastring = ""
            datarow = df.iloc[r].to_dict()
            datastring += str(int(datarow[target]))  # Set Target Variable here

            # For numerical fields, we are creating a dummy field here
            for i, x in enumerate(encoder['catdict'].keys()):
                if(encoder['catdict'][x] == 0):
                    # Not adding numerical values that are nan
                    if math.isnan(datarow[x]) is not True:
                        datastring = datastring + " "+str(i)+":" + str(i)+":" + str(datarow[x])
                else:

                    # For a new field appearing in a training example
                    if(x not in encoder['catcodes']):
                        #print(f'UPDATING CATCODES: categorical field - {x}')
                        encoder['catcodes'][x] = {}
                        encoder['currentcode'] += 1
                        #print(f'UPDATING CATCODES: categorical value for field {x} - {datarow[x]}')
                        encoder['catcodes'][x][datarow[x]] = encoder['currentcode']  # encoding the feature

                    # For already encoded fields
                    elif(datarow[x] not in encoder['catcodes'][x]):
                        encoder['currentcode'] += 1
                        #print(f'UPDATING CATCODES: categorical value for field {x} - {datarow[x]}')
                        encoder['catcodes'][x][datarow[x]] = encoder['currentcode']  # encoding the feature

                    code = encoder['catcodes'][x][datarow[x]]
                    datastring = datastring + " "+str(i)+":" + str(int(code))+":1"

            datastring += '\n'
            text_file.write(datastring)

    return encoder


encoder = convert_to_ffm(
    "../data/",
    train_data,
    "train",
    target,
    NUMERICAL_FEATURES,
    CATEGORICAL_FEATURES,
    encoder,
)

encoder = convert_to_ffm(
    "../data/",
    test_data,
    "test",
    target,
    NUMERICAL_FEATURES,
    CATEGORICAL_FEATURES,
    encoder,
)


encoder = convert_to_ffm(
    "../data/",
    pd.concat([train_data, test_data]),
    "train_test",
    target,
    NUMERICAL_FEATURES,
    CATEGORICAL_FEATURES,
    encoder,
)

encoder = convert_to_ffm(
    "../data/",
    validate_data,
    "validate",
    target,
    NUMERICAL_FEATURES,
    CATEGORICAL_FEATURES,
    encoder,
)

In [12]:
from sklearn.metrics import log_loss

import xlearn as xl
from tqdm import tqdm

def cv(lrs, ks, train_path, validation_path):
    d = {lr: {k: None for k in ks} for lr in lrs}
    for lr in tqdm(lrs):
        for k in tqdm(ks):
            model = xl.create_ffm()
            model.setTrain(train_path)
            param = {'task':'binary', 'lr':0.01, 'lambda':1e-2, 'metric':'auc', 'epoch':20, 'k':k}
            model.fit(param, model_path='./model.out')
            model.setSigmoid()
            
            model.setTest(validation_path)
            model.predict("./model.out", "./output.txt")
            d[lr][k] = log_loss(test_data['clicks'], pd.read_csv('output.txt', header=None))
    return d

In [13]:
lrs = [0.01, 0.05, 0.1]
ks = [2, 4, 6]
results = cv(lrs, ks, '../data/train_ffm.txt', '../data/test_ffm.txt')

  0%|                                                                                                                                                                                                                                   | 0/3 [00:00<?, ?it/s]
  0%|                                                                                                                                                                                                                                   | 0/3 [00:00<?, ?it/s][A

[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[32m[------------] [0mxLearn uses 12 threads for training task.
[32m[1m[ ACTION     ] Read Problem ...[0m
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (../data/train_ffm.txt.bin) found. Skip converting text to binary.
[32m[------------] [0mNumber of Feature: 5006389
[32m[------------] [0mNumber of Field: 8
[32m[------------] [0mTime cost for reading problem: 10.69 (sec)
[32m[1m[ ACTION     ] Initialize model ...[0m
[32m[------------] [0mModel size: 1.23 GB
[32m[------


 33%|████████████████████████████████████████████████████████████████████████▋                                                                                                                                                 | 1/3 [02:19<04:38, 139.14s/it][A

[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[32m[------------] [0mxLearn uses 12 threads for training task.
[32m[1m[ ACTION     ] Read Problem ...[0m
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (../data/train_ffm.txt.bin) found. Skip converting text to binary.
[32m[------------] [0mNumber of Feature: 5006389
[32m[------------] [0mNumber of Field: 8
[32m[------------] [0mTime cost for reading problem: 11.19 (sec)
[32m[1m[ ACTION     ] Initialize model ...[0m
[32m[------------] [0mModel size: 1.23 GB
[32m[------


 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                        | 2/3 [04:38<02:19, 139.05s/it][A

[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[32m[------------] [0mxLearn uses 12 threads for training task.
[32m[1m[ ACTION     ] Read Problem ...[0m
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (../data/train_ffm.txt.bin) found. Skip converting text to binary.
[32m[------------] [0mNumber of Feature: 5006389
[32m[------------] [0mNumber of Field: 8
[32m[------------] [0mTime cost for reading problem: 10.45 (sec)
[32m[1m[ ACTION     ] Initialize model ...[0m
[32m[------------] [0mModel size: 2.42 GB
[32m[------


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [07:35<00:00, 151.71s/it][A
 33%|████████████████████████████████████████████████████████████████████████▋                                                                                                                                                 | 1/3 [07:35<15:10, 455.13s/it]
  0%|                                                                                                                                                                                                                                   | 0/3 [00:00<?, ?it/s][A

[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[32m[------------] [0mxLearn uses 12 threads for training task.
[32m[1m[ ACTION     ] Read Problem ...[0m
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (../data/train_ffm.txt.bin) found. Skip converting text to binary.
[32m[------------] [0mNumber of Feature: 5006389
[32m[------------] [0mNumber of Field: 8
[32m[------------] [0mTime cost for reading problem: 10.87 (sec)
[32m[1m[ ACTION     ] Initialize model ...[0m
[32m[------------] [0mModel size: 1.23 GB
[32m[------


 33%|████████████████████████████████████████████████████████████████████████▋                                                                                                                                                 | 1/3 [02:16<04:32, 136.22s/it][A

[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[32m[------------] [0mxLearn uses 12 threads for training task.
[32m[1m[ ACTION     ] Read Problem ...[0m
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (../data/train_ffm.txt.bin) found. Skip converting text to binary.
[32m[------------] [0mNumber of Feature: 5006389
[32m[------------] [0mNumber of Field: 8
[32m[------------] [0mTime cost for reading problem: 12.36 (sec)
[32m[1m[ ACTION     ] Initialize model ...[0m
[32m[------------] [0mModel size: 1.23 GB
[32m[------


 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                        | 2/3 [04:30<02:14, 134.81s/it][A

[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[32m[------------] [0mxLearn uses 12 threads for training task.
[32m[1m[ ACTION     ] Read Problem ...[0m
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (../data/train_ffm.txt.bin) found. Skip converting text to binary.
[32m[------------] [0mNumber of Feature: 5006389
[32m[------------] [0mNumber of Field: 8
[32m[------------] [0mTime cost for reading problem: 10.39 (sec)
[32m[1m[ ACTION     ] Initialize model ...[0m
[32m[------------] [0mModel size: 2.42 GB
[32m[------


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [07:15<00:00, 145.12s/it][A
 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                        | 2/3 [14:50<07:23, 443.51s/it]
  0%|                                                                                                                                                                                                                                   | 0/3 [00:00<?, ?it/s][A

[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[32m[------------] [0mxLearn uses 12 threads for training task.
[32m[1m[ ACTION     ] Read Problem ...[0m
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (../data/train_ffm.txt.bin) found. Skip converting text to binary.
[32m[------------] [0mNumber of Feature: 5006389
[32m[------------] [0mNumber of Field: 8
[32m[------------] [0mTime cost for reading problem: 10.68 (sec)
[32m[1m[ ACTION     ] Initialize model ...[0m
[32m[------------] [0mModel size: 1.23 GB
[32m[------


 33%|████████████████████████████████████████████████████████████████████████▋                                                                                                                                                 | 1/3 [02:14<04:29, 134.96s/it][A

[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[32m[------------] [0mxLearn uses 12 threads for training task.
[32m[1m[ ACTION     ] Read Problem ...[0m
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (../data/train_ffm.txt.bin) found. Skip converting text to binary.
[32m[------------] [0mNumber of Feature: 5006389
[32m[------------] [0mNumber of Field: 8
[32m[------------] [0mTime cost for reading problem: 10.53 (sec)
[32m[1m[ ACTION     ] Initialize model ...[0m
[32m[------------] [0mModel size: 1.23 GB
[32m[------


 67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                        | 2/3 [04:24<02:12, 132.05s/it][A

[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[32m[------------] [0mxLearn uses 12 threads for training task.
[32m[1m[ ACTION     ] Read Problem ...[0m
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (../data/train_ffm.txt.bin) found. Skip converting text to binary.
[32m[------------] [0mNumber of Feature: 5006389
[32m[------------] [0mNumber of Field: 8
[32m[------------] [0mTime cost for reading problem: 11.72 (sec)
[32m[1m[ ACTION     ] Initialize model ...[0m
[32m[------------] [0mModel size: 2.42 GB
[32m[------


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [07:23<00:00, 147.98s/it][A
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [22:14<00:00, 444.82s/it]


In [14]:
tmp = pd.DataFrame(results)
tmp

Unnamed: 0,0.01,0.05,0.10
2,0.166067,0.166041,0.166062
4,0.166102,0.166055,0.166064
6,0.165875,0.165869,0.165886


Наименьший логлосс при k равном 6 и lr равном 0.05. Обучим с этими параметрами модель

In [10]:
from sklearn.metrics import log_loss

import xlearn as xl

ffm_model = xl.create_ffm()
ffm_model.setTrain("../data/train_ffm.txt")
ffm_model.setValidate("../data/test_ffm.txt")
param = {'task':'binary', 'lr':0.05, 'lambda':0.01, 'metric':'auc', 'epoch':20, 'k':6}
ffm_model.fit(param, './model.out')

[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[39m[0m[32m[------------] [0mxLearn uses 12 threads for training task.
[32m[1m[ ACTION     ] Read Problem ...[0m
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (../data/train_ffm.txt.bin) found. Skip converting text to binary.
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (../data/test_ffm.txt.bin) found. Skip converting text to binary.
[32m[------------] [0mNumber of Feature: 5661319
[32m[

In [13]:
import numpy as np

ffm_model.setTest("../data/validate_ffm.txt") 
# преобразуем в вероятности положительного класса
ffm_model.setSigmoid()
ffm_model.predict("./model.out", "./test_prediction.txt")

res = np.loadtxt('./test_prediction.txt')
log_loss(validate_data['clicks'],  res)

[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[39m[0m[32m[------------] [0mxLearn uses 12 threads for prediction task.
[32m[1m[ ACTION     ] Load model ...[0m
[32m[------------] [0mLoad model from ./model.out
[32m[------------] [0mLoss function: cross-entropy
[32m[------------] [0mScore function: ffm
[32m[------------] [0mNumber of Feature: 5661319
[32m[------------] [0mNumber of K: 6
[32m[------------] [0mNumber of field: 8
[32m[------------] [0mTime cost for loading model: 1.54 (sec)
[32m[1m[ ACTION     ] Read Problem ...[0m
[32m[------------] [0mFirst check if the text file has been alrea

0.14684235935112344

In [14]:
print('Значение log loss на последнем для предсказания:')
print('0.15303289904918688 по частоте клика')
print('0.13552848196046108 из первого домашнего задания')
print('0.14684235935112344 из текущего домашнего задания')

Значение log loss на последнем для предсказания:
0.15303289904918688 по частоте клика
0.13552848196046108 из первого домашнего задания
0.14684235935112344 из текущего домашнего задания


Логлосс предикшена FFM оказался хуже логлосса предикшена из первого домашнего задания. Возможно, происходит от того, что много хэшей, который встречаются довольно редко