In [1]:
import sys
sys.path.append("../")

In [2]:
import warnings
warnings.simplefilter('ignore')
from pathlib import Path
import dill
import numpy as np
import pandas as pd
import requests
# import shap

from lightfm import LightFM
from lightfm.data import Dataset
from lightgbm import LGBMRanker, LGBMClassifier, Booster
from xgboost import XGBRanker
from catboost import CatBoostRanker, Pool

import rectools
from rectools.metrics import calc_metrics, NDCG, MAP, Precision, Recall, MeanInvUserFreq
from rectools import Columns
from rectools.models import ImplicitALSWrapperModel, PopularModel
from implicit.als import AlternatingLeastSquares

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from typing import Any, Dict, Tuple
from tqdm.auto import tqdm
from zipfile import ZipFile

from rec_sys.tools import generate_lightfm_recs_mapper, avg_user_metric

## Подготовка данных

In [3]:
users = pd.read_csv('../data/users.csv')
items = pd.read_csv('../data/items.csv')
interactions = pd.read_csv('../data/interactions.csv')

In [4]:
# Меняем названия колонок для использования rectools
interactions.rename(
    columns={
        'last_watch_dt': Columns.Datetime,
        'watched_pct': Columns.Weight,
    }, 
    inplace=True,
) 

# Меняем тип данных
interactions['datetime'] = interactions['datetime'].astype(np.datetime64)

# Заполняем пропуски
interactions_default_values: Dict[str, Any] = {
   Columns.Datetime: interactions[Columns.Datetime].median(),
    Columns.Weight: 0.,
    'total_dur': 0,
}
interactions.fillna(interactions_default_values, inplace=True)

In [5]:
def encode_cat_cols(df: pd.DataFrame, cat_cols) -> Tuple[pd.DataFrame, Dict]:
    cat_col_encoding = {}  # словарь с категориями

    for col in cat_cols:
        cat_col = df[col].astype('category').cat
        cat_col_encoding[col] = cat_col.categories
        df[col] = cat_col.codes.astype('category')
    return df, cat_col_encoding

users_cat_cols = [
     'age', 'income', 'sex', 'kids_flg'
]
users, users_cat_col_encoding = encode_cat_cols(users, users_cat_cols)

In [6]:
items_cat_cols = [ 
    'content_type', 'for_kids', 'studios',
]

items_text_cols = [
    'title', 'title_orig', 'genres', 'countries', 'directors', 'actors', 'description', 'keywords',
]

items_num_cols = [
    'release_year', 'age_rating', 
]

default_values_items = {
    'release_year': items['release_year'].median(),
    'age_rating': items['age_rating'].median(),
}

items, items_cat_col_encoding = encode_cat_cols(items, items_cat_cols) 
items.fillna(default_values_items, inplace=True)

## Трейн-вал-тест сплит

In [7]:
max_date = interactions[Columns.Datetime].max()
min_date = interactions[Columns.Datetime].min()

print(f'min дата в interactions: {min_date}')
print(f'max дата в interactions: {max_date}')
print(f'Продолжительность: {max_date - min_date}')

min дата в interactions: 2021-03-13 00:00:00
max дата в interactions: 2021-08-22 00:00:00
Продолжительность: 162 days 00:00:00


In [8]:
ranker_days_count = 30

ranker_data = interactions[
    (interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=ranker_days_count))
]

train_size = 0.7
val_size = 0.15
test_size = 0.15 

train_val_users, test_users = train_test_split(
    ranker_data['user_id'].unique(), random_state=42, test_size=test_size
)

train_users, val_users = train_test_split(
    train_val_users, random_state=42, test_size=val_size / (train_size + val_size)  # 15% от общего размера
)

In [9]:
base_models_data = interactions[
    (interactions[Columns.Datetime] < max_date - pd.Timedelta(days=ranker_days_count))
]

In [10]:
users = users.loc[users[Columns.User].isin(base_models_data[Columns.User])].copy()

user_features_frames = []
for feature in ["sex", "age", "income"]:
    feature_frame = users.reindex(columns=[Columns.User, feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)

In [11]:
items = items.loc[items[Columns.Item].isin(base_models_data[Columns.Item])].copy()

items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"

content_feature = items.reindex(columns=[Columns.Item, "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"

item_features = pd.concat((genre_feature, content_feature))

In [12]:
dataset = rectools.dataset.Dataset.construct(
    interactions_df=base_models_data,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

In [13]:
items = items.drop(items_text_cols, axis=1)

## Обучаем модели первого уровня

### LightFM

возьмём lightfm без юзер и айтем фичей

In [14]:
lightfm_dataset = Dataset()
lightfm_user_ids = base_models_data['user_id'].unique()
lightfm_item_ids = base_models_data['item_id'].unique()
lightfm_dataset.fit(lightfm_user_ids, lightfm_item_ids)

In [15]:
interactions_matrix, weights_matrix = lightfm_dataset.build_interactions(
    zip(*base_models_data[['user_id', 'item_id', Columns.Weight]].values.T)
)
weights_matrix = weights_matrix.tocsr()

In [16]:
lfm_model = LightFM(
    no_components=64, 
    learning_rate=0.1, 

    loss='warp', 
    max_sampled=5, 
    random_state=42,
)

num_epochs = 10

for _ in tqdm(range(num_epochs)):
    lfm_model.fit_partial(weights_matrix)

100%|██████████| 10/10 [00:52<00:00,  5.29s/it]


In [17]:
# save model  
with open(f'../models/lfm_model.dill', 'wb') as f:
    dill.dump(lfm_model, f)

### Popular

для холодных пользователей

In [18]:
pop_model = PopularModel()
pop_model.fit(dataset)

<rectools.models.popular.PopularModel at 0x7fae08507250>

In [19]:
# save model
with open('../models/pop_model.dill', 'wb') as f:
   dill.dump(pop_model, f)

## Генерим кандидатов, которыми дополним датасет ранкера

In [20]:
lightfm_mapping = lightfm_dataset.mapping()
lightfm_mapping = {
    'user_id_to_iid': lightfm_mapping[0],
    'item_id_to_iid': lightfm_mapping[2],
}

lightfm_mapping['user_iid_to_id'] = {v: k for k, v in lightfm_mapping['user_id_to_iid'].items()}
lightfm_mapping['item_iid_to_id'] = {v: k for k, v in lightfm_mapping['item_id_to_iid'].items()}

In [21]:
top_N = 50

user_lfm_index = np.array(list(lightfm_mapping['user_id_to_iid'].values()))
item_lfm_index = np.array(list(lightfm_mapping['item_id_to_iid'].values()))

mapper = generate_lightfm_recs_mapper(
    model=lfm_model, 
    N=top_N,
    item_iids=item_lfm_index, 
    user_id_to_iid=lightfm_mapping['user_id_to_iid'],
    item_iid_to_id=lightfm_mapping['item_iid_to_id'],
    known_item_ids=dict(),
    num_threads=32,
)

In [22]:
# скоры и ранги lightfm
lfm_candidates = pd.DataFrame({'user_id': lightfm_user_ids})
lfm_candidates['item_id'], lfm_candidates['lfm_score'] = zip(*lfm_candidates['user_id'].map(mapper))
lfm_candidates = lfm_candidates.explode(['item_id', 'lfm_score'], ignore_index=True)
lfm_candidates['lfm_rank'] = lfm_candidates.groupby('user_id').cumcount() + 1 

lfm_candidates.head(3)

Unnamed: 0,user_id,item_id,lfm_score,lfm_rank
0,176549,9728,1.180826,1
1,176549,7571,1.132079,2
2,176549,10440,1.090371,3


In [23]:
lfm_candidates.to_csv('../data/lfm_candidates.csv', index=False)

In [24]:
popular_candidates = pop_model.recommend(lightfm_user_ids, 
                                         dataset=dataset, 
                                         k=50, 
                                         filter_viewed=False)

popular_candidates.rename(columns={'score': 'popular_score',
                                   'rank': 'popular_rank',}, 
                      inplace=True,)

popular_candidates.head(3)

Unnamed: 0,user_id,item_id,popular_score,popular_rank
0,176549,10440,141889.0,1
1,176549,15297,137128.0,2
2,176549,13865,93403.0,3


In [25]:
candidates = pd.merge(lfm_candidates, popular_candidates,
                      on=['user_id', 'item_id'],
                      how='outer')
lfm_min_score: float =  candidates['lfm_score'].min() - 0.01
lfm_max_rank: int = candidates['lfm_rank'].max() + 1

pop_min_score: float =  candidates['popular_score'].min() - 0.01
pop_max_rank: int = candidates['popular_rank'].max() + 1

default_values = {
        'lfm_score': lfm_min_score, 'lfm_rank': lfm_max_rank,
        'popular_score': pop_min_score, 'popular_rank': pop_max_rank,
        **interactions_default_values,
    }

candidates.fillna(default_values, inplace=True)

candidates.head()

Unnamed: 0,user_id,item_id,lfm_score,lfm_rank,popular_score,popular_rank
0,176549,9728.0,1.180826,1.0,76978.0,4.0
1,176549,7571.0,1.132079,2.0,20407.0,16.0
2,176549,10440.0,1.090371,3.0,141889.0,1.0
3,176549,11237.0,1.056374,4.0,22769.0,14.0
4,176549,1785.0,0.922613,5.0,7415.99,51.0


In [26]:
candidates.to_csv('../data/candidates.csv', index=False)

In [27]:
def calc_metrics_(candidates_df, rank_col: str) -> Dict[str, float]:
    metrics = {
        'ndcg@10': NDCG(k = 10),
        'map@10': MAP(k = 10),
        'Precision@10': Precision(k = 10),
        'recall@10': Recall(k = 10),
        'novelty@10': MeanInvUserFreq(k = 10),
    }
    return calc_metrics(
        metrics=metrics,
        reco=(
            candidates_df
            .rename(columns={rank_col: Columns.Rank})
            [[Columns.User, Columns.Item, Columns.Rank]]
            [candidates_df[Columns.User].isin(test_users)]
        ),
        interactions=(
            ranker_data
            [[Columns.User, Columns.Item, Columns.Datetime, Columns.Weight]]
            [ranker_data[Columns.User].isin(test_users)]
        ), 
        prev_interactions=(
            base_models_data
            [[Columns.User, Columns.Item, Columns.Datetime, Columns.Weight]]
            [base_models_data[Columns.User].isin(test_users)]
        ),
        catalog=items['item_id'].unique()
    )

In [28]:
models_metrics: Dict[str, Dict[str, float]] = dict()

## Формируем датасет для ранкера

### Генерим фичи для ранкера

In [29]:
# Получаем длину истории юзера 
base_models_data['user_hist'] = (
    base_models_data.groupby('user_id')
    ['item_id'].transform('count')
)
# Получаем популярность контента
base_models_data['item_pop'] = (
    base_models_data.groupby('item_id')
    ['user_id'].transform('count')
)
# Получаем среднюю популярность контента, просматриваемого этим юзером
base_models_data['user_avg_pop'] = (
    base_models_data.groupby('user_id')
    ['item_pop'].transform('mean')
)
# Получаем среднюю длину истории пользователя, которые смотрит этот контент
base_models_data['item_avg_hist'] = (
    base_models_data.groupby('item_id')
    ['user_hist'].transform('mean')
)
# Получаем популярность последнего просмотренного контента
base_models_data.sort_values(
    by=[Columns.User, Columns.Datetime], 
    ascending=[True, False], 
    ignore_index=True,
    inplace=True,
)
base_models_data['user_last_pop'] = (
    base_models_data.groupby('user_id')
    ['item_pop'].transform('first')
)

In [30]:
# Определяем холодность пользователя
base_models_data['user_cold'] = (
    base_models_data.groupby('item_id')
    ['user_hist'].transform('sum') < 100
).astype(int)

In [31]:
base_models_data.head()

Unnamed: 0,user_id,item_id,datetime,total_dur,weight,user_hist,item_pop,user_avg_pop,item_avg_hist,user_last_pop,user_cold
0,0,6006,2021-07-20,1,0.0,6,5208,41885.0,16.891897,5208,0
1,0,7102,2021-07-19,169,3.0,6,11626,41885.0,20.349475,5208,0
2,0,14359,2021-07-19,130,2.0,6,6053,41885.0,22.546836,5208,0
3,0,15297,2021-07-19,459,0.0,6,137128,41885.0,7.364295,5208,0
4,0,9728,2021-07-19,4,0.0,6,76978,41885.0,11.165736,5208,0


In [32]:
items = pd.merge(
    left=items, 
    right=(
        base_models_data
        [['item_id', 'item_pop', 'item_avg_hist']]
        .drop_duplicates()
    ),
    how='left',
    on='item_id',
)

users = pd.merge(
    left=users, 
    right=(
        base_models_data
        [['user_id', 'user_hist', 'user_avg_pop', 'user_last_pop', 'user_cold']]
        .drop_duplicates()
    ),
    how='left',
    on='user_id',
)
users.head(3)

Unnamed: 0,user_id,age,income,sex,kids_flg,user_hist,user_avg_pop,user_last_pop,user_cold
0,973171,1,4,1,1,5,19550.8,93403,0
1,962099,0,2,1,0,13,1329.307692,260,0
2,721985,3,2,0,0,13,6009.461538,446,0


In [33]:
default_values_items['item_pop'] = base_models_data['item_pop'].median()
default_values_items['item_avg_hist'] = base_models_data['item_avg_hist'].median()

default_values_users = {
    'user_hist': 0,
    'user_avg_pop': base_models_data['user_avg_pop'].median(),
    'user_last_pop': base_models_data['user_last_pop'].median(),
}

### Джойним кандидатов и юзер/айтем фичи

In [54]:
def users_filter(
    user_list: np.ndarray,
    candidates_df: pd.DataFrame, 
    df: pd.DataFrame,
) -> pd.DataFrame:
    
    df = pd.merge(
        df[df['user_id'].isin(user_list)], 
        candidates_df[candidates_df['user_id'].isin(user_list)], 
        how='right',
        on=['user_id', 'item_id']
    )

    min_score: float =  df['lfm_score'].min() - 0.01
    max_rank: int = df['lfm_rank'].max() + 1  # 101
    
    default_values = {
        'lfm_score': min_score, 'lfm_rank': max_rank,
        **interactions_default_values,
    }
    df.fillna(default_values, inplace=True)
        
    df.sort_values(
        by=['user_id', 'item_id'],
        inplace=True,
    )
    return df

ranker_train = users_filter(train_users, candidates, ranker_data)
ranker_val = users_filter(val_users, candidates, ranker_data)
ranker_test = users_filter(test_users, candidates, ranker_data)

In [55]:
# Добавляем фичи
def add_features(df: pd.DataFrame) -> pd.DataFrame:
    df = pd.merge(
        df, 
        users, 
        how='left', 
        on=['user_id']
    )
    df = pd.merge(
        df, 
        items, 
        how='left', 
        on=['item_id']
    )

    df.fillna(default_values_items, inplace=True)
    df.fillna(default_values_users, inplace=True)

    for col in df.columns:
        if isinstance(df[col].dtype, pd.CategoricalDtype):
            if -1 not in df[col].cat.categories:
                df[col] = df[col].cat.add_categories(-1)
            df.fillna({col: -1}, inplace=True)
    return df

ranker_train = add_features(ranker_train)
ranker_val = add_features(ranker_val)
ranker_test = add_features(ranker_test)

In [56]:
def filter_interations(df: pd.DataFrame) -> pd.DataFrame:
    return df[df['lfm_rank'] <= 100]

ranker_train = filter_interations(ranker_train)
ranker_val = filter_interations(ranker_val)

## Обучение ранкеров

In [57]:
def filter_group(df: pd.DataFrame) -> pd.DataFrame:
    df.sort_values(
        by=['user_id', 'item_id'],
        inplace=True,
    )
    groups_df = (
        df[['user_id', 'item_id']]
        .groupby(by=['user_id']).count()
        .rename(columns={'item_id': 'group_size'})
    )
    df = pd.merge(
        df, 
        groups_df, 
        how='left', 
        on=['user_id']
    )
    # Удаляем группы, без достаточного числа просмотров/кандидатов
    df = df[df['group_size'] >= 100]

    # Колонка больше не нужна
    df.drop(columns=['group_size'], inplace=True)
    return df

ranker_train = filter_group(ranker_train)
ranker_val = filter_group(ranker_val)
ranker_test = filter_group(ranker_test)

In [58]:
cols = [
    'lfm_score', 'lfm_rank',
    'popular_score', 'popular_rank',
    'age', 'income', 'sex', 'kids_flg', 'user_hist', 'user_avg_pop', 'user_last_pop',
    'content_type', 'release_year', 'for_kids', 'age_rating', 'studios', 'item_pop', 'item_avg_hist',
]

cat_cols = [
    'age', 'income', 'sex', 'kids_flg',
    'content_type', 'for_kids', 'studios',
]

In [59]:
def add_score_and_rank(df: pd.DataFrame, y_pred_scores: np.ndarray, name: str) -> pd.DataFrame:
    df[f'{name}_score'] = y_pred_scores
    
    df.sort_values(
        by=['user_id', f'{name}_score'],
        ascending=[True, False],
        inplace=True,
    )
    df[f'{name}_rank'] = df.groupby('user_id').cumcount() + 1

    mask = (df['lfm_rank'] < 101).to_numpy()
    eps: float = 0.001
    min_score: float = min(y_pred_scores) - eps
    df[f'{name}_hybrid_score'] = df[f'{name}_score'] * mask
    df[f'{name}_hybrid_score'].replace(
        0,
        min_score,
        inplace=True,
    )
    
    df[f'{name}_hybrid_rank'] = df[f'{name}_rank'] * mask
    max_rank: int = 101
    df[f'{name}_hybrid_rank'].replace(
        0,
        max_rank,
        inplace=True,
    )
    return df

In [60]:
# таргет
def add_target(df: pd.DataFrame) -> pd.DataFrame:
    """
    0 - доля досмотра < 0.15
    1 - 0.15 <= доля досмотра < 0.75
    2 - 0.75 <= доля досмотра
    """
    df['target_ranker'] = (df[Columns.Weight] >= 15).astype(int)  # 'watched_pct'
    df['target_ranker'] += (df[Columns.Weight] >= 75).astype(int)
    return df

ranker_train = add_target(ranker_train)
ranker_val = add_target(ranker_val)
ranker_test = add_target(ranker_test)

ranker_train.head(3)

Unnamed: 0,user_id,item_id,datetime,total_dur,weight,lfm_score,lfm_rank,popular_score,popular_rank,age,...,user_cold,content_type,release_year,for_kids,age_rating,studios,genre,item_pop,item_avg_hist,target_ranker
21588,2338,101.0,2021-07-01,0.0,0.0,-7.197162,51.0,9542.0,36.0,1,...,0.0,0,2019.0,-1,18.0,-1,"[историческое, мелодрамы]",9542,17.990673,0
21589,2338,101.0,2021-07-01,0.0,0.0,-7.197162,51.0,9542.0,36.0,1,...,1.0,0,2019.0,-1,18.0,-1,"[историческое, мелодрамы]",9542,17.990673,0
21590,2338,142.0,2021-07-01,0.0,0.0,2.844062,10.0,35862.0,9.0,1,...,0.0,0,2020.0,-1,16.0,-1,"[драмы, триллеры]",35862,15.251464,0


### LGBMRanker

In [61]:
def get_group_lgbm(df: pd.DataFrame) -> np.ndarray:
    return np.array(
        df[['user_id', 'item_id']]
        .groupby(by=['user_id']).count()
        ['item_id']
    )

In [62]:
cols

['lfm_score',
 'lfm_rank',
 'popular_score',
 'popular_rank',
 'age',
 'income',
 'sex',
 'kids_flg',
 'user_hist',
 'user_avg_pop',
 'user_last_pop',
 'content_type',
 'release_year',
 'for_kids',
 'age_rating',
 'studios',
 'item_pop',
 'item_avg_hist']

В другом ноутбуке подобраны параметры получше 

In [63]:
params = {
    'objective': 'lambdarank',
    'n_estimators': 100,
    'max_depth': 4,
    'num_leaves': 10,
    'min_child_samples': 100,
    'learning_rate': 0.25,
    'reg_lambda': 1,
    'colsample_bytree': 0.9,
    'random_state': 42,
}
early_stopping_rounds = 32
fit_params = {
    'X': ranker_train[cols],
    'y': ranker_train['target_ranker'],
    'group': get_group_lgbm(ranker_train),
    'eval_set': [(ranker_val[cols], ranker_val['target_ranker'])],
    'eval_group': [get_group_lgbm(ranker_val)],
    'eval_metric': 'ndcg',
    'eval_at': (3, 5, 10),
    'feature_name': cols,
}
LGBMRanker_model = LGBMRanker(**params)

In [64]:
%%time
LGBMRanker_model.fit(**fit_params)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003323 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1662
[LightGBM] [Info] Number of data points in the train set: 148070, number of used features: 18
CPU times: user 2.57 s, sys: 0 ns, total: 2.57 s
Wall time: 365 ms


In [65]:
y_pred = LGBMRanker_model.predict(ranker_test[cols])
y_pred

array([-1.62052361, -1.62052361, -1.27718097, ..., -0.72059847,
        1.94180365,  1.94180365])

In [66]:
# explainer = shap.Explainer(LGBMRanker_model)
# shap_values = explainer(ranker_test[cols].iloc[:10_000])

# shap.plots.waterfall(shap_values[0], max_display=len(cols))

In [67]:
y_pred: np.ndarray = LGBMRanker_model.predict(ranker_test[cols])
ranker_test = add_score_and_rank(ranker_test, y_pred, 'LGBMRanker')
ranker_test.head(3)

Unnamed: 0,user_id,item_id,datetime,total_dur,weight,lfm_score,lfm_rank,popular_score,popular_rank,age,...,age_rating,studios,genre,item_pop,item_avg_hist,target_ranker,LGBMRanker_score,LGBMRanker_rank,LGBMRanker_hybrid_score,LGBMRanker_hybrid_rank
1519,955,13865.0,2021-07-01,0.0,0.0,2.427866,7.0,93403.0,3.0,1,...,12.0,-1,"[драмы, военные, приключения]",93403,10.40852,0,1.179449,1,1.179449,1
1520,955,13865.0,2021-07-01,0.0,0.0,2.427866,7.0,93403.0,3.0,1,...,12.0,-1,"[драмы, военные, приключения]",93403,10.40852,0,1.179449,2,1.179449,2
1543,955,15297.0,2021-07-01,0.0,0.0,2.428631,6.0,137128.0,2.0,1,...,18.0,-1,"[драмы, мелодрамы]",137128,7.364295,0,0.901836,3,0.901836,3


In [68]:
models_metrics['LGBMRanker'] = calc_metrics_(ranker_test, 'LGBMRanker_rank')

pd.DataFrame(models_metrics)[['LGBMRanker']]

Unnamed: 0,LGBMRanker
Precision@10,0.000218
map@10,0.000197
ndcg@10,0.000239
novelty@10,3.461403
recall@10,0.000373


### XGBRanker

In [None]:
def get_group_xgb(df: pd.DataFrame) -> np.ndarray:
    return np.array(df['user_id'].value_counts())

In [None]:
fit_params = {
    'X': ranker_train[cols].drop(cat_cols, axis=1),
    'y': ranker_train['target_ranker'],
    'group': get_group_xgb(ranker_train),
    'eval_set': [(ranker_val[cols].drop(cat_cols, axis=1), ranker_val['target_ranker'])],
    'eval_group': [get_group_xgb(ranker_val)],
    'eval_metric': 'ndcg',
    'verbose': early_stopping_rounds / 8,
}
params = {
    'objective': 'rank:ndcg',
    'n_estimators': 100,
    'max_depth': 4,
    'num_leaves': 10,
    'min_child_samples': 100,
    'learning_rate': 0.25,
    'reg_lambda': 1,
    'colsample_bytree': 0.9,
    'random_state': 42,
}

In [None]:
XGBRanker_model = XGBRanker(**params)

In [None]:
%%time
XGBRanker_model.fit(**fit_params)

CPU times: user 3min 28s, sys: 16.4 s, total: 3min 45s
Wall time: 8.19 s


XGBRanker(base_score=0.5, booster='gbtree', colsample_bylevel=1,
          colsample_bynode=1, colsample_bytree=0.9, gamma=0, learning_rate=0.25,
          max_delta_step=0, max_depth=4, min_child_samples=100,
          min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
          nthread=None, num_leaves=10, objective='rank:ndcg', random_state=42,
          reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
          subsample=1, verbosity=1)

In [None]:
# save model  
with open(f'../models/xgb_ranker_model.dill', 'wb') as f:
    dill.dump(XGBRanker_model, f)

In [None]:
y_pred: np.ndarray = XGBRanker_model.predict(ranker_test[cols].drop(cat_cols, axis=1))
ranker_test = add_score_and_rank(ranker_test, y_pred, 'XGBRanker')
ranker_test.head(3)

Unnamed: 0,user_id,item_id,datetime,total_dur,weight,lfm_score,lfm_rank,popular_score,popular_rank,age,...,item_avg_hist,target_ranker,LGBMRanker_score,LGBMRanker_rank,LGBMRanker_hybrid_score,LGBMRanker_hybrid_rank,XGBRanker_score,XGBRanker_rank,XGBRanker_hybrid_score,XGBRanker_hybrid_rank
1677,955,13915.0,2021-07-24,5899.0,100.0,-7.405808,52.0,,,1,...,23.285523,2,1.571146,1,1.571146,1,2.930581,1,2.930581,1
1678,955,13915.0,2021-07-24,5899.0,100.0,-7.405808,52.0,,,1,...,23.285523,2,1.571146,2,1.571146,2,2.930581,2,2.930581,2
1701,955,15404.0,2021-07-23,8171.0,100.0,-7.405808,52.0,,,1,...,32.404789,2,1.429141,9,1.429141,9,2.794412,3,2.794412,3


In [None]:
models_metrics['XGBRanker'] = calc_metrics_(ranker_test, 'XGBRanker_rank')
pd.DataFrame(models_metrics)[['XGBRanker']]

Unnamed: 0,XGBRanker,XGBRanker_hybrid
recall@10,0.007959,0.007959
ndcg@10,0.014762,0.014762


### CatBoostRanker

In [None]:
params = {
    'n_estimators': 100,#00,
    'depth': 4,
    'learning_rate': 0.25,
    'reg_lambda': 1, 
    'random_seed': 42,
    'early_stopping_rounds': early_stopping_rounds,
    'custom_metric': 'NDCG:top=10',
    'verbose': 1,
}

fit_params = {
    'X': Pool(data=ranker_train[cols],
              label=ranker_train['target_ranker'],
              group_id=ranker_train['user_id'].values,
              cat_features=cat_cols),          
    'eval_set': Pool(data=ranker_val[cols],
              label=ranker_val['target_ranker'],
              group_id=ranker_val['user_id'].values,
              cat_features=cat_cols),
    'early_stopping_rounds': early_stopping_rounds,
    'plot': 1,
}

In [None]:
ranker_train.shape, ranker_test.shape, ranker_val.shape

((620510, 26), (129190, 34), (136235, 26))

In [None]:
%%time
CatBoostRanker_model = CatBoostRanker(**params)
CatBoostRanker_model.fit(**fit_params)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Groupwise loss function. OneHotMaxSize set to 10
0:	test: 0.6816722	best: 0.6816722 (0)	total: 280ms	remaining: 27.7s
10:	test: 0.8119610	best: 0.8119610 (10)	total: 2.25s	remaining: 18.3s
20:	test: 0.8194730	best: 0.8195347 (18)	total: 4.2s	remaining: 15.8s
30:	test: 0.8211853	best: 0.8213422 (29)	total: 6.34s	remaining: 14.1s
40:	test: 0.8234045	best: 0.8236070 (39)	total: 8.34s	remaining: 12s
50:	test: 0.8247521	best: 0.8247774 (48)	total: 10.4s	remaining: 9.96s
60:	test: 0.8255586	best: 0.8256412 (59)	total: 12.4s	remaining: 7.91s
70:	test: 0.8260252	best: 0.8266486 (66)	total: 14.3s	remaining: 5.84s
80:	test: 0.8268206	best: 0.8272939 (75)	total: 16.3s	remaining: 3.81s
90:	test: 0.8276336	best: 0.8276336 (90)	total: 18.2s	remaining: 1.8s
99:	test: 0.8273946	best: 0.8277294 (92)	total: 20s	remaining: 0us

bestTest = 0.8277294148
bestIteration = 92

Shrink model to first 93 iterations.
CPU times: user 7min 56s, sys: 29 s, total: 8min 25s
Wall time: 20.3 s


<catboost.core.CatBoostRanker at 0x7fe03f1c5f60>

In [None]:
# save model  
with open(f'../models/CatBoostRanker_model.dill', 'wb') as f:
    dill.dump(CatBoostRanker_model, f)

In [None]:
y_pred = CatBoostRanker_model.predict(ranker_test[cols])
ranker_test = add_score_and_rank(ranker_test, y_pred, 'CatBoostRanker')
ranker_test.head(3)

Unnamed: 0,user_id,item_id,datetime,total_dur,weight,lfm_score,lfm_rank,popular_score,popular_rank,age,...,LGBMRanker_hybrid_score,LGBMRanker_hybrid_rank,XGBRanker_score,XGBRanker_rank,XGBRanker_hybrid_score,XGBRanker_hybrid_rank,CatBoostRanker_score,CatBoostRanker_rank,CatBoostRanker_hybrid_score,CatBoostRanker_hybrid_rank
1677,955,13915.0,2021-07-24,5899.0,100.0,-7.405808,52.0,,,1,...,1.571146,1,2.930581,1,2.930581,1,3.88719,1,3.88719,1
1678,955,13915.0,2021-07-24,5899.0,100.0,-7.405808,52.0,,,1,...,1.571146,2,2.930581,2,2.930581,2,3.88719,2,3.88719,2
1599,955,6455.0,2021-08-02,6062.0,100.0,-7.405808,52.0,,,1,...,1.510068,3,2.539743,19,2.539743,19,3.876266,3,3.876266,3


In [None]:
models_metrics['CatBoostRanker'] = calc_metrics_(ranker_test, 'CatBoostRanker_rank')
pd.DataFrame(models_metrics)[['CatBoostRanker']]

Unnamed: 0,CatBoostRanker,CatBoostRanker_hybrid
recall@10,0.007961,0.007961
ndcg@10,0.014768,0.014768


## Итоговые метрики

Time

LGBMRanker     - 7.77 s

XGBRanker      - 27 s

CatBoostRanker - 1min 48s

In [None]:
pd.DataFrame(models_metrics)[['LGBMRanker', 'XGBRanker', 'CatBoostRanker']]

Unnamed: 0,LGBMRanker,XGBRanker,CatBoostRanker
recall@10,0.007974,0.007959,0.007961
ndcg@10,0.014771,0.014762,0.014768


## Вывод:
LGBMRanker работает быстрее и имеет наибольшее знаенчие ndcg@10