# Ноутбук для LightGBM (обучение + предсказания для инференса)

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

from pathlib import Path

from lightgbm import LGBMRanker

from rectools.metrics import calc_metrics, NDCG, MAP, Precision, Recall, MeanInvUserFreq
from rectools import Columns

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from typing import Any, Dict, Tuple

import warnings
warnings.simplefilter('ignore')

## Подготовка данных

In [2]:
users = pd.read_csv('../data/users.csv')
items = pd.read_csv('../data/items.csv')
interactions = pd.read_csv('../data/interactions.csv')

In [3]:
interactions.rename(
    columns={
        'last_watch_dt': Columns.Datetime,
        'watched_pct': Columns.Weight,
    }, 
    inplace=True,
) 

# Меняем тип данных
interactions['datetime'] = interactions['datetime'].astype(np.datetime64)

# Заполняем пропуски
interactions_default_values: Dict[str, Any] = {
   Columns.Datetime: interactions[Columns.Datetime].median(),
    Columns.Weight: 0.,
    'total_dur': 0,
}
interactions.fillna(interactions_default_values, inplace=True)

In [4]:
def encode_cat_cols(df: pd.DataFrame, cat_cols) -> Tuple[pd.DataFrame, Dict]:
    cat_col_encoding = {}

    for col in cat_cols:
        cat_col = df[col].astype('category').cat
        cat_col_encoding[col] = cat_col.categories
        df[col] = cat_col.codes.astype('category')
    return df, cat_col_encoding

users_cat_cols = [
     'age', 'income', 'sex', 'kids_flg'
]
users, users_cat_col_encoding = encode_cat_cols(users, users_cat_cols)

In [5]:
items_cat_cols = [
    'content_type', 'for_kids', 'studios',
]
items_text_cols = [
    'title', 'title_orig', 'genres', 'countries', 'directors', 'actors', 'description', 'keywords',
]
items_num_cols = [
    'release_year', 'age_rating', 
]
default_values_items = {
    'release_year': items['release_year'].median(),
    'age_rating': items['age_rating'].median(),
}

items, items_cat_col_encoding = encode_cat_cols(items, items_cat_cols) 
items.fillna(default_values_items, inplace=True)

In [6]:
max_date = interactions[Columns.Datetime].max()
min_date = interactions[Columns.Datetime].min()

print(f'min дата в interactions: {min_date}')
print(f'max дата в interactions: {max_date}')
print(f'Продолжительность: {max_date - min_date}')

min дата в interactions: 2021-03-13 00:00:00
max дата в interactions: 2021-08-22 00:00:00
Продолжительность: 162 days 00:00:00


## Поделим датасет по времени

In [7]:
ranker_days_count = 30

ranker_data = interactions[
    (interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=ranker_days_count))
]

In [8]:
base_models_data = interactions[
    (interactions[Columns.Datetime] < max_date - pd.Timedelta(days=ranker_days_count))
]

## Формируем датасет

In [9]:
base_models_data['user_hist'] = (
    base_models_data.groupby('user_id')
    ['item_id'].transform('count')
)

base_models_data['item_pop'] = (
    base_models_data.groupby('item_id')
    ['user_id'].transform('count')
)

base_models_data['user_avg_pop'] = (
    base_models_data.groupby('user_id')
    ['item_pop'].transform('mean')
)

base_models_data['item_avg_hist'] = (
    base_models_data.groupby('item_id')
    ['user_hist'].transform('mean')
)

base_models_data.sort_values(
    by=[Columns.User, Columns.Datetime], 
    ascending=[True, False], 
    ignore_index=True,
    inplace=True,
)

base_models_data['user_last_pop'] = (
    base_models_data.groupby('user_id')
    ['item_pop'].transform('first')
)

In [10]:
# Определяем холодность пользователя

base_models_data['user_cold'] = (
    base_models_data.groupby('item_id')
    ['user_hist'].transform('sum') < 100
).astype(int)

In [11]:
base_models_data.head()

Unnamed: 0,user_id,item_id,datetime,total_dur,weight,user_hist,item_pop,user_avg_pop,item_avg_hist,user_last_pop,user_cold
0,0,6006,2021-07-20,1,0.0,6,5208,41885.0,16.891897,5208,0
1,0,7102,2021-07-19,169,3.0,6,11626,41885.0,20.349475,5208,0
2,0,14359,2021-07-19,130,2.0,6,6053,41885.0,22.546836,5208,0
3,0,15297,2021-07-19,459,0.0,6,137128,41885.0,7.364295,5208,0
4,0,9728,2021-07-19,4,0.0,6,76978,41885.0,11.165736,5208,0


In [12]:
items = pd.merge(
    left=items, 
    right=(
        base_models_data
        [['item_id', 'item_pop', 'item_avg_hist']]
        .drop_duplicates()
    ),
    how='left',
    on='item_id',
)

users = pd.merge(
    left=users, 
    right=(
        base_models_data
        [['user_id', 'user_hist', 'user_avg_pop', 'user_last_pop', 'user_cold']]
        .drop_duplicates()
    ),
    how='left',
    on='user_id',
)
users.head(3)

Unnamed: 0,user_id,age,income,sex,kids_flg,user_hist,user_avg_pop,user_last_pop,user_cold
0,973171,1,4,1,1,5.0,19550.8,93403.0,0.0
1,962099,0,2,1,0,13.0,1329.307692,260.0,0.0
2,1047345,3,3,0,0,,,,


In [13]:
default_values_items['item_pop'] = base_models_data['item_pop'].median()
default_values_items['item_avg_hist'] = base_models_data['item_avg_hist'].median()

default_values_users = {
    'user_hist': 0,
    'user_avg_pop': base_models_data['user_avg_pop'].median(),
    'user_last_pop': base_models_data['user_last_pop'].median(),
}

### Джойним кандидатов и юзер/айтем фичи

кандидаты от моделей первого уровня генерятся в другом ноутбуке

In [14]:
candidates = pd.read_csv('../data/candidates.csv')
candidates.head(3)

Unnamed: 0,user_id,item_id,lfm_score,lfm_rank,popular_score,popular_rank
0,176549,9728.0,1.215133,1.0,76978.0,4.0
1,176549,10440.0,1.139943,2.0,141889.0,1.0
2,176549,13865.0,1.022879,3.0,93403.0,3.0


In [16]:
def users_filter(
    user_list: np.ndarray,
    candidates_df: pd.DataFrame, 
    df: pd.DataFrame,
) -> pd.DataFrame:
    df = pd.merge(
        left=df[df['user_id'].isin(user_list)], 
        right=candidates_df[candidates_df['user_id'].isin(user_list)], 
        how='right',
        on=['user_id', 'item_id']
    )

    min_score: float =  df['lfm_score'].min() - 0.01
    max_rank: int = df['lfm_rank'].max() + 1  # 101
    
    default_values = {
        'lfm_score': min_score, 'lfm_rank': max_rank,
        **interactions_default_values,
    }
    df.fillna(default_values, inplace=True)
    df.sort_values(
        by=['user_id', 'item_id'],
        inplace=True,
    )
    return df

In [17]:
ranker_df = users_filter(candidates['user_id'].unique(), candidates, ranker_data)

In [18]:
# Добавляем фичи
def add_features(df: pd.DataFrame) -> pd.DataFrame:
    
    df = pd.merge(
        df, 
        users, 
        how='left', 
        on=['user_id']
    )

    df = pd.merge(
        df, 
        items, 
        how='left', 
        on=['item_id']
    )

    df.fillna(default_values_items, inplace=True)
    df.fillna(default_values_users, inplace=True)

    for col in df.columns:
        if isinstance(df[col].dtype, pd.CategoricalDtype):
            if -1 not in df[col].cat.categories:
                df[col] = df[col].cat.add_categories(-1)
            df.fillna({col: -1}, inplace=True)
    return df

In [19]:
ranker_df = add_features(ranker_df)

In [20]:
def add_target(df: pd.DataFrame) -> pd.DataFrame:
    """
    0 - доля досмотра < 0.15
    1 - 0.15 <= доля досмотра < 0.75
    2 - 0.75 <= доля досмотра
    """
    df['target_ranker'] = (df[Columns.Weight] >= 15).astype(int) 
    df['target_ranker'] += (df[Columns.Weight] >= 75).astype(int)
    return df

ranker_df = add_target(ranker_df)

In [21]:
train_users, test_users = train_test_split(
    ranker_df['user_id'].unique(), random_state=41, test_size=0.2
)
ranker_train = ranker_df[ranker_df['user_id'].isin(train_users)]
ranker_val = ranker_df[ranker_df['user_id'].isin(test_users)]

def get_group_lgbm(df: pd.DataFrame) -> np.ndarray:
    return np.array(
        df[['user_id', 'item_id']]
        .groupby(by=['user_id']).count()
        ['item_id']
    )

In [22]:
cols = ['lfm_score', 'lfm_rank',
        'popular_score', 'popular_rank',  
        'age', 'income', 'sex', 'kids_flg', 
        'user_hist', 'user_avg_pop', 'user_last_pop',
        'content_type', 'release_year', 'for_kids', 
        'age_rating', 'studios', 'item_pop', 'item_avg_hist',
]

params = {
    'objective': 'lambdarank',
    'n_estimators': 1000,
    'max_depth': 10,
    'num_leaves': 10,
    'min_child_samples': 100,
    'learning_rate': 0.25,
    'reg_lambda': 1,
    'colsample_bytree': 0.9,
    'random_state': 42,
    'early_stopping_rounds': 100,
}

In [23]:
X_train = np.array(ranker_train[cols])
y_train = np.array(ranker_train['target_ranker'])
X_val = np.array(ranker_val[cols])
y_val = np.array(ranker_val['target_ranker'])

In [24]:
early_stopping_rounds = 32
fit_params = {
    'X': X_train,
    'y': y_train,
    'group': get_group_lgbm(ranker_train),
    'eval_set': [(X_val, y_val)],
    'eval_group': [get_group_lgbm(ranker_val)],
    'eval_metric': 'ndcg',
    'eval_at': (3, 5, 10),
}
LGBMRanker_model = LGBMRanker(**params)

In [25]:
%%time
LGBMRanker_model.fit(**fit_params, )

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.909332 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1662
[LightGBM] [Info] Number of data points in the train set: 38872127, number of used features: 18
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[205]	valid_0's ndcg@3: 0.913069	valid_0's ndcg@5: 0.919413	valid_0's ndcg@10: 0.925852
CPU times: user 24min 53s, sys: 2.65 s, total: 24min 56s
Wall time: 3min 19s


In [26]:
LGBMRanker_model.best_score_['valid_0']

OrderedDict([('ndcg@3', 0.9130688177294668),
             ('ndcg@5', 0.919412841981759),
             ('ndcg@10', 0.9258518540270024)])

In [28]:
def calc_metrics_(candidates_df, rank_col: str) -> Dict[str, float]:
    
    metrics = {
        'ndcg@10': NDCG(k = 10),
        'map@10': MAP(k = 10),
        'Precision@10': Precision(k = 10),
        'recall@10': Recall(k = 10),
        'novelty@10': MeanInvUserFreq(k = 10),
    }

    return calc_metrics(
        metrics=metrics,
        reco=(
            candidates_df
            .rename(columns={rank_col: Columns.Rank})
            [[Columns.User, Columns.Item, Columns.Rank]]
            [candidates_df[Columns.User].isin(test_users)]
        ),
        interactions=(
            ranker_data
            [[Columns.User, Columns.Item, Columns.Datetime, Columns.Weight]]
            [ranker_data[Columns.User].isin(test_users)]
        ), 
        prev_interactions=(
            base_models_data
            [[Columns.User, Columns.Item, Columns.Datetime, Columns.Weight]]
            [base_models_data[Columns.User].isin(test_users)]
        ),
        catalog=items['item_id'].unique()
    )

In [29]:
def add_score_and_rank(df: pd.DataFrame, y_pred_scores: np.ndarray, name: str) -> pd.DataFrame:
    df[f'{name}_score'] = y_pred_scores
    
    df.sort_values(
        by=['user_id', f'{name}_score'],
        ascending=[True, False],
        inplace=True,
    )
    
    df[f'{name}_rank'] = df.groupby('user_id').cumcount() + 1
    return df

In [31]:
%%time
y_pred = LGBMRanker_model.predict(ranker_df[cols])
ranker_df = add_score_and_rank(ranker_df, y_pred, 'LGBMRanker')

CPU times: user 4min 44s, sys: 15.5 s, total: 4min 59s
Wall time: 1min 21s


In [32]:
calc_metrics_(ranker_val, 'lfm_rank')

{'Precision@10': 0.049571784490812835,
 'recall@10': 0.16561972709456318,
 'ndcg@10': 0.056188324309528415,
 'map@10': 0.06950526020846529,
 'novelty@10': 4.502567471410776}

In [33]:
calc_metrics_(ranker_val, 'popular_rank')

{'Precision@10': 0.053843558600643625,
 'recall@10': 0.180522691021291,
 'ndcg@10': 0.0627867984346407,
 'map@10': 0.07912127927413679,
 'novelty@10': 3.476069828281411}

In [34]:
calc_metrics_(ranker_val, 'LGBMRanker_rank')

{'Precision@10': 0.061585175957645594,
 'recall@10': 0.20586530287469296,
 'ndcg@10': 0.07327564601596924,
 'map@10': 0.09424722607577088,
 'novelty@10': 3.917867331942583}

По метрикам lgbm лучше, чем популярное и lightfm

In [44]:
ranker_df

Unnamed: 0,user_id,item_id,datetime,total_dur,weight,lfm_score,lfm_rank,popular_score,popular_rank,age,income,sex,kids_flg,user_hist,user_avg_pop,user_last_pop,user_cold,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords,item_pop,item_avg_hist,target_ranker,LGBMRanker_score,LGBMRanker_rank
38,0,9728.0,2021-07-01,0.0,0.0,3.602555,4.0,76978.0,4.0,5,2,0,0,6.0,41885.0,5208.0,0.0,0,Гнев человеческий,Wrath of Man,2021.0,"боевики, триллеры","Великобритания, США",-1,18.0,-1,Гай Ричи,"Джейсон Стэйтем, Холт МакКэллани, Джеффри Доно...",Грузовики лос-анджелесской инкассаторской комп...,"ограбление, криминальный авторитет, месть, пер...",76978.0,11.165736,0,1.968856,1
41,0,10440.0,2021-07-01,0.0,0.0,4.009995,1.0,141889.0,1.0,5,2,0,0,6.0,41885.0,5208.0,0.0,1,Хрустальный,Khrustal'nyy,2021.0,"триллеры, детективы",Россия,-1,18.0,-1,Душан Глигоров,"Антон Васильев, Николай Шрайбер, Екатерина Оль...",Сергей Смирнов — один из лучших «охотников на ...,"хруст, хрусталь, хруста, хрус, полицейский, пе...",141889.0,8.068716,0,1.730989,2
62,0,15297.0,2021-07-01,0.0,0.0,3.601663,5.0,137128.0,2.0,5,2,0,0,6.0,41885.0,5208.0,0.0,1,Клиника счастья,Klinika schast'ya,2021.0,"драмы, мелодрамы",Россия,-1,18.0,-1,Александр Кириенко,"Дарья Мороз, Анатолий Белый, Данил Акутин, Мар...","Успешный сексолог Алена уверена, что нашла фор...","Клиника счастья, Клиника, Счастье, Клиника сча...",137128.0,7.364295,0,1.581636,3
17,0,4151.0,2021-07-01,0.0,0.0,3.567085,7.0,69641.0,5.0,5,2,0,0,6.0,41885.0,5208.0,0.0,1,Секреты семейной жизни,,2021.0,комедии,Россия,-1,18.0,-1,Шота Гамисония,"Петр Скворцов, Алена Михайлова, Федор Лавров, ...",У Никиты и Полины всё начиналось прекрасно: об...,"брызги крови, кровь, жестокое обращение с живо...",69641.0,10.807886,0,1.563301,4
53,0,13865.0,2021-07-01,0.0,0.0,3.583035,6.0,93403.0,3.0,5,2,0,0,6.0,41885.0,5208.0,0.0,0,Девятаев,V2. Escape from Hell,2021.0,"драмы, военные, приключения",Россия,-1,12.0,-1,Тимур Бекмамбетов,"Павел Прилучный, Павел Чинарёв, Тимофей Трибун...",Военно-исторический блокбастер от режиссёров Т...,"Девятаев, Девятаева, Девят, Девя, Девята, Девя...",93403.0,10.408520,0,1.107333,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48586826,1097556,10464.0,2021-07-01,0.0,0.0,-7.197162,51.0,9489.0,37.0,0,2,0,0,1.0,1620.0,1620.0,0.0,0,Вирус страха,Before the fire,2020.0,"драмы, триллеры",США,-1,16.0,-1,Чарли Бюхлер,"Дженна Линг, Джексон Дэвис, Райан Виджилант, Ч...",Из-за начавшейся пандемии известная актриса вы...,"выживальщик, нелегальная милиция, 2020, соедин...",9489.0,21.203499,0,-3.146101,64
48586837,1097556,12981.0,2021-07-01,0.0,0.0,-7.197162,51.0,9306.0,39.0,0,2,0,0,1.0,1620.0,1620.0,0.0,0,Томирис,Tomiris,2020.0,"боевики, драмы, историческое, военные",Казахстан,-1,16.0,-1,Акан Сатаев,"Альмира Турсын, Адиль Ахметов, Берик Айтжанов,...","Среди всех древних народов, населяющих террито...","2020, казахстан, томирис",9306.0,22.071782,0,-3.146101,65
48586833,1097556,12463.0,2021-07-01,0.0,0.0,-7.197162,51.0,7447.0,49.0,0,2,0,0,1.0,1620.0,1620.0,0.0,0,Студентка по вызову,Mes chères études,2010.0,"драмы, мелодрамы",Франция,-1,18.0,-1,Эмманюэль Берко,"Дебора Франсуа, Ален Коши, Матье Деми, Бенжаме...",Лаура — 19-летняя первокурсница французского у...,"Франция, гостиница, по роману или книге, гости...",7447.0,19.088492,0,-3.331159,66
48586785,1097556,849.0,2021-07-01,0.0,0.0,-7.197162,51.0,13304.0,27.0,0,2,0,0,1.0,1620.0,1620.0,0.0,0,Дебошир,The Brawler,2018.0,"историческое, биография, криминал, драмы, спор...",США,-1,18.0,-1,Кен Кушнер,"Зэк Макгоун, Эми Смарт, Джо Пантольяно, Тэрин ...",Легендарный боксёр Чак Вепнер (на экране его и...,"2018, соединенные штаты, дебошир",13304.0,16.444678,0,-3.396294,67


In [45]:
ranker_df = ranker_df.sort_values(by='LGBMRanker_rank', ascending=True)

In [46]:
ranker_inference_10 = ranker_df.groupby('user_id').head(10)  # head чтобы было 10 айтемов в выдаче
ranker_recos = ranker_inference_10.groupby('user_id')['item_id'].agg(list).reset_index()
ranker_recos

Unnamed: 0,user_id,item_id
0,0,"[9728.0, 10440.0, 15297.0, 4151.0, 13865.0, 37..."
1,2,"[5411.0, 10761.0, 9728.0, 7626.0, 15297.0, 129..."
2,3,"[10440.0, 15297.0, 4151.0, 9728.0, 13865.0, 16..."
3,4,"[9728.0, 10440.0, 13865.0, 15297.0, 8636.0, 41..."
4,5,"[7248.0, 10440.0, 9728.0, 5651.0, 15297.0, 570..."
...,...,...
720870,1097551,"[9728.0, 10440.0, 15297.0, 13865.0, 3734.0, 41..."
720871,1097553,"[9728.0, 10440.0, 13865.0, 15297.0, 4151.0, 37..."
720872,1097554,"[9728.0, 10440.0, 13865.0, 15297.0, 4151.0, 37..."
720873,1097555,"[10440.0, 13865.0, 9728.0, 15297.0, 3734.0, 41..."


In [70]:
ranker_recos.shape

(720875, 2)

In [52]:
ranker_recos[ranker_recos.user_id == 0].item_id.tolist()[0]

[9728.0,
 10440.0,
 15297.0,
 4151.0,
 13865.0,
 3734.0,
 12192.0,
 142.0,
 4457.0,
 6809.0]

In [53]:
ranker_recos[ranker_recos.user_id == 2].item_id.tolist()[0]

[5411.0,
 10761.0,
 9728.0,
 7626.0,
 15297.0,
 12965.0,
 16166.0,
 10440.0,
 11749.0,
 2722.0]

In [54]:
import pickle
pickle.dump(ranker_recos, open("../data/ranker_recos.pickle", "wb"))