In [198]:
import pandas as pd
from typing import Dict, Any, Tuple
from pathlib import Path
from rectools import Columns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from rectools.dataset import Dataset as data
import dill
import warnings
from tqdm.auto import tqdm
from lightfm import LightFM
from xgboost import XGBRanker
from rectools.metrics import calc_metrics, NDCG, MAP, Precision, Recall, MeanInvUserFreq
from tools import generate_lightfm_recs_mapper
from rectools.models import PopularModel
from lightfm.data import Dataset
from lightgbm import LGBMRanker
from catboost import CatBoostRanker, Pool
import pickle
warnings.filterwarnings('ignore')

### Подготовка данных

In [99]:
DATA_PATH = Path("/Users/tanchik/Desktop/Настоящее/учеба/RecSys/RecoServiceTemplate/kion_train")
users = pd.read_csv(DATA_PATH / 'users.csv')
items = pd.read_csv(DATA_PATH / 'items.csv')
interactions = pd.read_csv(DATA_PATH / 'interactions.csv')

In [100]:
interactions.rename(columns={'user_id': Columns.User, 'item_id': Columns.Item,
                                'last_watch_dt': Columns.Datetime, 'watched_pct': Columns.Weight}, inplace=True)

In [None]:
interactions[Columns.Datetime] = interactions[Columns.Datetime].astype(np.datetime64)
interactions_default_values: Dict[str, Any] = {
   Columns.Datetime: interactions[Columns.Datetime].median(),
    Columns.Weight: 0.,
    'total_dur': 0,
}
interactions.fillna(interactions_default_values, inplace=True)

interactions.head(10)

In [None]:
interactions.describe(datetime_is_numeric=True)

In [103]:
def encode_cat_cols(df: pd.DataFrame, cat_cols) -> Tuple[pd.DataFrame, Dict]:
    cat_col_encoding = {}
    for col in cat_cols:
        cat_col = df[col].astype('category').cat
        cat_col_encoding[col] = cat_col.categories
        df[col] = cat_col.codes.astype('category')
    return df, cat_col_encoding


### Данные о пользователе

In [None]:
users.isnull().mean()

In [105]:
def fill_na(df, cols):
    for col in cols:
        df[col] = df[col].fillna(df[col].mode().iloc[0])
    return df

In [106]:
cols_with_null = ['age', 'income', 'sex']
users = fill_na(users, cols_with_null)

In [None]:
users.isnull().mean()

In [None]:
print(users.shape)
users.head(3)

In [109]:
user_cat_feats = ["age", "income", "sex", "kids_flg"]
users_ohe_df = users.user_id
for feat in user_cat_feats:
    ohe_feat_df = pd.get_dummies(users[feat], prefix=feat)
    users_ohe_df = pd.concat([users_ohe_df, ohe_feat_df], axis=1)

users_ohe_df.head()

Unnamed: 0,user_id,age_age_18_24,age_age_25_34,age_age_35_44,age_age_45_54,age_age_55_64,age_age_65_inf,income_income_0_20,income_income_150_inf,income_income_20_40,income_income_40_60,income_income_60_90,income_income_90_150,sex_Ж,sex_М,kids_flg_0,kids_flg_1
0,973171,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1
1,962099,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0
2,1047345,0,0,0,1,0,0,0,0,0,1,0,0,1,0,1,0
3,721985,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0
4,704055,0,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0


In [110]:
users_cat_cols = [
     'age', 'income', 'sex', 'kids_flg'
]
users, users_cat_col_encoding = encode_cat_cols(users, users_cat_cols)
users_cat_col_encoding

{'age': Index(['age_18_24', 'age_25_34', 'age_35_44', 'age_45_54', 'age_55_64',
        'age_65_inf'],
       dtype='object'),
 'income': Index(['income_0_20', 'income_150_inf', 'income_20_40', 'income_40_60',
        'income_60_90', 'income_90_150'],
       dtype='object'),
 'sex': Index(['Ж', 'М'], dtype='object'),
 'kids_flg': Int64Index([0, 1], dtype='int64')}

In [None]:
users

In [112]:
users.shape

(840197, 5)

### Обработка данных для items

In [113]:
print(items.shape)
items.head(3)

(15963, 14)


Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео..."
2,10716,film,Тактическая сила,Tactical Force,2011.0,"криминал, зарубежные, триллеры, боевики, комедии",Канада,,16.0,,Адам П. Калтраро,"Адриан Холмс, Даррен Шалави, Джерри Вассерман,...",Профессиональный рестлер Стив Остин («Все или ...,"Тактическая, сила, 2011, Канада, бандиты, ганг..."


In [114]:
items.isnull().mean()

item_id         0.000000
content_type    0.000000
title           0.000000
title_orig      0.297250
release_year    0.006139
genres          0.000000
countries       0.002318
for_kids        0.964543
age_rating      0.000125
studios         0.933283
directors       0.094531
actors          0.164067
description     0.000125
keywords        0.026499
dtype: float64

In [115]:
item_cols_with_null = ['release_year', 'age_rating']
items = fill_na(items, item_cols_with_null)
items = items.drop(columns=['title_orig', 'for_kids', 'studios', 'keywords', 'actors'])
items = items.dropna(axis=0)
print(items.shape)

(14451, 9)


In [116]:
items = items.reset_index(drop=True)

In [117]:
def categorize_popularity(popularity):
    if popularity >= 0.75:
        return 'High Popularity'
    elif popularity >= 0.4:
        return 'Medium Popularity'
    else:
        return 'Low Popularity'

items['directors'] = items['directors'].str.split(', ')
items_df_exploded = items.explode('directors')
interactions_directors = interactions.merge(items_df_exploded, on='item_id')
director_popularity = interactions_directors.groupby('directors')['weight'].mean().reset_index()
director_popularity['director_category'] = director_popularity['weight'].apply(categorize_popularity)
items_df_exploded = items_df_exploded.merge(director_popularity, on='directors', how='left')
grouped = items_df_exploded.groupby(items_df_exploded.index)
items['directors_cat'] = grouped['director_category'].agg(lambda x: ', '.join(x.dropna().unique()))

In [None]:
item_cat_feats = ['age_rating', 'directors_cat']

items_ohe_df = items.item_id

for feat in item_cat_feats:
    ohe_feat_df = pd.get_dummies(items[feat], prefix=feat)
    items_ohe_df = pd.concat([items_ohe_df, ohe_feat_df], axis=1)


min_year = items['release_year'].min()
max_year = items['release_year'].max()
items['release_year'] = (items['release_year'] - min_year) / (max_year - min_year)


items_ohe_df.head()

In [None]:
items['countries'] = items.countries.str.split(',')
mlb = MultiLabelBinarizer()
one_hot_encoded = pd.DataFrame(mlb.fit_transform(items['countries']), columns=mlb.classes_)
items_ohe_df = pd.concat([items_ohe_df, one_hot_encoded], axis=1)
items_ohe_df.head()

In [120]:
items.explode('genres')['genres'].nunique()

2596

In [121]:
items['genres'] = items['genres'].apply(lambda x: ', '.join(x))

In [122]:
pd.DataFrame({'genres': list(items['genres'].unique())}).to_csv('genres.csv')

In [123]:
def assign_region(x):
    regions = {
        'Северная Америка': ['США', 'Канада', 'Мексика'],
        'Южная Америка': ['Аргентина', 'Бразилия', 'Чили', 'Перу', 'Колумбия', 'Уругвай'],
        'Европа': ['Испания', 'Франция', 'Германия', 'Италия', 'Великобритания', 'Швеция', 'Норвегия', 'Дания', 'Финляндия', 'Польша', 'Чехия', 'Венгрия', 'Румыния', 'Бельгия', 'Нидерланды', 'Люксембург', 'Австрия', 'Швейцария', 'Португалия', 'Греция', 'Болгария', 'Ирландия'],
        'СНГ': ['Россия', 'Украина', 'Беларусь', 'Казахстан', 'Узбекистан', 'Армения', 'Киргизия'],
        'Азия': ['Китай', 'Япония', 'Южная Корея', 'Индия', 'Таиланд', 'Малайзия'],
        'Ближний Восток и Северная Африка': ['Иран', 'Израиль', 'Египет', 'Турция'],
        'Субсахарская Африка': ['ЮАР', 'Нигерия', 'Кения'],
        'Австралия и Океания': ['Австралия', 'Новая Зеландия'],
        'Латинская Америка': ['Мексика', 'Куба', 'Пуэрто-Рико']
    }
    for region, country_list in regions.items():
        for elem in x:
            if elem.strip() in country_list:
                return region

    return 'Другой'

items['countries'] = items['countries'].apply(lambda x: assign_region(x))

In [None]:
items_cat_cols = ['content_type', 'age_rating', 'countries', 'directors_cat']
items_text_cols = ['title', 'description', 'genres', 'directors']
items_num_cols = ['release_year']
default_values_items = {
    'release_year': items['release_year'].median(),
    'age_rating': items['age_rating'].mode(),
}


items, items_cat_col_encoding = encode_cat_cols(items, items_cat_cols)
items = items.drop(items_text_cols, axis=1)
items.fillna(default_values_items, inplace=True)

items

###  Трэйн-вал-тест сплит

In [125]:
max_date = interactions[Columns.Datetime].max()
min_date = interactions[Columns.Datetime].min()

print(f'min дата в interactions: {min_date}')
print(f'max дата в interactions: {max_date}')
print(f'Продолжительность: {max_date - min_date}')


min дата в interactions: 2021-03-13 00:00:00
max дата в interactions: 2021-08-22 00:00:00
Продолжительность: 162 days 00:00:00


In [126]:
ranker_days_count = 30

ranker_data = interactions[
    (interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=ranker_days_count))
]

train_size = 0.7
val_size = 0.15
test_size = 0.15


train_val_users, test_users = train_test_split(
    ranker_data['user_id'].unique(), random_state=42, test_size=test_size
)

train_users, val_users = train_test_split(
    train_val_users, random_state=42, test_size=val_size / (train_size + val_size)
)


In [127]:
base_models_data = interactions[
    (interactions[Columns.Datetime] < max_date - pd.Timedelta(days=ranker_days_count))
]

### Обучим модель первого уровня

In [128]:
users_list = list(base_models_data[Columns.User].unique())
users_features = users_ohe_df[users_ohe_df['user_id'].isin(users_list)].copy()

In [129]:
items_list = list(base_models_data[Columns.Item].unique())
items_features = items_ohe_df[items_ohe_df['item_id'].isin(items_list)].copy()

In [130]:
items_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13650 entries, 0 to 14450
Columns: 165 entries, item_id to Япония
dtypes: int64(155), uint8(10)
memory usage: 16.4 MB


In [131]:
lightfm_dataset = Dataset()
lightfm_user_ids = base_models_data['user_id'].unique()
lightfm_item_ids = base_models_data['item_id'].unique()

In [132]:
lightfm_dataset.fit(
    users=lightfm_user_ids,
    items=lightfm_item_ids,
    user_features=[str(f) for f in users_features.columns[1:]],
    item_features=[str(f) for f in items_features.columns[1:]]
)

interactions_matrix, weights_matrix = lightfm_dataset.build_interactions(
    zip(*base_models_data[['user_id', 'item_id', 'weight']].values.T)
)

user_features_matrix = lightfm_dataset.build_user_features(
    (row[0], dict(zip(users_features.columns[1:], row[1:]))) for row in users_features.itertuples(index=False)
)
item_features_matrix = lightfm_dataset.build_item_features(
    (row[0], dict(zip(items_features.columns[1:], row[1:]))) for row in items_features.itertuples(index=False)
)

In [133]:
lfm_model = LightFM(
    no_components=64,
    learning_rate=0.1,
    loss='warp',
    max_sampled=5,
    random_state=104,
)

num_epochs = 20
for _ in tqdm(range(num_epochs)):
    lfm_model.fit_partial(
        interactions=weights_matrix,
        user_features=user_features_matrix,
        item_features=item_features_matrix,
        epochs=1
    )


  0%|          | 0/20 [00:00<?, ?it/s]

In [134]:
with open(f'/Users/tanchik/Desktop/Настоящее/учеба/RecSys/RecoServiceTemplate/service/api/recsys/models/lfm_model.dill', 'wb') as f:
    dill.dump(lfm_model, f)


### Дополню популярными

In [135]:
dataset = data.construct(
    interactions_df=base_models_data,
)

In [136]:
pop_model = PopularModel()
pop_model.fit(dataset)

<rectools.models.popular.PopularModel at 0x7fb77276c2b0>

### Генерим кандидатов для датасета ранкера


In [137]:
def calc_metrics_(candidates_df, rank_col: str) -> Dict[str, float]:
    metrics = {
        'ndcg@10': NDCG(k = 10),
        'map@10': MAP(k = 10),
        'Precision@10': Precision(k = 10),
        'recall@10': Recall(k = 10),
        'novelty@10': MeanInvUserFreq(k = 10),
    }
    return calc_metrics(
        metrics=metrics,
        reco=(
            candidates_df
            .rename(columns={rank_col: Columns.Rank})
            [[Columns.User, Columns.Item, Columns.Rank]]
            [candidates_df[Columns.User].isin(test_users)]
        ),
        interactions=(
            ranker_data
            [[Columns.User, Columns.Item, Columns.Datetime, Columns.Weight]]
            [ranker_data[Columns.User].isin(test_users)]
        ),
        prev_interactions=(
            base_models_data
            [[Columns.User, Columns.Item, Columns.Datetime, Columns.Weight]]
            [base_models_data[Columns.User].isin(test_users)]
        ),
        catalog=items['item_id'].unique()
    )

In [138]:
lightfm_mapping = lightfm_dataset.mapping()
lightfm_mapping = {
    'user_id_to_iid': lightfm_mapping[0],
    'item_id_to_iid': lightfm_mapping[2],
}
lightfm_mapping['user_iid_to_id'] = {v: k for k, v in lightfm_mapping['user_id_to_iid'].items()}
lightfm_mapping['item_iid_to_id'] = {v: k for k, v in lightfm_mapping['item_id_to_iid'].items()}


In [139]:
top_N = 100

user_lfm_index = np.array(list(lightfm_mapping['user_id_to_iid'].values()))
item_lfm_index = np.array(list(lightfm_mapping['item_id_to_iid'].values()))

mapper = generate_lightfm_recs_mapper(
    model=lfm_model,
    N=top_N,
    item_iids=item_lfm_index,
    user_id_to_iid=lightfm_mapping['user_id_to_iid'],
    item_iid_to_id=lightfm_mapping['item_iid_to_id'],
    known_item_ids=dict(),
    num_threads=4,
    user_features = user_features_matrix,
    item_features = item_features_matrix
)


In [None]:
candidates = pd.DataFrame({'user_id': lightfm_user_ids})
candidates['item_id'], candidates['lfm_score'] = zip(*candidates['user_id'].map(mapper))
candidates = candidates.explode(['item_id', 'lfm_score'], ignore_index=True)
candidates['lfm_rank'] = candidates.groupby('user_id').cumcount() + 1

candidates.head(3)

In [141]:
candidates.to_csv('/Users/tanchik/Desktop/Настоящее/учеба/RecSys/RecoServiceTemplate/kion_train/lfm.csv', index=False)

In [142]:
candidates.shape

(72087500, 4)

In [143]:
pop_candidates = pop_model.recommend(lightfm_user_ids,
                                         dataset=dataset,
                                         k=100,
                                         filter_viewed=False)

pop_candidates.rename(columns={'score': 'pop_score',
                                   'rank': 'pop_rank',},
                      inplace=True,)

pop_candidates

Unnamed: 0,user_id,item_id,pop_score,pop_rank
0,176549,10440,141889.0,1
1,176549,15297,137128.0,2
2,176549,13865,93403.0,3
3,176549,9728,76978.0,4
4,176549,4151,69641.0,5
...,...,...,...,...
72087495,805174,14526,4402.0,96
72087496,805174,7582,4348.0,97
72087497,805174,10436,4327.0,98
72087498,805174,5803,4318.0,99


In [144]:
pop_candidates.to_csv('/Users/tanchik/Desktop/Настоящее/учеба/RecSys/RecoServiceTemplate/kion_train/pop.csv', index=False)

In [145]:
candidates = pd.merge(candidates, pop_candidates,
                      on=['user_id', 'item_id'],
                      how='outer')

In [146]:
candidates.head()

Unnamed: 0,user_id,item_id,lfm_score,lfm_rank,pop_score,pop_rank
0,176549,9728.0,-400.707123,1.0,76978.0,4.0
1,176549,11310.0,-400.916046,2.0,7990.0,44.0
2,176549,849.0,-400.941833,3.0,13304.0,27.0
3,176549,13018.0,-400.945129,4.0,11850.0,28.0
4,176549,16361.0,-401.025757,5.0,7401.0,52.0


In [147]:
def normalize_column(column):
    max_value = column.max()
    min_value = column.min()
    normalized_column = (column - min_value) / (max_value - min_value)
    return normalized_column

In [148]:
normalize_columns = ['lfm_score', 'lfm_rank', 'pop_score', 'pop_rank']
for column in normalize_columns:
    candidates[column] = normalize_column(candidates[column])

In [149]:
candidates.head(3)

Unnamed: 0,user_id,item_id,lfm_score,lfm_rank,pop_score,pop_rank
0,176549,9728.0,0.085822,0.0,0.528212,0.030303
1,176549,11310.0,0.085381,0.010101,0.026791,0.434343
2,176549,849.0,0.085327,0.020202,0.065414,0.262626


In [None]:
candidates.isnull().mean()

In [151]:
values_fill_nan = {
    'lfm_score': candidates['lfm_score'].min() - 1e-5,
    'lfm_rank': candidates['lfm_rank'].max() + 1e-5,
    'pop_score': candidates['pop_score'].min() - 1e-5,
    'pop_rank': candidates['pop_rank'].max() + 1e-5,
}

candidates = candidates.fillna(values_fill_nan )

In [None]:
candidates.isnull().mean()

In [153]:
candidates.to_csv('/Users/tanchik/Desktop/Настоящее/учеба/RecSys/RecoServiceTemplate/kion_train/candidates.csv', index=False)

In [154]:
candidates = pd.read_csv('/Users/tanchik/Desktop/Настоящее/учеба/RecSys/RecoServiceTemplate/kion_train/candidates.csv')

In [155]:
candidates.head(2)

Unnamed: 0,user_id,item_id,lfm_score,lfm_rank,pop_score,pop_rank
0,176549,9728.0,0.085822,0.0,0.528212,0.030303
1,176549,11310.0,0.085381,0.010101,0.026791,0.434343


### Метрики для моделей первого уровня

In [156]:
lfm = pd.read_csv('/Users/tanchik/Desktop/Настоящее/учеба/RecSys/RecoServiceTemplate/kion_train/lfm.csv')
pop = pd.read_csv('/Users/tanchik/Desktop/Настоящее/учеба/RecSys/RecoServiceTemplate/kion_train/pop.csv')

In [157]:
models_metrics: Dict[str, Dict[str, float]] = dict()
models_metrics['lfm'] = calc_metrics_(lfm, 'lfm_rank')
models_metrics['lfm']

{'Precision@10': 0.022376559790032693,
 'recall@10': 0.07446109567159644,
 'ndcg@10': 0.026532352781983133,
 'map@10': 0.03348267891059996,
 'novelty@10': 3.594834322035065}

In [158]:
models_metrics: Dict[str, Dict[str, float]] = dict()
models_metrics['pop'] = calc_metrics_(pop, 'pop_rank')
models_metrics['pop']

{'Precision@10': 0.02376406305158627,
 'recall@10': 0.08024655639757222,
 'ndcg@10': 0.027791460074538817,
 'map@10': 0.034996094471142325,
 'novelty@10': 2.8580652138942435}

### Формирование датасета для ранкера

### Генерация фичей

In [None]:
# Получаем длину истории юзера
base_models_data['user_hist'] = (
    base_models_data.groupby('user_id')
    ['item_id'].transform('count')
)
# Получаем популярность контента
base_models_data['item_pop'] = (
    base_models_data.groupby('item_id')
    ['user_id'].transform('count')
)
# Получаем среднюю популярность контента, просматриваемого этим юзером
base_models_data['user_avg_pop'] = (
    base_models_data.groupby('user_id')
    ['item_pop'].transform('mean')
)
# Получаем среднюю длину истории пользователя, которые смотрит этот контент
base_models_data['item_avg_hist'] = (
    base_models_data.groupby('item_id')
    ['user_hist'].transform('mean')
)
# Получаем популярность последнего просмотренного контента
base_models_data.sort_values(
    by=[Columns.User, Columns.Datetime],
    ascending=[True, False],
    ignore_index=True,
    inplace=True,
)
base_models_data['user_last_pop'] = (
    base_models_data.groupby('user_id')
    ['item_pop'].transform('first')
)
base_models_data.head(3)


In [None]:
# Добавляем новые фичи в соответствующие таблички
items = pd.merge(
    left=items,
    right=(
        base_models_data
        [['item_id', 'item_pop', 'item_avg_hist']]
        .drop_duplicates()
    ),
    how='left',
    on='item_id',
)

users = pd.merge(
    left=users,
    right=(
        base_models_data
        [['user_id', 'user_hist', 'user_avg_pop', 'user_last_pop']]
        .drop_duplicates()
    ),
    how='left',
    on='user_id',
)
users.head(3)

In [None]:
items.head(3)

In [163]:
default_values_items['item_pop'] = base_models_data['item_pop'].median()
default_values_items['item_avg_hist'] = base_models_data['item_avg_hist'].median()

default_values_users = {
    'user_hist': 0,
    'user_avg_pop': base_models_data['user_avg_pop'].median(),
    'user_last_pop': base_models_data['user_last_pop'].median(),
}

### Джойним кандидатов

In [None]:
candidates

In [None]:
def users_filter(
    user_list: np.ndarray,
    candidates_df: pd.DataFrame,
    df: pd.DataFrame,
) -> pd.DataFrame:
    df = pd.merge(
        df[df['user_id'].isin(user_list)],
        candidates_df[candidates_df['user_id'].isin(user_list)],
        how='right',
        on=['user_id', 'item_id']
    )
    min_score: float =  df['lfm_score'].min() - 1e-5
    max_rank: int = df['lfm_rank'].max() + 1e-5

    default_values = {
        'lfm_score': min_score, 'lfm_rank': max_rank,
        **interactions_default_values,
    }
    df.fillna(default_values, inplace=True)

    df.sort_values(
        by=['user_id', 'item_id'],
        inplace=True,
    )
    return df

ranker_train = users_filter(train_users, candidates, ranker_data)
ranker_val = users_filter(val_users, candidates, ranker_data)
ranker_test = users_filter(test_users, candidates, ranker_data)

ranker_train.head(3)

In [165]:
default_values_items['age_rating'] = items['age_rating'].mode().iloc[0]

In [None]:
def add_features(df: pd.DataFrame) -> pd.DataFrame:
    df = pd.merge(
        df,
        users,
        how='left',
        on=['user_id']
    )
    df = pd.merge(
        df,
        items,
        how='left',
        on=['item_id']
    )
    df.fillna(default_values_items, inplace=True)
    df.fillna(default_values_users, inplace=True)

    for col in df.columns:
        if pd.api.types.is_categorical_dtype(df[col]):
            if -1 not in df[col].cat.categories:
                df[col].cat.add_categories(-1, inplace=True)
            df[col].fillna(-1, inplace=True)

    return df

ranker_train = add_features(ranker_train)
ranker_val = add_features(ranker_val)
ranker_test = add_features(ranker_test)

ranker_train.head(3)

### Обучаем ранкер

In [167]:
def filter_group(df: pd.DataFrame) -> pd.DataFrame:
    df.sort_values(
        by=['user_id', 'item_id'],
        inplace=True,
    )
    groups_df = (
        df[['user_id', 'item_id']]
        .groupby(by=['user_id']).count()
        .rename(columns={'item_id': 'group_size'})
    )
    df = pd.merge(
        df,
        groups_df,
        how='left',
        on=['user_id']
    )
    df = df[df['group_size'] >= 100]

    df.drop(columns=['group_size'], inplace=True)
    return df

ranker_train_copy = filter_group(ranker_train)
ranker_val_copy = filter_group(ranker_val)
ranker_test_copy = filter_group(ranker_test)

In [168]:
cols = [
    'lfm_score', 'lfm_rank',
    'pop_score', 'pop_rank',
    'age', 'income', 'sex', 'kids_flg', 'user_hist', 'user_avg_pop', 'user_last_pop',
    'content_type', 'release_year', 'countries', 'age_rating', 'directors_cat', 'item_pop', 'item_avg_hist',
]

cat_cols = [
    'age', 'income', 'sex', 'kids_flg',
    'content_type', 'countries', 'age_rating', 'directors_cat'
]

In [None]:
def add_target(df: pd.DataFrame) -> pd.DataFrame:
    """
    0 - доля досмотра < 0.15
    1 - 0.15 <= доля досмотра < 0.75
    2 - 0.75 <= доля досмотра
    """
    df['target_ranker'] = (df[Columns.Weight] >= 15).astype(int)
    df['target_ranker'] += (df[Columns.Weight] >= 75).astype(int)
    return df

ranker_train = add_target(ranker_train)
ranker_val = add_target(ranker_val)
ranker_test = add_target(ranker_test)

ranker_train.head(3)

In [170]:
def add_score_and_rank(df: pd.DataFrame, y_pred_scores: np.ndarray, name: str) -> pd.DataFrame:
    df[f'{name}_score'] = y_pred_scores
    df.sort_values(
        by=['user_id', f'{name}_score'],
        ascending=[True, False],
        inplace=True,
    )
    df[f'{name}_rank'] = df.groupby('user_id').cumcount() + 1

    mask = (df['lfm_rank'] < 101).to_numpy()
    eps: float = 0.001
    min_score: float = min(y_pred_scores) - eps
    df[f'{name}_hybrid_score'] = df[f'{name}_score'] * mask
    df[f'{name}_hybrid_score'].replace(
        0,
        min_score,
        inplace=True,
    )
    df[f'{name}_hybrid_rank'] = df[f'{name}_rank'] * mask
    max_rank: int = 101
    df[f'{name}_hybrid_rank'].replace(
        0,
        max_rank,
        inplace=True,
    )
    return df

In [171]:
columns_for_norm = ['user_avg_pop', 'user_last_pop', 'countries', 'age_rating', 'item_pop', 'item_avg_hist']
for column in columns_for_norm:
    if pd.api.types.is_categorical_dtype(ranker_train[column]):
        ranker_train[column] = ranker_train[column].cat.codes
    if pd.api.types.is_categorical_dtype(ranker_val[column]):
        ranker_val[column] = ranker_val[column].cat.codes
    if pd.api.types.is_categorical_dtype(ranker_test[column]):
        ranker_test[column] = ranker_test[column].cat.codes
    ranker_train[column] = normalize_column(ranker_train[column])
    ranker_val[column] = normalize_column(ranker_val[column])
    ranker_test[column] = normalize_column(ranker_test[column])

### XGBRanker

In [53]:
def get_group_xgb(df):
    return np.array(df['user_id'].value_counts())

In [62]:
params = {
    'learning_rate': 0.25,
    'max_depth': 4,
    'min_child_samples': 100,
    'n_estimators': 100,
    'num_leaves': 10,
    'objective': 'rank:ndcg',
    'random_state': 42,
    'reg_lambda': 1,
    'colsample_bytree': 0.9,
    'enable_categorical': True
}

fit_params = {
    'verbose': 4,
    'eval_metric': 'ndcg',
    'group': get_group_xgb(ranker_train),
    'X': ranker_train[cols],
    'y': ranker_train['target_ranker'],
    'eval_set': [(ranker_val[cols], ranker_val['target_ranker'])],
    'eval_group': [get_group_xgb(ranker_val)],
}
XGBRanker_model = XGBRanker(**params)

In [63]:
%%time
XGBRanker_model.fit(**fit_params)

[0]	validation_0-ndcg:0.64075
[4]	validation_0-ndcg:0.65002
[8]	validation_0-ndcg:0.65124
[12]	validation_0-ndcg:0.65284
[16]	validation_0-ndcg:0.65429
[20]	validation_0-ndcg:0.65476
[24]	validation_0-ndcg:0.65619
[28]	validation_0-ndcg:0.65618
[32]	validation_0-ndcg:0.65709
[36]	validation_0-ndcg:0.65838
[40]	validation_0-ndcg:0.65825
[44]	validation_0-ndcg:0.65879
[48]	validation_0-ndcg:0.65929
[52]	validation_0-ndcg:0.65977
[56]	validation_0-ndcg:0.66048
[60]	validation_0-ndcg:0.66085
[64]	validation_0-ndcg:0.66124
[68]	validation_0-ndcg:0.66167
[72]	validation_0-ndcg:0.66173
[76]	validation_0-ndcg:0.66200
[80]	validation_0-ndcg:0.66239
[84]	validation_0-ndcg:0.66262
[88]	validation_0-ndcg:0.66298
[92]	validation_0-ndcg:0.66314
[96]	validation_0-ndcg:0.66342
[99]	validation_0-ndcg:0.66357
CPU times: user 35min 45s, sys: 2min 52s, total: 38min 38s
Wall time: 4min 3s


In [64]:
y_pred = XGBRanker_model.predict(ranker_test[cols])

In [None]:
ranker_test = add_score_and_rank(ranker_test, y_pred, 'XGBRanker')
ranker_test.head(3)

In [69]:
models_metrics['XGBRanker'] = calc_metrics_(ranker_test, 'XGBRanker_rank')
pd.DataFrame(models_metrics)[['XGBRanker']]

Unnamed: 0,XGBRanker
Precision@10,0.027652
recall@10,0.094043
ndcg@10,0.033191
map@10,0.043666
novelty@10,3.43839


### LGBMRanker

In [172]:
def get_group_lgbm(df: pd.DataFrame) -> np.ndarray:
    return np.array(
        df[['user_id', 'item_id']]
        .groupby(by=['user_id']).count()
        ['item_id']
    )

In [173]:
params = {
    'objective': 'lambdarank',
    'n_estimators': 100,
    'max_depth': 4,
    'num_leaves': 10,
    'min_child_samples': 100,
    'learning_rate': 0.25,
    'reg_lambda': 1,
    'colsample_bytree': 0.9,
    'random_state': 42,
}
early_stopping_rounds = 32
fit_params = {
    'X': ranker_train[cols],
    'y': ranker_train['target_ranker'],
    'group': get_group_lgbm(ranker_train),
    'eval_set': [(ranker_val[cols], ranker_val['target_ranker'])],
    'eval_group': [get_group_lgbm(ranker_val)],
    'eval_metric': 'ndcg',
    'eval_at': (3, 5, 10),
    'feature_name': cols,
}
LGBMRanker_model = LGBMRanker(**params)

In [174]:
%%time
LGBMRanker_model.fit(**fit_params)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.402720 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1836
[LightGBM] [Info] Number of data points in the train set: 19263522, number of used features: 18
CPU times: user 8min 13s, sys: 21.1 s, total: 8min 34s
Wall time: 1min 44s


In [175]:
y_pred = LGBMRanker_model.predict(ranker_test[cols])

In [None]:
ranker_test = add_score_and_rank(ranker_test, y_pred, 'LGBMRanker')
ranker_test.head(3)


In [177]:
models_metrics['LGBMRanker'] = calc_metrics_(ranker_test, 'LGBMRanker_rank')
pd.DataFrame(models_metrics)[['LGBMRanker']]

Unnamed: 0,LGBMRanker
Precision@10,0.028525
recall@10,0.095505
ndcg@10,0.034336
map@10,0.045088
novelty@10,3.429369


### CatBoostRanker

In [82]:
cat_cols = [
    'age', 'income', 'sex', 'kids_flg',
    'content_type', 'directors_cat'
]

In [83]:
params = {
    'learning_rate': 0.25,
    'n_estimators': 50,
    'depth': 4,
    'reg_lambda': 1,
    'verbose': 1,
    'random_seed': 42,
    'custom_metric': 'NDCG:top=10',
    'early_stopping_rounds': early_stopping_rounds,
}

fit_params = {
    'X': Pool(data=ranker_train[cols],
              label=ranker_train['target_ranker'],
              group_id=ranker_train['user_id'].values,
              cat_features=cat_cols),
    'plot': 1,
    'early_stopping_rounds': early_stopping_rounds,
    'eval_set': Pool(data=ranker_val[cols],
                     label=ranker_val['target_ranker'],
                     group_id=ranker_val['user_id'].values,
                     cat_features=cat_cols),
}
CatBoostRanker_model = CatBoostRanker(**params)

In [84]:
%%time
CatBoostRanker_model.fit(**fit_params)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Groupwise loss function. OneHotMaxSize set to 10
0:	test: 0.6351296	best: 0.6351296 (0)	total: 13.9s	remaining: 11m 19s
1:	test: 0.6697635	best: 0.6697635 (1)	total: 23.5s	remaining: 9m 24s
2:	test: 0.6796049	best: 0.6796049 (2)	total: 32.7s	remaining: 8m 33s
3:	test: 0.6898546	best: 0.6898546 (3)	total: 44.8s	remaining: 8m 35s
4:	test: 0.6902335	best: 0.6902335 (4)	total: 53s	remaining: 7m 57s
5:	test: 0.6907766	best: 0.6907766 (5)	total: 1m 2s	remaining: 7m 34s
6:	test: 0.6913762	best: 0.6913762 (6)	total: 1m 14s	remaining: 7m 38s
7:	test: 0.6919620	best: 0.6919620 (7)	total: 1m 24s	remaining: 7m 23s
8:	test: 0.6920047	best: 0.6920047 (8)	total: 1m 35s	remaining: 7m 13s
9:	test: 0.6930784	best: 0.6930784 (9)	total: 1m 45s	remaining: 7m 1s
10:	test: 0.6932288	best: 0.6932288 (10)	total: 1m 54s	remaining: 6m 45s
11:	test: 0.6933261	best: 0.6933261 (11)	total: 2m 3s	remaining: 6m 31s
12:	test: 0.7034360	best: 0.7034360 (12)	total: 2m 13s	remaining: 6m 20s
13:	test: 0.7035218	best: 0.703

<catboost.core.CatBoostRanker at 0x7f7ceedbbdc0>

In [85]:
y_pred = CatBoostRanker_model.predict(ranker_test[cols])

In [None]:
ranker_test = add_score_and_rank(ranker_test, y_pred, 'CatBoostRanker')
ranker_test.head(3)


In [87]:
models_metrics['CatBoostRanker'] = calc_metrics_(ranker_test, 'CatBoostRanker_rank')
pd.DataFrame(models_metrics)[['CatBoostRanker']]


Unnamed: 0,CatBoostRanker
Precision@10,0.027508
recall@10,0.092927
ndcg@10,0.032566
map@10,0.042256
novelty@10,3.331839


### Результирующий датафрейм

In [178]:
res_df = pd.DataFrame({'recall@10': [0.094, 0.095, 0.093], 'ndcg@10': [0.0331, 0.034, 0.0325],
                       'map@10': [0.0437, 0.045, 0.0422], 'time, s': [243, 104, 512]}, index=['XGBRanker', 'LBMRanker', 'CatBoostRanker'])


In [90]:
res_df


Unnamed: 0,recall@10,ndcg@10,map@10,"time, s"
XGBRanker,0.094,0.0331,0.0437,243
LBMRanker,0.0942,0.0335,0.0446,104
CatBoostRanker,0.093,0.0325,0.0422,512


### Инференс плюс сравнение моделей первого и второго уровней, для этого выберем модель второго уровня LBMRanker, так как он показал лучшие результаты

In [181]:
comp_df = pd.DataFrame({'recall@10': [0.074, 0.080, 0.095], 'ndcg@10': [0.026, 0.027, 0.034], 'map@10': [0.033, 0.035, 0.045]},
                       index=['LightFM', 'LBMRanker', 'CatBoostRanker'])
comp_df

Unnamed: 0,recall@10,ndcg@10,map@10
LightFM,0.074,0.026,0.033
LBMRanker,0.08,0.027,0.035
CatBoostRanker,0.095,0.034,0.045


In [184]:
ranker_train = ranker_train.reset_index(drop=True)
ranker_val = ranker_val.reset_index(drop=True)
ranker_test = ranker_test.reset_index(drop=True)

In [185]:
ranker_train.shape

(19263522, 24)

In [186]:
ranker_val.shape

(4138671, 24)

In [189]:
ranker_df = pd.concat([ranker_train, ranker_val])
ranker_df = pd.concat([ranker_df, ranker_test])

In [190]:
y_pred = LGBMRanker_model.predict(ranker_df[cols])
ranker_df = add_score_and_rank(ranker_df, y_pred, 'LGBMRanker')

In [191]:
ranker_df = ranker_df.sort_values(by='LGBMRanker_rank', ascending=True)

In [196]:
recos = ranker_df.groupby('user_id').head(10)
recos['item_id'] = recos['item_id'].astype('int')

In [197]:
recos = recos.groupby('user_id')['item_id'].agg(list).reset_index()
recos.head(3)

Unnamed: 0,user_id,item_id
0,3,"[15297, 10440, 9728, 12192, 3734, 13865, 4151,..."
1,11,"[15297, 9728, 10440, 12192, 13865, 3734, 7829,..."
2,14,"[15297, 10440, 9728, 3734, 16228, 13865, 4151,..."


In [199]:
pickle.dump(recos, open("/Users/tanchik/Desktop/Настоящее/учеба/RecSys/RecoServiceTemplate/service/api/recsys/models/ranker_recos.pickle", "wb"))