# Библиотеки

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm.auto import tqdm

import os

os.environ['TRANSFORMERS_CACHE'] = './cache/'

In [None]:
import faiss

In [1]:
import gc
import pickle

import numpy as np
import pandas as pd
import polars as pl

from catboost import CatBoostRanker, Pool

from lemma import LemmaRecommender
from semantic import SemanticRecommender

# Вспомогательные данные

In [2]:
with open('lemma_rec_1e6.pickle', 'rb') as f:
    lex_rec: LemmaRecommender = pickle.load(f)

In [4]:
features = pl.read_parquet('./features.parquet', columns = ['video_id', 'v_pub_datetime']).sort('v_pub_datetime')
features = features.unique()
videos = pl.read_parquet('./videos.parquet', columns = ['video_id', 'video_title', 'v_pub_datetime']).sort('v_pub_datetime')
videos = videos.join(features, on='video_id', how='left')
videos = videos.filter((~pl.col('v_pub_datetime').is_null()) & (~pl.col('v_pub_datetime_right').is_null()))
videos = videos.sort('video_id')

In [5]:
automarkup = pl.read_parquet('./automarkup.parquet', columns=['video_id'])

In [6]:
video_ids = videos["video_id"].tail(1_000_000).to_list()
video_ids += automarkup["video_id"].to_list()
video_ids = sorted(list(set(video_ids)))
len(video_ids)

1381401

In [8]:
device = 'cuda'
tokenizer = AutoTokenizer.from_pretrained("cointegrated/LaBSE-en-ru")
model = AutoModel.from_pretrained("cointegrated/LaBSE-en-ru").to(device)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/521k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/516M [00:00<?, ?B/s]

In [9]:
sem_rec = SemanticRecommender(videos.filter(pl.col('video_id').is_in(video_ids))["video_id"].to_list(), faiss.read_index('labse_candidates.index'),  tokenizer, model)

In [11]:
gc.collect()

1436

In [12]:
with open('I_query_labse_768.pickle', 'rb') as f:
    I = pickle.load(f)
    
with open('D_query_labse_768.pickle', 'rb') as f:
    D = pickle.load(f)

# Данные

In [63]:
features = pl.read_parquet('./features_nov.parquet')  # .sort('report_date')
features.head()

video_id,report_date,v_channel_reg_datetime,v_channel_type,v_category,v_pub_datetime,total_comments,v_year_views,v_month_views,v_week_views,v_day_views,v_likes,v_dislikes,v_duration,v_cr_click_like_7_days,v_cr_click_dislike_7_days,v_cr_click_vtop_7_days,v_cr_click_long_view_7_days,v_cr_click_comment_7_days,v_cr_click_like_30_days,v_cr_click_dislike_30_days,v_cr_click_vtop_30_days,v_cr_click_long_view_30_days,v_cr_click_comment_30_days,v_cr_click_like_1_days,v_cr_click_dislike_1_days,v_cr_click_vtop_1_days,v_cr_click_long_view_1_days,v_cr_click_comment_1_days
str,str,str,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""video_29980905…","""2023-11-02""","""2023-08-07 09:…","""UGC""","""Разное""","""2023-08-19 17:…",0,1,0,0,0,0,0,66154,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""video_15248440…","""2023-11-02""","""2023-09-29 10:…","""UGC""","""Разное""","""2023-10-05 13:…",0,2,2,1,1,0,0,248315,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,1.0,0.0
"""video_26882913…","""2023-11-02""","""2022-03-11 06:…","""UGC""","""Видеоигры""","""2023-04-25 13:…",0,3,0,0,0,0,0,935936,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""video_30857849…","""2023-11-02""","""2022-03-22 11:…","""UGC""","""Авто-мото""","""2022-03-22 11:…",0,1,0,0,0,0,0,10000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""video_9173713""","""2023-11-02""","""2022-03-15 13:…","""UGC""","""Строительство …","""2023-05-15 13:…",0,1,0,0,0,0,0,170853,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [64]:
for dtype, col in zip(features.dtypes, features.columns):
    if dtype == pl.Int64:
        features = features.with_columns(features[col].cast(pl.Int32))
    if dtype == pl.Float64:
        features = features.with_columns(features[col].cast(pl.Float32))

In [65]:
features.head()

video_id,report_date,v_channel_reg_datetime,v_channel_type,v_category,v_pub_datetime,total_comments,v_year_views,v_month_views,v_week_views,v_day_views,v_likes,v_dislikes,v_duration,v_cr_click_like_7_days,v_cr_click_dislike_7_days,v_cr_click_vtop_7_days,v_cr_click_long_view_7_days,v_cr_click_comment_7_days,v_cr_click_like_30_days,v_cr_click_dislike_30_days,v_cr_click_vtop_30_days,v_cr_click_long_view_30_days,v_cr_click_comment_30_days,v_cr_click_like_1_days,v_cr_click_dislike_1_days,v_cr_click_vtop_1_days,v_cr_click_long_view_1_days,v_cr_click_comment_1_days
str,str,str,str,str,str,i32,i32,i32,i32,i32,i32,i32,i32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
"""video_29980905…","""2023-11-02""","""2023-08-07 09:…","""UGC""","""Разное""","""2023-08-19 17:…",0,1,0,0,0,0,0,66154,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""video_15248440…","""2023-11-02""","""2023-09-29 10:…","""UGC""","""Разное""","""2023-10-05 13:…",0,2,2,1,1,0,0,248315,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,1.0,0.0
"""video_26882913…","""2023-11-02""","""2022-03-11 06:…","""UGC""","""Видеоигры""","""2023-04-25 13:…",0,3,0,0,0,0,0,935936,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""video_30857849…","""2023-11-02""","""2022-03-22 11:…","""UGC""","""Авто-мото""","""2022-03-22 11:…",0,1,0,0,0,0,0,10000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""video_9173713""","""2023-11-02""","""2022-03-15 13:…","""UGC""","""Строительство …","""2023-05-15 13:…",0,1,0,0,0,0,0,170853,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
v_channel_reg_datetime = features["v_channel_reg_datetime"].to_pandas()
v_pub_datetime = features["v_pub_datetime"].to_pandas()

In [None]:
v_channel_reg_datetime_ind = (pd.to_datetime(v_channel_reg_datetime) - pd.Timestamp(year=2010, month=1, day=1)).dt.days
v_pub_datetime_ind = (pd.to_datetime(v_pub_datetime) - pd.Timestamp(year=2010, month=1, day=1)).dt.days

In [None]:
features = features.with_columns(v_channel_reg_datetime_ind=pl.Series(v_channel_reg_datetime_ind), v_pub_datetime_ind=pl.Series(v_pub_datetime_ind))

In [None]:
features = features.drop(['report_date', 'v_channel_reg_datetime', 'v_pub_datetime']).unique(subset='video_id', keep='last')
features.head()

In [20]:
automarkup = pl.read_parquet('./automarkup.parquet').sort('datetime')
automarkup

is_authorized,datetime,query,video_id,duration,position,watchtime,emotion,vtop,comment,channel,tv_show,season
bool,str,str,str,i64,f64,i64,i64,bool,i64,str,str,str
false,"""2023-05-03 00:…","""битва экстрасе…","""video_26455637…",4283080,2.0,0,0,,0,"""Телеканал ТНТ""","""Новая битва эк…",
false,"""2023-05-03 00:…",""" шоу импровеза…","""video_16872535…",2752000,1.0,52,0,,0,"""Импровизация""","""Импровизация""","""Сезон 1"""
false,"""2023-05-03 00:…","""Звезды в Африк…","""video_22077053…",5401240,2.0,2602,0,,0,"""Телеканал ТНТ""","""Новые звезды в…",
false,"""2023-05-03 00:…","""Женский камерн…","""video_34215629…",167234,10.0,0,0,,0,"""Мой Зеленоград…",,
false,"""2023-05-03 00:…","""«Отставник-3» …","""video_33416258…",5452000,2.0,5350,0,,0,"""НТВ""",,
false,"""2023-05-03 00:…","""бригада 2 сери…","""video_32064413…",3155356,1.0,2903,0,,0,"""Алексей Невски…",,
false,"""2023-05-03 00:…","""звезды в африк…","""video_29354050…",5597534,1.0,0,0,,0,"""Телеканал ТНТ""","""Новые звезды в…",
false,"""2023-05-03 00:…","""Битва экстрасе…","""video_8087432""",5489400,1.0,155,0,,0,"""Телеканал ТНТ""","""Экстрасенсы. Б…",
false,"""2023-05-03 00:…","""взрослые игры …","""video_8601875""",2568067,1.0,2512,0,,0,"""Первый канал""","""Мужское / Женс…","""Сезон"""
false,"""2023-05-03 00:…","""На западе ниче…","""video_30121652…",80014,27.0,41,0,,0,"""Спортивно-разв…",,


In [21]:
automarkup = automarkup.unique(subset='query', keep='last', maintain_order=True).tail(300_000).to_pandas()

In [22]:
automarkup['query_id'] = np.arange(len(automarkup))
automarkup

Unnamed: 0,is_authorized,datetime,query,video_id,duration,position,watchtime,emotion,vtop,comment,channel,tv_show,season,query_id
0,True,2023-10-05 09:11:26+03:00,музыка русских композиторов,video_29044766,3545014,9.0,3536,2,True,0,"Ах, какая музыка!",,,0
1,False,2023-10-05 09:11:32+03:00,04.10.2023 обзор лиги чемпионов,video_12889081,2837600,1.0,1993,0,,0,МАТЧ!,Обзоры матчей,,1
2,False,2023-10-05 09:11:36+03:00,эпичный форсайт,video_28975291,133462,1.0,1,0,,0,РЭУ им. Г.В. Плеханова,,,2
3,False,2023-10-05 09:11:37+03:00,ЭТО ВАГНЕР ШТИЛТ,video_31774411,145025,1.0,131,0,,0,Solar Fox,,,3
4,False,2023-10-05 09:11:43+03:00,Новые пацанки7 чЗвезды в Африке,video_33350644,6783800,1.0,0,0,,0,Телеканал ТНТ,Новые звезды в Африке,,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,False,2023-11-03 23:59:34+03:00,Последний герой остаться семьёй,video_17629988,5899000,1.0,2,0,,0,Телеканал ТВ-3,Последний герой,Сезон 1,299995
299996,False,2023-11-03 23:59:42+03:00,форсаж 10,video_30618852,6254145,2.0,51,0,,0,70 млн. просмотров,,,299996
299997,False,2023-11-03 23:59:44+03:00,блондинка в шоколаде,video_7629393,771210,1.0,10,0,,0,GaYBrighTForuM,,,299997
299998,True,2023-11-03 23:59:48+03:00,standoff 2,video_25669614,887367,2.0,0,0,,0,Веля,,,299998


In [23]:
gc.collect()

114

# Построение датасета

In [36]:
labels = pl.from_pandas(automarkup[['query_id', 'video_id']]).with_columns(pl.col("query_id").cast(pl.Int32), label=pl.lit(1, pl.Float32))
labels.head()

query_id,video_id,label
i32,str,f32
0,"""video_29044766…",1.0
1,"""video_12889081…",1.0
2,"""video_28975291…",1.0
3,"""video_31774411…",1.0
4,"""video_33350644…",1.0


In [37]:
datetime_ind = (pd.to_datetime(automarkup["datetime"]).astype('datetime64[ns]') - pd.Timestamp(year=2010, month=1, day=1)).dt.days
ind_df = pl.DataFrame([
    pl.Series('query_id', list(automarkup["query_id"]), pl.Int32),
    pl.Series('datetime_ind', list(datetime_ind), pl.Int32),
])
ind_df

  datetime_ind = (pd.to_datetime(automarkup["datetime"]).astype('datetime64[ns]') - pd.Timestamp(year=2010, month=1, day=1)).dt.days


query_id,datetime_ind
i32,i32
0,5025
1,5025
2,5025
3,5025
4,5025
5,5025
6,5025
7,5025
8,5025
9,5025


In [104]:
# merged = lex_rec.predict(automarkup['query'], 100)
merged = pl.read_parquet('lex_rec.parquet')

In [105]:
merged = merged.join(sem_rec.predict(I, D), on=['query_id', 'video_id'], how='outer')

In [106]:
merged = merged.join(labels, on=['query_id', 'video_id'], how='left')
merged = merged.fill_null(0)

In [107]:
merged = merged.join(ind_df, on=['query_id'], how='left')

In [108]:
gc.collect()

431

In [109]:
merged = merged.join(features, on=['video_id'], how='left')
merged = merged.with_columns(ind_diff=pl.col('datetime_ind') - pl.col('v_pub_datetime_ind'))
merged = merged.fill_null(-1)

In [110]:
print(merged.columns, merged, merged['label'].sum() / len(automarkup))

['query_id', 'video_id', 'lex_score', 'lex_rank', 'sem_score', 'sem_rank', 'label', 'datetime_ind', 'v_channel_type', 'v_category', 'total_comments', 'v_year_views', 'v_month_views', 'v_week_views', 'v_day_views', 'v_likes', 'v_dislikes', 'v_duration', 'v_cr_click_like_7_days', 'v_cr_click_dislike_7_days', 'v_cr_click_vtop_7_days', 'v_cr_click_long_view_7_days', 'v_cr_click_comment_7_days', 'v_cr_click_like_30_days', 'v_cr_click_dislike_30_days', 'v_cr_click_vtop_30_days', 'v_cr_click_long_view_30_days', 'v_cr_click_comment_30_days', 'v_cr_click_like_1_days', 'v_cr_click_dislike_1_days', 'v_cr_click_vtop_1_days', 'v_cr_click_long_view_1_days', 'v_cr_click_comment_1_days', 'v_channel_reg_datetime_ind', 'v_pub_datetime_ind', 'ind_diff'] shape: (55_536_124, 36)
┌──────────┬──────────┬───────────┬──────────┬───┬────────────┬────────────┬────────────┬──────────┐
│ query_id ┆ video_id ┆ lex_score ┆ lex_rank ┆ … ┆ v_cr_click ┆ v_channel_ ┆ v_pub_date ┆ ind_diff │
│ ---      ┆ ---      ┆ ---  

# Обучение ранкера

In [111]:
merged = merged.sort('query_id').to_pandas()
train_df = merged[merged.query_id < 200_000]
val_df = merged[merged.query_id >= 200_000]
val_df

Unnamed: 0,query_id,video_id,lex_score,lex_rank,sem_score,sem_rank,label,datetime_ind,v_channel_type,v_category,total_comments,v_year_views,v_month_views,v_week_views,v_day_views,v_likes,v_dislikes,v_duration,v_cr_click_like_7_days,v_cr_click_dislike_7_days,v_cr_click_vtop_7_days,v_cr_click_long_view_7_days,v_cr_click_comment_7_days,v_cr_click_like_30_days,v_cr_click_dislike_30_days,v_cr_click_vtop_30_days,v_cr_click_long_view_30_days,v_cr_click_comment_30_days,v_cr_click_like_1_days,v_cr_click_dislike_1_days,v_cr_click_vtop_1_days,v_cr_click_long_view_1_days,v_cr_click_comment_1_days,v_channel_reg_datetime_ind,v_pub_datetime_ind,ind_diff
37008014,200000,video_10084466,0.00000,0,0.684040,1,0.0,5047,ТВ И СМИ,Люди и блоги,0,150,15,3,1,0,0,159289,0.000000,0.0,0.0,1.000000,0.0,0.000000,0.0,0.0,0.866667,0.0,0.000000,0.0,0.0,1.000000,0.0,1746,2782,2265
37008015,200000,video_7692001,0.00000,0,0.667560,2,0.0,5047,UGC,Люди и блоги,0,1,0,0,0,0,0,1337263,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,2324,2730,2317
37008016,200000,video_7947351,0.00000,0,0.663303,3,0.0,5047,ТВ И СМИ,Телепередачи,0,3,0,0,0,0,0,4263023,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,1977,2497,2550
37008017,200000,video_8225001,0.00000,0,0.648002,4,0.0,5047,UGC,Юмор,0,1,0,0,0,0,0,304610,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,918,-248,5295
37008018,200000,video_8205725,0.00000,0,0.613291,5,0.0,5047,UGC,Люди и блоги,0,1,1,0,0,0,0,168736,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,1382,1382,3665
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55536119,299999,video_9104812,0.00000,0,0.636250,97,0.0,5054,UGC,Сериалы,0,267,22,2,1,0,0,445931,0.000000,0.0,0.0,0.500000,0.0,0.000000,0.0,0.0,0.545455,0.0,0.000000,0.0,0.0,0.000000,0.0,1369,1487,3567
55536120,299999,video_3284663,0.00000,0,0.635740,98,0.0,5054,ТВ И СМИ,Телепередачи,4,2547,324,99,17,26,0,1675009,0.010101,0.0,0.0,0.565657,0.0,0.006173,0.0,0.0,0.558642,0.0,0.058824,0.0,0.0,0.470588,0.0,1977,4956,98
55536121,299999,video_15383894,0.00000,0,0.634490,99,0.0,5054,UGC,Фильмы,0,131,18,0,0,0,0,140800,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.777778,0.0,0.000000,0.0,0.0,0.000000,0.0,4852,4910,144
55536122,299999,video_9956117,0.00000,0,0.633999,100,0.0,5054,UGC,Сад и огород,0,20,20,4,1,0,0,65766,0.000000,0.0,0.0,1.000000,0.0,0.000000,0.0,0.0,0.800000,0.0,0.000000,0.0,0.0,1.000000,0.0,4208,5023,31


In [112]:
train_df.to_parquet('train_df.parquet')
val_df.to_parquet('val_df.parquet')

In [24]:
train_df = pd.read_parquet('train_df.parquet')
val_df = pd.read_parquet('val_df.parquet')

In [27]:
cat_features = ['v_channel_type', 'v_category']

In [28]:
train_pool = Pool(
    data=train_df.drop(['query_id', 'video_id', 'label'], axis=1),
    label=train_df['label'],
    group_id=train_df['query_id'],
    cat_features=cat_features
)

val_pool = Pool(
    data=val_df.drop(['query_id', 'video_id', 'label'], axis=1),
    label=val_df['label'],
    group_id=val_df['query_id'],
    cat_features=cat_features
)

In [31]:
params = {
    'task_type': 'GPU',
    'loss_function': 'YetiRank',
    'eval_metric': 'NDCG:top=5',
    'iterations': 512,
    # 'learning_rate': 0.01,
}

In [32]:
model_cb = CatBoostRanker(**params, random_seed=56)
model_cb.fit(train_pool, eval_set=val_pool, plot=False, verbose=10, use_best_model=True)

Groupwise loss function. OneHotMaxSize set to 10


Default metric period is 5 because PFound, NDCG is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.6285417	best: 0.6285417 (0)	total: 1.7s	remaining: 14m 29s
10:	test: 0.7068445	best: 0.7068445 (10)	total: 13.8s	remaining: 10m 28s
20:	test: 0.7266498	best: 0.7266498 (20)	total: 25.9s	remaining: 10m 6s
30:	test: 0.7332181	best: 0.7332181 (30)	total: 38s	remaining: 9m 49s
40:	test: 0.7378937	best: 0.7378937 (40)	total: 50s	remaining: 9m 34s
50:	test: 0.7423039	best: 0.7423039 (50)	total: 1m 2s	remaining: 9m 20s
60:	test: 0.7431685	best: 0.7431685 (60)	total: 1m 14s	remaining: 9m 7s
70:	test: 0.7539244	best: 0.7539244 (70)	total: 1m 26s	remaining: 8m 54s
80:	test: 0.7566736	best: 0.7566736 (80)	total: 1m 37s	remaining: 8m 41s
90:	test: 0.7615779	best: 0.7615779 (90)	total: 1m 50s	remaining: 8m 29s
100:	test: 0.7660053	best: 0.7660053 (100)	total: 2m 2s	remaining: 8m 16s
110:	test: 0.7722546	best: 0.7722546 (110)	total: 2m 14s	remaining: 8m 4s
120:	test: 0.7741977	best: 0.7741977 (120)	total: 2m 26s	remaining: 7m 52s
130:	test: 0.7762364	best: 0.7762364 (130)	total: 2m 38s	re

<catboost.core.CatBoostRanker at 0x7f3a7243f7f0>

# Тестирование

In [33]:
np.max(model_cb.get_evals_result()['validation']['NDCG:top=5;type=Base']) / val_df['query_id'].nunique() * val_df['label'].sum()

0.34779991828675616

In [38]:
test = pl.from_pandas(val_df)

In [39]:
test = test.with_columns(
    score=pl.Series(model_cb.predict(val_pool))
)

test = (
    test
    .sort(by='score', descending=True)
    .groupby('query_id').agg(pl.col('video_id').head(5))
    .join(labels, on='query_id', how='outer')
    .select('query_id', 'video_id', 'video_id_right')
)

In [40]:
test = test.to_pandas()

In [41]:
p, q = 0, 0
for preds, label in zip(test.video_id, test.video_id_right):
    if preds is not None:
        p += label in preds
    q += 1
p / q # 0.04955

0.09693666666666667

In [42]:
p, q = 0, 0
for preds, label in zip(test.video_id, test.video_id_right):
    if preds is not None:
        for rank, pred in enumerate(preds):
            if pred == label:
                p += 1 / (rank + 1)
                break
    q += 1
p / q # 0.03422005555555579

0.07184116666666787

# Получение предсказаний для тестовой выборки

In [43]:
test_queries = pd.read_csv('./test_dataset_submission_queries.csv')
test_queries

Unnamed: 0,query
0,Битва сильнейших экстрасенсов 2023 смотреть | ...
1,битва сильнейших экстрасенсов 2023\nбитва силь...
2,"Экстрасенсы. Битва сильнейших, 4 выпуск"
3,супер стар 4 сезон
4,пять ночей с Фредди
...,...
1995,Шоу аватар 2023
1996,Макс
1997,смотреть мужское женское 2019
1998,шоу вована и лексуса


In [44]:
merged = lex_rec.predict(test_queries['query'], 100)

  0%|          | 0/1 [00:00<?, ?it/s]

In [45]:
sem_recs = []
for i, query in enumerate(tqdm(test_queries['query'])):
    sem_recs.append(sem_rec.predict_one(query, 100))
    sem_recs[-1] = sem_recs[-1].with_columns(query_id=pl.lit(i, pl.Int32))

  0%|          | 0/2000 [00:00<?, ?it/s]

In [46]:
merged = merged.join(pl.concat(sem_recs), on=['query_id', 'video_id'], how='outer')

In [47]:
merged = merged.fill_null(0)

In [48]:
merged = merged.with_columns(datetime_ind=pl.lit(5064, pl.Int32))

In [49]:
merged = merged.join(features, on=['video_id'], how='left')
merged = merged.with_columns(ind_diff=pl.col('datetime_ind') - pl.col('v_pub_datetime_ind'))
merged = merged.fill_null(-1)

In [50]:
merged = merged.sort('query_id')

In [51]:
merged

query_id,video_id,lex_score,lex_rank,sem_score,sem_rank,datetime_ind,v_channel_type,v_category,total_comments,v_year_views,v_month_views,v_week_views,v_day_views,v_likes,v_dislikes,v_duration,v_cr_click_like_7_days,v_cr_click_dislike_7_days,v_cr_click_vtop_7_days,v_cr_click_long_view_7_days,v_cr_click_comment_7_days,v_cr_click_like_30_days,v_cr_click_dislike_30_days,v_cr_click_vtop_30_days,v_cr_click_long_view_30_days,v_cr_click_comment_30_days,v_cr_click_like_1_days,v_cr_click_dislike_1_days,v_cr_click_vtop_1_days,v_cr_click_long_view_1_days,v_cr_click_comment_1_days,v_channel_reg_datetime_ind,v_pub_datetime_ind,ind_diff
i32,str,f32,i8,f32,i8,i32,str,str,i32,i32,i32,i32,i32,i32,i32,i32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,i64,i64,i64
0,"""video_7205638""",0.0,0,0.854891,1,5064,"""UGC""","""Телепередачи""",0,726,59,0,0,0,0,82416,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.050847,0.0,0.0,0.0,0.0,0.0,0.0,4753,4857,207
0,"""video_14611671…",0.0,0,0.854891,2,5064,"""UGC""","""Развлечения""",0,305,9,2,0,0,0,85886,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,4753,4861,203
0,"""video_9558708""",0.0,0,0.854891,3,5064,"""UGC""","""Телепередачи""",0,303,17,7,0,0,0,85886,0.0,0.0,0.0,0.285714,0.0,0.0,0.0,0.0,0.411765,0.0,0.0,0.0,0.0,0.0,0.0,4753,4861,203
0,"""video_5123456""",0.0,0,0.854742,4,5064,"""UGC""","""Телепередачи""",0,382,24,10,4,0,0,75643,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,4753,4861,203
0,"""video_28238230…",0.0,0,0.848417,5,5064,"""UGC""","""Телепередачи""",0,452,71,2,0,0,0,76377,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.112676,0.0,0.0,0.0,0.0,0.0,0.0,4753,4861,203
0,"""video_30934264…",0.0,0,0.84822,6,5064,"""UGC""","""Сериалы""",0,158,3,0,0,0,0,73941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,4753,4861,203
0,"""video_7471681""",0.0,0,0.845654,7,5064,"""UGC""","""Телепередачи""",0,367,39,9,1,0,0,85886,0.0,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.282051,0.0,0.0,0.0,0.0,0.0,0.0,4753,4861,203
0,"""video_11083436…",0.0,0,0.842199,8,5064,"""UGC""","""Развлечения""",0,262,8,0,0,0,0,76377,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4753,4859,205
0,"""video_14905715…",0.0,0,0.840221,9,5064,"""UGC""","""Телепередачи""",0,514,32,6,0,0,0,75643,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.3125,0.0,0.0,0.0,0.0,0.0,0.0,4753,4858,206
0,"""video_7471407""",0.0,0,0.831619,10,5064,"""UGC""","""Развлечения""",0,251,23,14,2,0,0,75643,0.0,0.0,0.0,0.785714,0.0,0.0,0.0,0.0,0.521739,0.0,0.0,0.0,0.0,0.5,0.0,4753,4852,212


In [52]:
test_pool = Pool(
    data=merged.to_pandas().drop(['query_id', 'video_id'], axis=1),
    cat_features=cat_features
)

In [53]:
merged = merged.with_columns(
    score=pl.Series(model_cb.predict(test_pool))
)

merged = (
    merged
    .sort(by='score', descending=True)
    .groupby('query_id').agg(pl.col('video_id').head(5))
    .select('query_id', 'video_id')
)

merged

query_id,video_id
i32,list[str]
1712,"[""video_7943626"", ""video_15682119"", … ""video_15382920""]"
128,"[""video_2114851"", ""video_26428756"", … ""video_27896582""]"
1096,"[""video_10854637"", ""video_25347986"", … ""video_31312854""]"
32,"[""video_914067"", ""video_9107345"", … ""video_9630055""]"
1928,"[""video_18764678"", ""video_25347986"", … ""video_18791835""]"
576,"[""video_10854637"", ""video_25347986"", … ""video_18764678""]"
824,"[""video_8128938"", ""video_1352018"", … ""video_768797""]"
320,"[""video_25964450"", ""video_12572940"", … ""video_13313121""]"
544,"[""video_9092991"", ""video_1153833"", … ""video_168145""]"
776,"[""video_3134715"", ""video_8297464"", … ""video_1048042""]"


In [54]:
top_video_id = automarkup["video_id"].value_counts().index[:5].to_list()

In [55]:
merged = merged.to_pandas().set_index('query_id')

In [56]:
for value in set(test_queries.index) - set(merged.index):
    merged.loc[value, 'video_id'] = top_video_id

In [57]:
merged

Unnamed: 0_level_0,video_id
query_id,Unnamed: 1_level_1
1712,"[video_7943626, video_15682119, video_5676856,..."
128,"[video_2114851, video_26428756, video_21987179..."
1096,"[video_10854637, video_25347986, video_1876467..."
32,"[video_914067, video_9107345, video_8359490, v..."
1928,"[video_18764678, video_25347986, video_1085463..."
...,...
535,"[video_33600375, video_15169729, video_1672843..."
999,"[video_22228869, video_25347986, video_7472050..."
39,"[video_4809676, video_31279616, video_3691126,..."
135,"[video_13463234, video_34293220, video_8581151..."


In [58]:
merged = merged.reset_index()

In [59]:
with open("submit_lex+sem+cb_gpu+f.csv", "w") as f:
    f.write('query,video_id\n')
    for query, video_id in zip(merged["query_id"].to_list(), merged["video_id"].to_list()):
        video_id = list(video_id) + top_video_id
        for video in video_id[:5]:
            f.write("query_" + str(query))
            f.write(',')
            f.write(str(video))
            f.write('\n')

# Сохранение модели

In [60]:
model_cb.save_model('model_cb.cbm')