In [1]:
from dataset import DatasetBuilder

import numpy as np
from catboost import CatBoostRanker, cv

# Optional seaborn figure sizing, left disabled. `sns` is not imported in this
# cell, so `import seaborn as sns` must be added before re-enabling this line.
# sns.set_theme(rc={"figure.figsize": (25, 10)})

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension; mode 2 reloads imported modules before each
# cell executes, so edits to imported modules are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [2]:
# Build the dataset object used by every later cell.
# NOTE(review): "data" is presumably the directory holding the raw input files,
# and, judging by the log this cell prints, build_dataset() reads the data,
# joins it into a single table and processes the categorical features —
# confirm against the `dataset` module.
dataset_builder = DatasetBuilder("data")
dataset = dataset_builder.build_dataset()

Reading data...
Joining all data into one single table...
Processing categorical features...


In [3]:
# List the categorical feature names on one line, then preview the first
# rows of the feature table as the cell's rich output.
print(f"Categorical features: {', '.join(dataset.category_features)}")
dataset.features.head()

Categorical features: source_system_tab, source_screen_name, source_type, city, gender, registered_via, genre_ids, language


Unnamed: 0,source_system_tab,source_screen_name,source_type,city,bd,gender,registered_via,registration_init_time,expiration_date,song_length,genre_ids,language
0,0,0,0,0,28,0,0,20120102,20171005,206471,0,0
215,0,0,0,0,28,0,0,20120102,20171005,187802,1,0
627,0,0,0,0,28,0,0,20120102,20171005,247803,2,0
1735,0,0,0,0,28,0,0,20120102,20171005,181115,1,0
2022,0,0,0,0,28,0,0,20120102,20171005,200713,3,0


In [4]:
# np.unique returns a flat array of distinct ids, so its length is the user count.
print(f"Number of unique users: {len(np.unique(dataset.user_id))}")

Number of unique users: 30755


In [5]:
# Number of distinct values in each categorical feature column.
dataset.features.loc[:, dataset.category_features].nunique()

source_system_tab      10
source_screen_name     21
source_type            13
city                   21
gender                  3
registered_via          5
genre_ids             573
language               11
dtype: int64

In [6]:
# CatBoost training parameters passed to the cross-validation run.
parameters = dict(
    # Number of boosting iterations.
    iterations=10,
    # Extra metrics reported in addition to the one tied to the loss function.
    custom_metric=["NDCG", "QueryAUC:type=Ranking"],
    random_seed=7,
    # Ranking objective.
    loss_function="YetiRank",
    # Do not write training artefacts to disk.
    allow_writing_files=False,
    # Evaluate metrics on every iteration.
    metric_period=1,
    one_hot_max_size=20,
    min_data_in_leaf=10,
)

In [7]:
# Run 5-fold cross-validation with the parameters defined above.
# The data is shuffled before splitting, with a fixed partition seed so the
# folds are the same on every run; verbose=True logs progress per iteration.
cv_result = cv(
    pool=dataset.to_pool(),
    params=parameters,
    fold_count=5,
    shuffle=True,
    partition_random_seed=7,
    verbose=True,
)

0:	test: 0.3787997	best: 0.3787997 (0)	total: 1m 30s	remaining: 13m 33s
1:	test: 0.3828068	best: 0.3828068 (1)	total: 3m 29s	remaining: 13m 59s
2:	test: 0.3823769	best: 0.3828068 (1)	total: 5m 23s	remaining: 12m 33s
3:	test: 0.3831045	best: 0.3831045 (3)	total: 6m 47s	remaining: 10m 11s
4:	test: 0.3854685	best: 0.3854685 (4)	total: 8m 15s	remaining: 8m 15s
5:	test: 0.3856250	best: 0.3856250 (5)	total: 9m 41s	remaining: 6m 27s
6:	test: 0.3858603	best: 0.3858603 (6)	total: 11m 9s	remaining: 4m 47s
7:	test: 0.3880930	best: 0.3880930 (7)	total: 12m 33s	remaining: 3m 8s
8:	test: 0.3886914	best: 0.3886914 (8)	total: 13m 55s	remaining: 1m 32s
9:	test: 0.3885338	best: 0.3886914 (8)	total: 15m 13s	remaining: 0us


In [8]:
# Display the table returned by cv(): per-iteration mean and std of each
# test metric across the folds.
cv_result

Unnamed: 0,iterations,test-PFound-mean,test-PFound-std,test-NDCG:type=Base-mean,test-NDCG:type=Base-std,test-QueryAUC:type=Ranking-mean,test-QueryAUC:type=Ranking-std
0,0,0.3788,0.004614,0.762835,0.003058,0.517153,0.003661
1,1,0.382807,0.007363,0.764602,0.004668,0.51806,0.003454
2,2,0.382377,0.007275,0.764624,0.004677,0.518092,0.00357
3,3,0.383105,0.007843,0.764783,0.004832,0.51826,0.003653
4,4,0.385468,0.004826,0.765854,0.004063,0.518557,0.003352
5,5,0.385625,0.00465,0.765803,0.004044,0.518557,0.003299
6,6,0.38586,0.004594,0.765801,0.004066,0.518552,0.003232
7,7,0.388093,0.003785,0.766925,0.002331,0.519049,0.002685
8,8,0.388691,0.002862,0.766986,0.002266,0.519157,0.002752
9,9,0.388534,0.002958,0.766977,0.002236,0.519137,0.002643
