# Подготовка

## Импорты

In [1]:
import numpy as np
import pandas as pd

from catboost import CatBoostRanker, Pool, cv
from sklearn.metrics import ndcg_score
from sklearn.model_selection import GroupShuffleSplit

import optuna
from IPython.display import clear_output

  from .autonotebook import tqdm as notebook_tqdm


## Доп функции

In [None]:
def get_train_test_split(X_data, y_data, queries_data, test_size=0.1, val_size=0.2, random_state=1337):
    """Function for splitting data in train val and test subsamples according to their groups"""
    gss = GroupShuffleSplit(n_splits=2, test_size=test_size, random_state=random_state)
    
    fold, _ = gss.split(X_data, y_data, queries_data)
    train_val_index, test_index = fold[0], fold[1]
    
    gss = GroupShuffleSplit(n_splits=2, test_size=val_size, random_state=random_state)

    fold, _ = gss.split(X_data.iloc[train_val_index], y_data[train_val_index], queries_data[train_val_index])
    train_index, val_index = fold[0], fold[1]
    
    X_train, y_train, queries_train = X_data.iloc[train_val_index].iloc[train_index], y_data[train_val_index][train_index], queries_data[train_val_index][train_index]
    X_val, y_val, queries_val = X_data.iloc[train_val_index].iloc[val_index], y_data[train_val_index][val_index], queries_data[train_val_index][val_index]
    X_test, y_test, queries_test = X_data.iloc[test_index], y_data[test_index], queries_data[test_index]
    
    data = {"train" : (X_train, y_train, queries_train),
            "val" : (X_val, y_val, queries_val),
            "test" : (X_test, y_test, queries_test)}
    
    return data

## Данные

In [2]:
df = pd.read_csv("intern_task.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235258 entries, 0 to 235257
Columns: 146 entries, rank to feature_143
dtypes: float64(140), int64(6)
memory usage: 262.1 MB


In [3]:
df.head()

Unnamed: 0,rank,query_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_134,feature_135,feature_136,feature_137,feature_138,feature_139,feature_140,feature_141,feature_142,feature_143
0,0,10,1.0,0.0,1.0,3.0,3.0,0.333333,0.0,0.333333,...,0.0,0.0,0.454545,0.890238,8.655534,1.0,0.077778,0.002222,1.0,0.333333
1,1,10,3.0,0.0,3.0,0.0,3.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.773976,23.130514,0.0,0.027826,0.00043,44.0,14.666667
2,0,10,3.0,0.0,2.0,0.0,3.0,1.0,0.0,0.666667,...,0.0,0.0,0.0,0.918308,13.351339,0.0,0.014925,0.000104,22.0,7.333333
3,1,10,3.0,0.0,3.0,0.0,3.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.975355,18.240926,0.0,0.05314,0.000255,8.0,2.666667
4,2,10,3.0,0.0,3.0,1.0,3.0,1.0,0.0,1.0,...,273.0,79.670665,0.2,0.990119,31.786048,0.333333,0.046512,0.000307,24.0,8.0


In [4]:
# Дроп фичей, содержащих одно уникальное значение

const_columns = []
for col in df.columns[2:]:
    if df[col].nunique() < 2:
        const_columns.append(col)
print(const_columns)

['feature_64', 'feature_65', 'feature_72', 'feature_100']


In [5]:
# Дроп коротких сессий (меньше 5 записей)

print(df.shape)

small_query = []
for q in df["query_id"].unique():
    if sum(df["query_id"] == q) < 5:
        small_query.append(q)
        df = df[df["query_id"] != q]
print(small_query)
print(df.shape)

(235258, 146)
[5920, 8665, 9265, 10525, 11410, 14350, 20560, 22780, 23215, 25120, 25885, 26170, 26395, 26545, 28285]
(235214, 146)


In [32]:
# Подготовка выборок 

X_data = df.drop(["rank", "query_id"]+const_columns, axis=1)
y_data = df["rank"].values
queries_data = df["query_id"].values

# Категориальные признаки — те, которые имеют целочисленный тип данных
cat_features = X_data.columns[X_data.dtypes==np.int64]

num_queries = np.unique(queries_data).shape[0]
num_queries

1985

In [9]:
# Разбиение на тренировочную, валидационную и тестовую выборки

data = get_train_test_split(X_data, y_data, queries_data)

train_pool = Pool(
    data=data["train"][0],
    label=data["train"][1],
    group_id=data["train"][2],
    cat_features=list(cat_features.values)
)

val_pool = Pool(
    data=data["val"][0],
    label=data["val"][1],
    group_id=data["val"][2],
    cat_features=list(cat_features.values)
)

test_pool = Pool(
    data=data["test"][0],
    label=data["test"][1],
    group_id=data["test"][2],
    cat_features=list(cat_features.values)
)

print(train_pool.shape, val_pool.shape, test_pool.shape)
print(df.shape)

# Весь датасет, который будет использоваться в кросс-валидации при оптимизации гиперпараметров
full_pool = Pool(data=X_data, 
                 label=y_data,
                 group_id=queries_data,
                 cat_features=list(cat_features.values))

(170233, 140) (42073, 140) (22908, 140)
(235214, 146)


# Основная часть работы

## Оптимизация

In [None]:
# Шаблон оптимизационной функции, использованной при поиске гиперпараметров финальной модели
# Гиперпараметры оптимизируются на основе средних значений метрики NDCG@5 на 4-ех фолдах

def objective(trial):
    params = {
        "loss_function" : trial.suggest_categorical(
            "loss_function", ["YetiRank", "PairLogitPairwise", "QueryRMSE",]
        ),
        "n_estimators": 100,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "custom_metric" : ["NDCG:top=5"],
        "verbose" : 50,
        "random_seed" : 1337,
        "task_type" : "GPU",
        "depth" : trial.suggest_int("depth", 1, 8),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli",]
        ),
        "random_strength": trial.suggest_categorical("random_strength", [0.01, 0.05, 0.1, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0]),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),      
    }
    
    if params["bootstrap_type"] == "Bayesian":
        params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif params["bootstrap_type"] == "Bernoulli":
        params["subsample"] = trial.suggest_float("subsample", 0.1, 1)
    
    display(params)
    
    cv_data = cv(
        params=params,
        pool=full_pool,
        fold_count=4,
        shuffle=True,
        partition_random_seed=1337,
        verbose=500,
    ) 
    
    clear_output()
    
    score = max(cv_data["test-NDCG:top=5;type=Base-mean"])

    with open("tried_params.txt", "a") as f:
        f.write("\n"+str(params)+"\n"+str(score)+".......")

    return score
    

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, timeout=3600*0.8)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

## Обучение финальной модели

In [26]:
# Параметры полученные в результате оптимизации 
params = {"loss_function": "YetiRank",
          "iterations" : 3000,
          "custom_metric" : ["NDCG", "NDCG:top=5", 'MAP:top=5'],
          "metric_period" : 20,
          "verbose" : 100,
          "random_seed" : 1337,
          "early_stopping_rounds" : 200,
          "use_best_model" : True,
          "task_type" : "GPU",
          "depth": 8,
          "bootstrap_type": "Bernoulli",
          "subsample": 0.856154694711653,
          "random_strength": 0.1,
          "min_data_in_leaf": 23,
          }

model_new = CatBoostRanker(**params)
model_new.fit(train_pool, eval_set=val_pool)

Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric MAP:top=5 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.7042865	best: 0.7042865 (0)	total: 25.6ms	remaining: 1m 16s
100:	test: 0.7720035	best: 0.7720035 (100)	total: 1.77s	remaining: 50.9s
200:	test: 0.7795233	best: 0.7796309 (199)	total: 3.51s	remaining: 48.9s
300:	test: 0.7828469	best: 0.7830154 (293)	total: 5.22s	remaining: 46.8s
400:	test: 0.7845281	best: 0.7847231 (397)	total: 6.93s	remaining: 45s
500:	test: 0.7859041	best: 0.7859682 (487)	total: 8.67s	remaining: 43.3s
600:	test: 0.7876305	best: 0.7876421 (586)	total: 10.4s	remaining: 41.5s
700:	test: 0.7879774	best: 0.7886002 (646)	total: 12.2s	remaining: 40s
800:	test: 0.7902057	best: 0.7902057 (800)	total: 13.9s	remaining: 38.2s
900:	test: 0.7912365	best: 0.7912370 (895)	total: 15.7s	remaining: 36.5s
1000:	test: 0.7914939	best: 0.7916668 (961)	total: 17.4s	remaining: 34.7s
1100:	test: 0.7924541	best: 0.7926448 (1077)	total: 19s	remaining: 32.8s
1200:	test: 0.7925361	best: 0.7929141 (1169)	total: 20.7s	remaining: 31s
1300:	test: 0.7929869	best: 0.7931028 (1289)	total: 22.3

<catboost.core.CatBoostRanker at 0x16f4dd41bd0>

# Финальный скор

In [27]:
model_new.score(test_pool, top=5)

0.6078357575832827

**nDCG@5** = 0.6078357575832827

Метрика считается с помощью встроенного метода CatBoost https://catboost.ai/en/docs/concepts/python-reference_catboostranker_score

In [35]:
scores = model_new.eval_metrics(data=test_pool, metrics=["NDCG", 'MAP', "AUC:type=Ranking"])

for score in scores:
    print(f"{score} = {np.mean(scores[score])}")

NDCG:type=Base = 0.80764914308891
MAP = 0.6404142577506393
AUC:type=Ranking = 0.6936036598046028
