# Import&Settings

In [1]:
# !pip install catboost
# !pip install optuna

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5
Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.3-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.8/78.8 kB[0m [31m10.8 MB/s[0m

In [2]:
import pandas as pd
import numpy as np
from time import time
from catboost import CatBoostRanker, Pool
import optuna
from collections import Counter
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import StratifiedGroupKFold
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
df = pd.read_csv('data/intern_task.csv')

# Preprocessing

Изменим нумерацию столбца query_id с 1.

In [5]:
def renumber_column(column):
    current_number = None
    new_column = []
    new_number = 0

    for item in column:
        if item != current_number:
            current_number = item
            new_number += 1
        new_column.append(new_number)

    return new_column

In [6]:
df['query_id'] = renumber_column(df['query_id'].to_list())
df.head()

Unnamed: 0,rank,query_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_134,feature_135,feature_136,feature_137,feature_138,feature_139,feature_140,feature_141,feature_142,feature_143
0,0,1,1.0,0.0,1.0,3.0,3.0,0.333333,0.0,0.333333,...,0.0,0.0,0.454545,0.890238,8.655534,1.0,0.077778,0.002222,1.0,0.333333
1,1,1,3.0,0.0,3.0,0.0,3.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.773976,23.130514,0.0,0.027826,0.00043,44.0,14.666667
2,0,1,3.0,0.0,2.0,0.0,3.0,1.0,0.0,0.666667,...,0.0,0.0,0.0,0.918308,13.351339,0.0,0.014925,0.000104,22.0,7.333333
3,1,1,3.0,0.0,3.0,0.0,3.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.975355,18.240926,0.0,0.05314,0.000255,8.0,2.666667
4,2,1,3.0,0.0,3.0,1.0,3.0,1.0,0.0,1.0,...,273.0,79.670665,0.2,0.990119,31.786048,0.333333,0.046512,0.000307,24.0,8.0


Удалим из датасета константные признаки.

In [7]:
const_cols = []

for idx, col in enumerate(df.drop(['rank', 'query_id'], axis=1).columns.to_list()):
    if df[f'{col}'].nunique() == 1:
        const_cols.append(col)

df.drop(const_cols, axis=1, inplace=True)

Удалим признаки с сильной корреляцией, поскольку признаки сильной корреляции могут увеличить время обучения модели, усложнить интерпретацию результатов и могут привести к переобучению модели. Поэтому удаление лишних признаков позволяет улучшить эффективность модели и снизить риск переобучения.


In [8]:
def select_corr_columns(data, thresh=0.95):
    corr_matrix = data.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    data_corr_col = [column for column in upper.columns if any(upper[column] > thresh)]
    return data_corr_col

In [9]:
corr_cols = select_corr_columns(df)
df.drop(columns=corr_cols, inplace=True)

Как видно из EDA, в датасете содержится много признаков с нулевыми значениями, удалим признаки, в которых более 85% нулей.

In [10]:
def columns_with_over_treshold_percent_zeros(df, threshold=0.85):
    columns_list = []
    for column in df.columns:
        if df[column][df[column] == 0].count() > df[column].shape[0] * threshold:
            columns_list.append(column)
    return columns_list

In [11]:
zero_cols = columns_with_over_treshold_percent_zeros(df.drop(['query_id', 'rank'], axis=1))
df.drop(columns=zero_cols, inplace=True)

Найдем в датасете колонки, которые можно отнести к категориальному типу.

In [12]:
def has_decimal(number):
    return number.is_integer()

In [13]:
cat_columns = []

for col in df.drop(['query_id', 'rank'], axis=1).columns.to_list():
    if df[f'{col}'].apply(has_decimal).all() and df[f'{col}'].nunique() <= 10:
        cat_columns.append(col)

num_columns = set(df.columns.to_list()) - set(cat_columns)

In [14]:
df[cat_columns] = df[cat_columns].astype(np.int64)

In [15]:
df.head()

Unnamed: 0,rank,query_id,feature_0,feature_2,feature_3,feature_5,feature_7,feature_9,feature_10,feature_11,...,feature_129,feature_130,feature_131,feature_132,feature_134,feature_135,feature_136,feature_137,feature_140,feature_141
0,0,1,1.0,1.0,3,0.333333,0.333333,1.0,10.0,0.0,...,153.0,3866.0,17.0,104.0,0.0,0.0,0.454545,0.890238,0.077778,0.002222
1,1,1,3.0,3.0,0,1.0,1.0,1.0,557.0,0.0,...,266.0,56137.0,5.0,2.0,0.0,0.0,0.0,0.773976,0.027826,0.00043
2,0,1,3.0,2.0,0,1.0,0.666667,1.0,522.0,0.0,...,541.0,12621.0,11.0,11.0,0.0,0.0,0.0,0.918308,0.014925,0.000104
3,1,1,3.0,3.0,0,1.0,1.0,1.0,59.0,0.0,...,14687.0,40205.0,5.0,3.0,0.0,0.0,0.0,0.975355,0.05314,0.000255
4,2,1,3.0,3.0,1,1.0,1.0,1.0,203.0,0.0,...,10577.0,34605.0,1.0,1.0,273.0,79.670665,0.2,0.990119,0.046512,0.000307


# Metrics

In [16]:
def ndcg_at_k(y_true, y_score, k=5):
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])

    gains = 2 ** y_true - 1
    discounts = np.log2(np.arange(len(y_true)) + 2)
    dcg = np.sum(gains / discounts)

    ideal_order = np.argsort(y_true)[::-1]
    ideal_y_true = np.take(y_true, ideal_order[:k])
    ideal_gains = 2 ** ideal_y_true - 1
    idcg = np.sum(ideal_gains / discounts)

    ndcg = dcg / idcg if idcg > 0 else 0
    return ndcg

# Modeling

Разделим датсет на тренировочную и тестовую выборку так, чтобы в выборку попали объекты с уникальным query_id.

In [17]:
tt_splitter = GroupShuffleSplit(n_splits=2, test_size=0.2, random_state=42)
tt_split = tt_splitter.split(df, groups=df['query_id'])
train_inds, test_inds = next(tt_split)
train_df = df.iloc[train_inds]
test_df = df.iloc[test_inds]

In [18]:
X_train = train_df.drop(['rank', 'query_id'], axis=1)
y_train = train_df['rank']
train_id = train_df['query_id']

X_test = test_df.drop(['rank', 'query_id'], axis=1)
y_test = test_df['rank']
test_id = test_df['query_id']

In [19]:
def create_weights(queries):
    query_set = np.unique(queries)
    query_weights = np.random.uniform(size=query_set.shape[0])
    weights = np.zeros(shape=queries.shape)

    for i, query_id in enumerate(query_set):
        weights[queries == query_id] = query_weights[i]

    return weights

train = Pool(
    data=X_train,
    label=y_train,
    feature_names=list(X_train.columns.values),
    cat_features=cat_columns,
    group_id=train_id,
    group_weight=create_weights(train_id)
)

test = Pool(
    data=X_test,
    label=y_test,
    feature_names=list(X_test.columns.values),
    cat_features=cat_columns,
    group_id=test_id,
    group_weight=create_weights(test_id)
)

Произведем подбор гиперпараметров с помощью optuna, выделив из тренировочной выборки валидационную часть. После чего на целом тренировочном датасете обучим модель и посчитаем метрики.

In [20]:
def objective(trial):
    train_copy = train_df.copy()
    tv_splitter = GroupShuffleSplit(n_splits=2, test_size=0.1, random_state=42)
    tv_split = tv_splitter.split(train_df, groups=train_copy['query_id'])
    train_ids, val_ids= next(tv_split)
    train = train_df.iloc[train_ids]
    val = train_df.iloc[val_ids]

    x_train = train.drop(['rank', 'query_id'], axis=1)
    y_train = train['rank']
    train_id = train['query_id']

    x_val = val.drop(['rank', 'query_id'], axis=1)
    y_val = val['rank']
    val_id = val['query_id']

    train_pool = Pool(
        data=x_train,
        label=y_train,
        feature_names=list(x_train.columns.values),
        cat_features=cat_columns,
        group_id=train_id,
        group_weight=create_weights(train_id)
    )

    val_pool = Pool(
        data=x_val,
        label=y_val,
        feature_names=list(x_val.columns.values),
        cat_features=cat_columns,
        group_id=val_id,
        group_weight=create_weights(val_id)
    )

    params = {
        'early_stopping_rounds': 100,
        'custom_metric': ['NDCG:top=5', 'PrecisionAt:top=5', 'RecallAt:top=5', 'MAP:top=5'],
        'eval_metric': 'NDCG:top=5',
        'metric_period': 20,
        'task_type': 'GPU',
        'bootstrap_type': 'Bernoulli',
        'verbose': False
    }

    suggested_params = {
        "n_estimators": trial.suggest_int("n_estimators", 10, 50),
        "learning_rate": trial.suggest_float("learning_rate", 1e-2, 0.1, log=True),
        'loss_function': trial.suggest_categorical("loss_function", ['YetiRankPairwise', 'YetiRank', 'QueryRMSE', 'QuerySoftMax']),
        "depth": trial.suggest_int("depth", 5, 8),
        "subsample": trial.suggest_float("subsample", 0.4, 0.6),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 20)
    }

    params.update(suggested_params)
    model = CatBoostRanker(**params)
    model.fit(train_pool, eval_set=val_pool)
    return model.score(val_pool)

In [21]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2024-04-28 20:05:57,968] A new study created in memory with name: no-name-66da2d42-55b0-40ec-ade3-94f2d5349969
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PrecisionAt:top=5 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric RecallAt:top=5 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric MAP:top=5 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
[I 2024-04-28 20:06:00,776] Trial 0 finished with value: 0.7617677988112355 and parameters: {'n_estimators': 28, 'learning_rate': 0.04672893312739545, 'loss_function': 'QueryRMSE', 'depth': 8, 'subsample': 0.42140079

In [22]:
model = CatBoostRanker(**study.best_params)
model.fit(train)

Groupwise loss function. OneHotMaxSize set to 10
0:	learn: 0.7658130	total: 127ms	remaining: 5.85s
1:	learn: 0.7632085	total: 224ms	remaining: 5.03s
2:	learn: 0.7610067	total: 335ms	remaining: 4.92s
3:	learn: 0.7588833	total: 433ms	remaining: 4.65s
4:	learn: 0.7569082	total: 528ms	remaining: 4.43s
5:	learn: 0.7549600	total: 631ms	remaining: 4.31s
6:	learn: 0.7533248	total: 730ms	remaining: 4.17s
7:	learn: 0.7518281	total: 823ms	remaining: 4.01s
8:	learn: 0.7503793	total: 922ms	remaining: 3.89s
9:	learn: 0.7491437	total: 1.02s	remaining: 3.76s
10:	learn: 0.7478019	total: 1.11s	remaining: 3.65s
11:	learn: 0.7466161	total: 1.22s	remaining: 3.54s
12:	learn: 0.7454338	total: 1.32s	remaining: 3.45s
13:	learn: 0.7444065	total: 1.42s	remaining: 3.35s
14:	learn: 0.7432270	total: 1.53s	remaining: 3.26s
15:	learn: 0.7423667	total: 1.63s	remaining: 3.15s
16:	learn: 0.7413360	total: 1.72s	remaining: 3.04s
17:	learn: 0.7402401	total: 1.82s	remaining: 2.94s
18:	learn: 0.7393835	total: 1.92s	remaining

<catboost.core.CatBoostRanker at 0x7b93cc34b8e0>

Чтобы убедиться в правильности рассчитанных метрик катбустом, рассчитаем с помощью самописных функций те же метрики.

In [23]:
predictions = pd.concat([test_id.reset_index(drop=True),
                        pd.DataFrame(model.predict(test),
                                     columns=['relevance']).reset_index(drop=True),
                        y_test.reset_index(drop=True)], axis=1)
predictions

Unnamed: 0,query_id,relevance,rank
0,24,-0.166182,0
1,24,-0.246101,0
2,24,-0.278578,0
3,24,-0.243083,0
4,24,-0.155659,0
...,...,...,...
46264,1991,0.412040,0
46265,1991,0.380807,1
46266,1991,-0.004008,1
46267,1991,0.118353,2


In [27]:
ndcg_5 = round(np.mean([ndcg_at_k(np.array(predictions[predictions['query_id'] == i]['rank']),
                                          np.array(predictions[predictions['query_id'] == i]['relevance'])) for i in np.array(predictions['query_id'])]), 3)
print(f"NDCG@5: {ndcg_5}")

NDCG@5: 0.809


# Conclusions

Была обучена модель CatboostRanker с подобранными с помощью optuna параметрами, а также рассчитана метрики NDCG@5 на тестовой выборке - финальное значение получилось равным 0.809.   
В качестве улучшений можно предложить:
- применить нейросетевой подход к решению задачи
- собрать ансамбль ранжирующих моделей
- заняться Feature Engineering, но для этого необходимо понимать природу полученных данных.