In [78]:
import pandas as pd
import numpy as np

import optuna
from catboost import CatBoostRanker, Pool
from sklearn.metrics import ndcg_score

In [18]:
# Input CSV locations (relative paths, resolved against the working directory).
cleaned_data, pca_20_features = (
    "data/cleaned_data.csv",
    "data/pca_20_features.csv",
)

In [79]:
# Load both input tables in one pass, in the same order as before.
# NOTE(review): pca_20_features_df is not referenced by the cells visible here.
cleaned_data_df, pca_20_features_df = map(
    pd.read_csv, (cleaned_data, pca_20_features)
)

In [80]:
# Table used for the split and modelling below. This is an alias of
# cleaned_data_df (same object), not a copy.
data_df = cleaned_data_df

## Test/Train Split

In [81]:
def test_train_split(df, test_size=0.2):
    # drop top 6 longest queries from test set
    unique_query_ids = df.value_counts("query_id").sort_values(ascending=False)[6:].keys().values

    test_query_ids = np.random.choice(unique_query_ids, size=int(test_size * len(unique_query_ids)), replace=False)
    
    train = df[~df["query_id"].isin(test_query_ids)]
    test = df[df["query_id"].isin(test_query_ids)]
    
    return train, test

In [82]:
# test_train_split samples test queries from NumPy's global random state, so
# fix that state here; otherwise train/test membership (and every score
# computed from it) changes on each run.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)
train_data, test_data = test_train_split(data_df, test_size=0.10)

# A query must never appear on both sides of the split.
assert set(train_data["query_id"]).isdisjoint(test_data["query_id"])

# "rank" is the label and "query_id" is only used for grouping, so neither is
# passed to the model as a feature.
NON_FEATURE_COLUMNS = ["rank", "query_id"]
X_train = train_data.drop(columns=NON_FEATURE_COLUMNS)
y_train = train_data["rank"]
X_test = test_data.drop(columns=NON_FEATURE_COLUMNS)
y_test = test_data["rank"]

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((210584, 137), (24674, 137), (210584,), (24674,))

In [83]:
# Build the train and test pools the same way: features, "rank" labels, and the
# query id of each row as the group id.
train_pool, test_pool = (
    Pool(data=features, label=labels, group_id=frame["query_id"])
    for features, labels, frame in (
        (X_train, y_train, train_data),
        (X_test, y_test, test_data),
    )
)

## CatBoost Baseline Model

In [84]:
def calculate_ndcg_per_query(test_data, predictions, k=5):
    """Return the mean NDCG@k over the queries in ``test_data``.

    ``predictions`` is attached as a ``preds`` column on a copy of
    ``test_data`` (the caller's frame is not modified). Rows are grouped by
    ``query_id``; each group is scored with ``ndcg_score`` using its ``rank``
    column as the true values and ``preds`` as the scores, and the per-query
    results are averaged with ``np.mean``.
    """
    scored = test_data.assign(preds=predictions)

    per_query_ndcg = [
        ndcg_score([rows["rank"].tolist()], [rows["preds"].tolist()], k=k)
        for _, rows in scored.groupby("query_id")
    ]

    return np.mean(per_query_ndcg)

In [85]:
def objective(trial):
    """Optuna objective: fit a YetiRank ``CatBoostRanker`` and return mean NDCG@5.

    Search space:
        iterations     int    100 .. 500
        learning_rate  float  0.01 .. 0.1
        depth          int    4 .. 10
        l2_leaf_reg    int    3 .. 10

    Best result recorded from an earlier run:
        params: {'iterations': 477, 'learning_rate': 0.09038144015024499, 'depth': 5, 'l2_leaf_reg': 10}
        value:  0.5728842563444173

    Uses the notebook-level ``train_pool``, ``test_pool`` and ``test_data``.

    NOTE(review): ``test_pool`` is passed to ``fit`` as ``eval_set`` and is also
    the data the returned score is computed on, so hyperparameters are being
    selected on the test set. Consider a separate validation split.
    """
    hyperparams = {
        "iterations": trial.suggest_int("iterations", 100, 500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 4, 10),
        "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 3, 10),
    }

    ranker = CatBoostRanker(loss_function="YetiRank", **hyperparams)
    ranker.fit(train_pool, eval_set=test_pool, verbose=False)

    test_scores = ranker.predict(test_pool)
    return calculate_ndcg_per_query(test_data, test_scores, k=5)

In [87]:
# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# from run to run. TPESampler is the sampler Optuna uses by default for a
# single-objective study, so only the seeding changes.
SAMPLER_SEED = 42
N_TRIALS = 50

study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SAMPLER_SEED),
)
study.optimize(objective, n_trials=N_TRIALS)

print("Best trial:")
trial = study.best_trial

print(f"NDCG@5: {trial.value}")
print("Best hyperparameters:")
print(trial.params)

[I 2024-04-27 15:13:40,716] A new study created in memory with name: no-name-79ea1dc7-3f5d-4d05-b17f-d107e789a022
[I 2024-04-27 15:14:42,245] Trial 0 finished with value: 0.5475602654752842 and parameters: {'iterations': 314, 'learning_rate': 0.03409210259383675, 'depth': 8, 'l2_leaf_reg': 8}. Best is trial 0 with value: 0.5475602654752842.
[I 2024-04-27 15:15:37,371] Trial 1 finished with value: 0.5462465989412176 and parameters: {'iterations': 266, 'learning_rate': 0.035982763916144904, 'depth': 8, 'l2_leaf_reg': 10}. Best is trial 0 with value: 0.5475602654752842.
[I 2024-04-27 15:16:39,903] Trial 2 finished with value: 0.5290153597282474 and parameters: {'iterations': 388, 'learning_rate': 0.010607212682221714, 'depth': 5, 'l2_leaf_reg': 3}. Best is trial 0 with value: 0.5475602654752842.
[I 2024-04-27 15:17:16,362] Trial 3 finished with value: 0.5509663339468761 and parameters: {'iterations': 189, 'learning_rate': 0.06310529271532768, 'depth': 8, 'l2_leaf_reg': 7}. Best is trial 3

KeyboardInterrupt: 

In [77]:
# Leftover manual evaluation, kept commented out.
# NOTE(review): no notebook-level `model` is defined in the cells visible here
# (it exists only as a local inside `objective`), so a fitted model would be
# needed before this could be re-enabled.
# The label in the print call below is Russian for "Mean NDCG@5:".
# predictions = model.predict(test_pool)
# mean_ndcg = calculate_ndcg_per_query(test_data, predictions, k=5)
# print("Средний NDCG@5:", mean_ndcg)

Средний NDCG@5: 0.5681112085492219


In [27]:
# Quick check of ndcg_score on two toy queries of two documents each,
# scoring only the top-1 position.
ndcg_score(
    y_true=[[1, 2], [2, 4]],
    y_score=[[1, 3], [1, 2]],
    k=1,
)

1.0

array([-0.17428641,  0.24795866, -0.07610835, ...,  0.18993445,
        0.30776963,  0.21973099])