In [83]:
import pandas as pd
import numpy as np

In [84]:
data_path = "data/intern_task.csv"

In [85]:
df = pd.read_csv(data_path)

## Test/Train Split

In [86]:
unique_query_ids = df["query_id"].unique()

test_query_ids = np.random.choice(unique_query_ids, size=200, replace=False)

In [87]:
train_data = df[~df["query_id"].isin(test_query_ids)]
test_data = df[df["query_id"].isin(test_query_ids)]

X_train = train_data.drop(["rank", "query_id"], axis=1)
y_train = train_data["rank"]
X_test = test_data.drop(["rank", "query_id"], axis=1)
y_test = test_data["rank"]

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((213394, 144), (21864, 144), (213394,), (21864,))

## Catboost Baseline Model

In [88]:
from catboost import CatBoostRanker, Pool
from sklearn.metrics import ndcg_score

In [89]:
train_pool = Pool(data=X_train, label=y_train, group_id=train_data["query_id"])
test_pool = Pool(data=X_test, label=y_test, group_id=test_data["query_id"])

In [90]:
model = CatBoostRanker(
    iterations=20,
    learning_rate=0.05,
    depth=6,
    loss_function="PairLogit",
    verbose=True,
    # logging_level="Info"
)

In [91]:
model.fit(train_pool)

0:	learn: 0.6903791	total: 3.14s	remaining: 59.7s
1:	learn: 0.6880120	total: 6.17s	remaining: 55.5s
2:	learn: 0.6859358	total: 9.22s	remaining: 52.2s
3:	learn: 0.6838785	total: 12.1s	remaining: 48.3s
4:	learn: 0.6822511	total: 15s	remaining: 44.9s
5:	learn: 0.6807422	total: 18s	remaining: 41.9s
6:	learn: 0.6791770	total: 21.1s	remaining: 39.1s
7:	learn: 0.6779365	total: 24.1s	remaining: 36.2s
8:	learn: 0.6768193	total: 27s	remaining: 33s
9:	learn: 0.6755665	total: 30s	remaining: 30s
10:	learn: 0.6741296	total: 32.9s	remaining: 26.9s
11:	learn: 0.6730714	total: 35.8s	remaining: 23.9s
12:	learn: 0.6721256	total: 38.5s	remaining: 20.8s
13:	learn: 0.6712193	total: 41.2s	remaining: 17.7s
14:	learn: 0.6702518	total: 44s	remaining: 14.7s
15:	learn: 0.6693308	total: 46.8s	remaining: 11.7s
16:	learn: 0.6678282	total: 49.5s	remaining: 8.73s
17:	learn: 0.6666478	total: 52.5s	remaining: 5.83s
18:	learn: 0.6656683	total: 55.3s	remaining: 2.91s
19:	learn: 0.6651423	total: 58.1s	remaining: 0us


<catboost.core.CatBoostRanker at 0x7dc75c9a8490>

In [92]:
predictions = model.predict(test_pool)

def calculate_ndcg(y_true, y_pred, k=10):
    ndcg = ndcg_score([y_true], [y_pred], k=k)
    return ndcg

ndcg_value = calculate_ndcg(y_test, predictions, k=None)
ndcg_value

0.9038129888198401

In [93]:
ndcg_score([[1, 2], [2, 4]], [[1, 3], [1, 2]], k=1)

1.0

In [94]:
predictions

array([-0.17428641,  0.24795866, -0.07610835, ...,  0.18993445,
        0.30776963,  0.21973099])