In [3]:
import pandas as pd
import numpy as np

In [4]:
data_path = "data/intern_task.csv"

In [ ]:
df = pd.read_csv(data_path)

## Test/Train Split

In [6]:
unique_query_ids = df["query_id"].unique()

test_query_ids = np.random.choice(unique_query_ids, size=200, replace=False)

In [7]:
train_data = df[~df["query_id"].isin(test_query_ids)]
test_data = df[df["query_id"].isin(test_query_ids)]

X_train = train_data.drop(["rank", "query_id"], axis=1)
y_train = train_data["rank"]
X_test = test_data.drop(["rank", "query_id"], axis=1)
y_test = test_data["rank"]

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((212575, 144), (22683, 144), (212575,), (22683,))

## Catboost Baseline Model

In [8]:
from catboost import CatBoostRanker, Pool
from sklearn.metrics import ndcg_score

In [9]:
train_pool = Pool(data=X_train, label=y_train, group_id=train_data["query_id"])
test_pool = Pool(data=X_test, label=y_test, group_id=test_data["query_id"])

In [51]:
model = CatBoostRanker(
    iterations=60,
    learning_rate=0.05,
    depth=6,
    loss_function="PairLogit",
    verbose=True,
    # logging_level="Info"
)

In [52]:
model.fit(train_pool)

0:	learn: 0.6904238	total: 2.65s	remaining: 2m 36s
1:	learn: 0.6880357	total: 5.33s	remaining: 2m 34s
2:	learn: 0.6859553	total: 8.36s	remaining: 2m 38s
3:	learn: 0.6844294	total: 11.4s	remaining: 2m 39s
4:	learn: 0.6827472	total: 14.1s	remaining: 2m 34s
5:	learn: 0.6812178	total: 16.8s	remaining: 2m 31s
6:	learn: 0.6796345	total: 19.5s	remaining: 2m 27s
7:	learn: 0.6783635	total: 22.2s	remaining: 2m 24s
8:	learn: 0.6771458	total: 24.9s	remaining: 2m 20s
9:	learn: 0.6758946	total: 27.5s	remaining: 2m 17s
10:	learn: 0.6744508	total: 30.2s	remaining: 2m 14s
11:	learn: 0.6733605	total: 32.8s	remaining: 2m 11s
12:	learn: 0.6723667	total: 35.6s	remaining: 2m 8s
13:	learn: 0.6714292	total: 38.4s	remaining: 2m 6s
14:	learn: 0.6704463	total: 41.3s	remaining: 2m 3s
15:	learn: 0.6695116	total: 44s	remaining: 2m 1s
16:	learn: 0.6679931	total: 46.9s	remaining: 1m 58s
17:	learn: 0.6668734	total: 50.3s	remaining: 1m 57s
18:	learn: 0.6658955	total: 53.1s	remaining: 1m 54s
19:	learn: 0.6653336	total: 

<catboost.core.CatBoostRanker at 0x7dc75cc63410>

In [63]:
predictions = model.predict(test_pool)

def calculate_ndcg(y_true, y_pred, k=10):
    ndcg = ndcg_score([y_true], [y_pred], k=k)
    return ndcg

ndcg_value = calculate_ndcg(y_test, predictions, k=None)
ndcg_value

0.9019567370041385

In [73]:
ndcg_score([[1, 2], [2, 4]], [[1, 3], [1, 2]], k=1)

1.0

In [55]:
predictions

array([ 0.07216183, -0.03240059, -0.45296262, ...,  0.13874227,
       -0.45417088, -0.43387921])