In [4]:
import pandas as pd 
import numpy as np 
import warnings 
import gc 
from catboost import CatBoostRanker, Pool 
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

Simple dara generation 

In [38]:
import numpy as np
import pandas as pd

np.random.seed(42)

N_QUERIES = 10
ITEMS_PER_QUERY = 15

rows = []
for q in range(N_QUERIES):
    for i in range(ITEMS_PER_QUERY):
        rows.append({
            "query_id": q,
            "item_id": f"item_{i}",
            "category": np.random.choice(["electronics", "books", "clothes"]),
            "price": np.random.uniform(10, 300),
            "popularity": np.random.randint(1, 100),
            "text": np.random.choice([
                "cheap smartphone",
                "wireless headphones",
                "bestselling novel",
                "fashionable jacket"
            ]),
            "target": np.random.choice([0, 1, 2], p=[0.6, 0.3, 0.1])
        })

df = pd.DataFrame(rows).sample(frac=1.0).reset_index(drop=True) # permuting rows 

In [39]:
df

Unnamed: 0,query_id,item_id,category,price,popularity,text,target
0,5,item_0,books,248.554163,54,bestselling novel,0
1,2,item_4,clothes,58.804402,32,fashionable jacket,0
2,6,item_7,electronics,255.013840,90,wireless headphones,1
3,1,item_13,electronics,32.324174,59,bestselling novel,0
4,9,item_8,clothes,99.255400,65,wireless headphones,2
...,...,...,...,...,...,...,...
145,1,item_1,electronics,88.691219,53,fashionable jacket,0
146,4,item_13,clothes,20.423259,25,wireless headphones,0
147,9,item_9,books,264.229545,57,cheap smartphone,0
148,9,item_2,electronics,13.964870,55,wireless headphones,1


In [40]:
class Cfg: 
    TARGET = 'target' 
    GROUP_ID = 'query_id' 
    TEXT_FEATURES = ['text']
    CAT_FEATURES = ['category', 'item_id'] 
    NUM_FEATURES = ['popularity', 'price'] 

    FEATURES = CAT_FEATURES + NUM_FEATURES + TEXT_FEATURES

    PARAMS = {
        'loss_function': 'YetiRank', 
        'eval_metric': 'NDCG:top=5', 
        'learning_rate': 1e-3, 
        'early_stopping_rounds': 50, 
        'iterations': 1000, 
        'depth': 5, 
        'verbose': 10, 
        'random_seed': 42
    }

In [41]:
train, test = train_test_split(df, train_size=.8, random_state=42)

In [42]:
train = train.sort_values('query_id').reset_index(drop=True)
test = test.sort_values('query_id').reset_index(drop=True)

In [43]:
train_pool = Pool(
    data = train[Cfg.FEATURES], 
    label = train[Cfg.TARGET], 
    group_id=train[Cfg.GROUP_ID], 
    cat_features=Cfg.CAT_FEATURES, 
    text_features=Cfg.TEXT_FEATURES
)
eval_pool = Pool(
    data = test[Cfg.FEATURES], 
    label = test[Cfg.TARGET], 
    group_id=test[Cfg.GROUP_ID], 
    cat_features=Cfg.CAT_FEATURES, 
    text_features=Cfg.TEXT_FEATURES
)

In [44]:
model = CatBoostRanker(**Cfg.PARAMS)

model.fit(train_pool, eval_set=eval_pool) 

Groupwise loss function. OneHotMaxSize set to 10
0:	test: 0.8202837	best: 0.8202837 (0)	total: 12.8ms	remaining: 12.7s
10:	test: 0.8764068	best: 0.8806574 (7)	total: 29.4ms	remaining: 2.64s
20:	test: 0.8394998	best: 0.8806574 (7)	total: 39.3ms	remaining: 1.83s
30:	test: 0.8126198	best: 0.8806574 (7)	total: 47ms	remaining: 1.47s
40:	test: 0.8126198	best: 0.8806574 (7)	total: 53.6ms	remaining: 1.25s
50:	test: 0.8126198	best: 0.8806574 (7)	total: 59.5ms	remaining: 1.11s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8806573596
bestIteration = 7

Shrink model to first 8 iterations.


<catboost.core.CatBoostRanker at 0x12b5d0e90>

Predicting

In [45]:
test['score'] = model.predict(test[Cfg.FEATURES])
top10 = (
    test
    .sort_values(["query_id", "score"], ascending=[True, False])
    .groupby("query_id")
    .head(10)
)

In [46]:
top10

Unnamed: 0,query_id,item_id,category,price,popularity,text,target,score
3,0,item_5,electronics,94.456451,59,bestselling novel,0,0.007556
1,0,item_10,clothes,208.14642,44,cheap smartphone,1,0.004637
2,0,item_7,clothes,295.136957,3,cheap smartphone,1,0.000777
0,0,item_12,books,133.295204,54,wireless headphones,0,-0.006883
6,1,item_5,electronics,185.728393,41,fashionable jacket,1,0.003088
4,1,item_2,electronics,50.868025,15,cheap smartphone,0,0.002789
5,1,item_1,electronics,88.691219,53,fashionable jacket,0,-0.001322
8,2,item_8,electronics,193.96189,2,cheap smartphone,0,0.002392
7,2,item_7,books,155.776917,54,bestselling novel,0,-0.004911
9,3,item_0,electronics,210.371944,39,wireless headphones,1,0.007305


# Data with timestamp

In [24]:
class Cfg: 
    TEMPORAL_SPLIT_RATIO = .6
    RANDOM_SEED = 42

    PARAMS = {
        'loss_function': 'YetiRank', 
        'eval_metric': 'NDCG:top=5', 
        'iterations': 1, 
        'depth': 2, 
        'early_stopping_rounds': 50,
        'learning_rate': .1, 
        'verbose': 1
    }

In [6]:
np.random.seed(Cfg.RANDOM_SEED)

In [7]:
N_USERS = 1_000
N_ITEMS = 500
N_INTERACTIONS = 50_000

START_DATE = pd.Timestamp("2024-01-01")
N_DAYS = 180

item_ids = [f"book_{i}" for i in range(N_ITEMS)]

item_categories = ["fantasy", "science", "history", "programming"]

item_texts = {
    "fantasy": [
        "epic fantasy adventure",
        "magic kingdom and dragons",
        "heroic quest in mystical lands"
    ],
    "science": [
        "popular science explained",
        "physics for beginners",
        "modern astronomy guide"
    ],
    "history": [
        "historical biography",
        "ancient civilizations",
        "world war documentary"
    ],
    "programming": [
        "python programming guide",
        "machine learning basics",
        "deep learning with pytorch"
    ],
}

items_meta = pd.DataFrame({
    "item_id": item_ids,
    "category": np.random.choice(item_categories, size=N_ITEMS)
})

items_meta["text"] = items_meta["category"].apply(
    lambda c: np.random.choice(item_texts[c])
)

user_ids = np.arange(N_USERS)

user_activity = np.random.choice(
    ["hot", "cold"],
    size=N_USERS,
    p=[0.3, 0.7]
)

users_meta = pd.DataFrame({
    "user_id": user_ids,
    "user_type": user_activity
})

rows = []

for _, user in users_meta.iterrows():
    user_id = user["user_id"]
    user_type = user["user_type"]

    if user_type == "hot":
        n_events = np.random.randint(30, 100)
    else:
        n_events = np.random.randint(1, 10)

    event_days = np.random.randint(0, N_DAYS, size=n_events)
    timestamps = START_DATE + pd.to_timedelta(event_days, unit="D")

    interacted_items = np.random.choice(
        item_ids,
        size=n_events,
        replace=True
    )

    for ts, item_id in zip(timestamps, interacted_items):
        rows.append({
            "timestamp": ts,
            "user_id": user_id,
            "item_id": item_id
        })

raw_df = pd.DataFrame(rows)
raw_df = raw_df.merge(items_meta, on="item_id", how="left")
raw_df = raw_df.sample(frac=1.0).reset_index(drop=True)

In [8]:
raw_df

Unnamed: 0,timestamp,user_id,item_id,category,text
0,2024-02-20,449,book_461,fantasy,heroic quest in mystical lands
1,2024-06-06,86,book_305,fantasy,heroic quest in mystical lands
2,2024-06-10,231,book_354,programming,deep learning with pytorch
3,2024-06-23,43,book_178,fantasy,heroic quest in mystical lands
4,2024-01-30,88,book_221,history,historical biography
...,...,...,...,...,...
24691,2024-01-18,551,book_343,science,physics for beginners
24692,2024-01-26,223,book_123,history,historical biography
24693,2024-02-08,380,book_429,programming,python programming guide
24694,2024-02-23,867,book_10,history,ancient civilizations


In [9]:
raw_df = raw_df.sort_values('timestamp')
split_idx = int(len(raw_df) * Cfg.TEMPORAL_SPLIT_RATIO)

train_interactions = raw_df.iloc[:split_idx]
test_interactions = raw_df.iloc[split_idx:]

In [10]:
train_interactions

Unnamed: 0,timestamp,user_id,item_id,category,text
20618,2024-01-01,731,book_232,programming,python programming guide
18361,2024-01-01,867,book_2,fantasy,epic fantasy adventure
5682,2024-01-01,296,book_319,science,modern astronomy guide
10901,2024-01-01,823,book_28,programming,machine learning basics
17711,2024-01-01,14,book_200,history,historical biography
...,...,...,...,...,...
11370,2024-04-17,634,book_128,programming,python programming guide
11647,2024-04-17,519,book_208,science,modern astronomy guide
11552,2024-04-17,697,book_107,programming,python programming guide
17015,2024-04-17,697,book_35,fantasy,magic kingdom and dragons


In [11]:
meta_data_train = {}
meta_data_train['text'] = train_interactions.groupby('item_id')['text'].unique().to_dict()
meta_data_train['category'] = train_interactions.groupby('item_id')['category'].unique().to_dict()
meta_data_train['pop'] = (train_interactions['item_id'].value_counts() / len(raw_df['item_id'].unique())).to_dict()

meta_data_test = {}
meta_data_test['text'] = test_interactions.groupby('item_id')['text'].unique().to_dict()
meta_data_test['category'] = test_interactions.groupby('item_id')['category'].unique().to_dict()
meta_data_test['pop'] = (test_interactions['item_id'].value_counts() / len(raw_df['item_id'].unique())).to_dict()

In [12]:
top_items_train = (
    train_interactions['item_id']
    .value_counts()
    .head(200)
    .index
    .to_list()
)
top_items_test = (
    test_interactions['item_id']
    .value_counts()
    .head(200)
    .index
    .to_list()
)

In [13]:
def prepare_data_for_rank(interactions, candidates, meta_data): 
    rows = [] 

    for user_id in interactions.user_id: 
        user_history = interactions[
            interactions.user_id == user_id
        ].item_id.unique() 

        for candidate in candidates: 
            rows.append(
                {
                    'user_id': user_id, 
                    'item_id': candidate, 
                    'relevance': int(item_id in user_history), 
                    'text': meta_data['text'][candidate][0],
                    'category': str(meta_data['category'][candidate][0]), 
                    'popularity': meta_data['pop'][candidate]
                }
            )

    return pd.DataFrame(rows).sort_values('user_id').reset_index(drop=True)

In [14]:
train = prepare_data_for_rank(interactions=train_interactions, candidates=top_items_train, meta_data=meta_data_train)
test = prepare_data_for_rank(interactions=test_interactions, candidates=top_items_test, meta_data=meta_data_test)

In [15]:
train

Unnamed: 0,user_id,item_id,relevance,text,category,popularity
0,0,book_117,0,modern astronomy guide,science,0.062
1,0,book_326,0,historical biography,history,0.066
2,0,book_152,0,epic fantasy adventure,fantasy,0.066
3,0,book_2,0,epic fantasy adventure,fantasy,0.066
4,0,book_486,0,ancient civilizations,history,0.066
...,...,...,...,...,...,...
2963395,999,book_134,1,ancient civilizations,history,0.062
2963396,999,book_109,1,historical biography,history,0.062
2963397,999,book_56,1,deep learning with pytorch,programming,0.062
2963398,999,book_271,1,python programming guide,programming,0.062


In [25]:
Cfg.CAT_FEATURES = ['category']
Cfg.TEXT = ['text'] 
Cfg.FEATURES = ['text', 'category', 'popularity']
Cfg.TARGET = 'relevance' 
Cfg.GROUP = 'user_id' 

In [17]:
train = train.sort_values(Cfg.GROUP).reset_index(drop=True) 
test = test.sort_values(Cfg.GROUP).reset_index(drop=True) 

In [18]:
train_pool = Pool(
    data = train[Cfg.FEATURES], 
    label = train[Cfg.TARGET], 
    group_id = train[Cfg.GROUP], # be clear with that parameter
    cat_features = Cfg.CAT_FEATURES,
    text_features = Cfg.TEXT
)
test_pool = Pool(
    data = test[Cfg.FEATURES], 
    label = test[Cfg.TARGET], 
    group_id = test[Cfg.GROUP], # be clear with that parameter
    cat_features = Cfg.CAT_FEATURES, 
    text_features = Cfg.TEXT
)

In [26]:
Cfg.PARAMS.update({
    'subsample': 1.0
})

model = CatBoostRanker(**Cfg.PARAMS)

model.fit(train_pool, eval_set=test_pool)

Groupwise loss function. OneHotMaxSize set to 10
0:	test: 1.0000000	best: 1.0000000 (0)	total: 1m 17s	remaining: 0us

bestTest = 1
bestIteration = 0



<catboost.core.CatBoostRanker at 0x12b5c8860>

In [35]:
test['scores'] = model.predict(test_pool)

top10 = (
    test.sort_values(['user_id', 'scores'], ascending=[True, False])
    .groupby('user_id')
    .head(10)
)

In [37]:
top10

Unnamed: 0,user_id,item_id,relevance,text,category,popularity,scores
0,0,book_262,0,heroic quest in mystical lands,fantasy,0.042,0.0
1,0,book_42,0,popular science explained,science,0.062,0.0
2,0,book_320,0,python programming guide,programming,0.062,0.0
3,0,book_245,0,popular science explained,science,0.062,0.0
4,0,book_81,0,machine learning basics,programming,0.062,0.0
...,...,...,...,...,...,...,...
1975405,999,book_463,0,epic fantasy adventure,fantasy,0.042,0.0
1975406,999,book_329,0,machine learning basics,programming,0.042,0.0
1975407,999,book_376,0,python programming guide,programming,0.042,0.0
1975408,999,book_37,0,epic fantasy adventure,fantasy,0.042,0.0
