ранжирование

In [None]:
import pandas as pd
from catboost import CatBoostRanker, Pool
from sklearn.model_selection import train_test_split

# Example data
# df columns: ['user_id', 'item_id', 'query_id', 'feature1', 'feature2', 'relevance']
# group_id: 'query_id'
# cat_features: ['user_id', 'item_id']

df = pd.read_csv('l2r_data.csv')

cat_features = ['user_id', 'item_id']
group_id = 'query_id'
target = 'relevance'

# CatBoost requires all rows of one group to be adjacent in a Pool, so order
# the frame by group first. A stable sort keeps the original in-group order.
df = df.sort_values(group_id, kind='stable')

X = df.drop(columns=[target, group_id])
y = df[target]
groups = df[group_id]

# Hold out ~20% of whole groups (not rows) for validation. A row-level random
# split would scatter rows of the same query across train and validation,
# which both leaks information and breaks group contiguity.
val_group_ids = pd.Series(groups.unique()).sample(frac=0.2, random_state=42)
is_val = groups.isin(val_group_ids)
assert is_val.any() and not is_val.all(), 'need enough groups for a train/validation split'

# Boolean masks keep X, y and groups row-aligned and preserve the sorted order.
X_train, X_val = X[~is_val], X[is_val]
y_train, y_val = y[~is_val], y[is_val]
groups_train, groups_val = groups[~is_val], groups[is_val]

train_pool = Pool(
    data=X_train,
    label=y_train,
    cat_features=cat_features,
    group_id=groups_train
)

val_pool = Pool(
    data=X_val,
    label=y_val,
    cat_features=cat_features,
    group_id=groups_val
)

model = CatBoostRanker(
    loss_function='YetiRank',
    eval_metric='NDCG',
    iterations=1000,
    early_stopping_rounds=50,
    verbose=100,
    random_seed=42
)

model.fit(train_pool, eval_set=val_pool, plot=False)

# Score the validation rows through the Pool so categorical columns are
# handled exactly as they were during training.
preds = model.predict(val_pool)

ранжирование на временных признаках

In [None]:
import pandas as pd
from catboost import CatBoostRanker, Pool
from sklearn.preprocessing import LabelEncoder

# Example data
# df columns: ['user_id', 'item_id', 'sales_lag_1', 'sales_ma_7', 'price_change', 'purchased', 'timestamp']
# group_id: 'user_id'

df = pd.read_csv('ts_l2r_data.csv')

# Sort by user, then by time: this keeps each user's rows adjacent (CatBoost
# requires contiguous groups) and in chronological order inside the group.
df = df.sort_values(['user_id', 'timestamp'])

cat_features = ['item_id']
group_id = 'user_id'
target = 'purchased'

X = df.drop(columns=[target, group_id, 'timestamp'])
y = df[target]
groups = df[group_id]

# Label-encode the categorical columns to integer codes.
# NOTE(review): CatBoost also accepts raw categorical values; the encoder is
# re-fitted per column, so `le` only remembers the last column it saw.
le = LabelEncoder()
X[cat_features] = X[cat_features].apply(le.fit_transform)

# Hold out ~20% of whole users for validation. The previous row-level split
# stratified X/y but not `groups`, so the two calls shuffled differently and
# the group ids no longer matched their rows; it also scattered each user's
# rows across both sets.
# NOTE(review): a timestamp cutoff is an alternative if validation must be
# strictly later in time than training — confirm which is wanted.
val_group_ids = pd.Series(groups.unique()).sample(frac=0.2, random_state=42)
is_val = groups.isin(val_group_ids)
assert is_val.any() and not is_val.all(), 'need enough groups for a train/validation split'

# Boolean masks keep X, y and groups row-aligned and preserve the sorted order.
X_train, X_val = X[~is_val], X[is_val]
y_train, y_val = y[~is_val], y[is_val]
groups_train, groups_val = groups[~is_val], groups[is_val]

train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features, group_id=groups_train)
val_pool = Pool(data=X_val, label=y_val, cat_features=cat_features, group_id=groups_val)

model = CatBoostRanker(
    loss_function='PairLogitPairwise',
    eval_metric='MAP',
    iterations=1000,
    early_stopping_rounds=50,
    verbose=100,
    random_seed=42
)

model.fit(train_pool, eval_set=val_pool, plot=False)

# Score the validation rows through the Pool used for evaluation.
preds = model.predict(val_pool)

ранжирование временных рядов

In [None]:
import numpy as np
import pandas as pd
from scipy import stats
from catboost import CatBoostRanker, Pool

# Предположим, df_ts — это датафрейм с признаками, вычисленными из временных рядов
# df_ts: ['ts_id', 'group_id', 'region', 'trend', 'volatility', 'acf1', 'entropy', 'importance_rank']

def extract_ts_features(ts: pd.Series) -> dict:
    """Summarise a time series as a flat dict of scalar features.

    Args:
        ts: Numeric series; its positional order is treated as the time axis.

    Returns:
        A dict with the keys:
          * 'mean'    - ``ts.mean()`` (pandas defaults).
          * 'std'     - ``ts.std()`` (pandas defaults).
          * 'trend'   - slope of a degree-1 least-squares fit of value against
                        position, using finite observations only; 0 when
                        fewer than two finite observations exist.
          * 'acf1'    - lag-1 autocorrelation; 0 when the series has fewer
                        than two elements.
          * 'entropy' - Shannon entropy (natural log) of the series after
                        shifting it so its minimum is 0 and normalising it to
                        unit total mass; 0.0 for empty or constant series.
    """
    features = {}
    features['mean'] = ts.mean()
    features['std'] = ts.std()

    values = ts.to_numpy(dtype=float, na_value=np.nan)
    valid = np.isfinite(values)
    finite = values[valid]

    # Fit the slope only on finite points, keeping their original positions
    # on the time axis; polyfit cannot handle NaN/inf inputs.
    if finite.size > 1:
        features['trend'] = np.polyfit(np.flatnonzero(valid), finite, 1)[0]
    else:
        features['trend'] = 0

    features['acf1'] = ts.autocorr(lag=1) if len(ts) > 1 else 0

    # Entropy needs a probability distribution: shift to non-negative values
    # and divide by the total so the weights sum to 1. Zero weights contribute
    # nothing (0 * log 0 := 0), so they are dropped before taking the log.
    entropy = 0.0
    if finite.size > 0:
        mass = finite - finite.min()
        total = mass.sum()
        if total > 0:
            probs = mass[mass > 0] / total
            entropy = float(-(probs * np.log(probs)).sum())
    features['entropy'] = entropy

    return features

# Example: df_ts holds one row per time series with pre-computed features
df_ts = pd.read_csv('ts_features.csv')

cat_features = ['region']
group_id = 'group_id'  # e.g. a day or a quarter
target = 'importance_rank'
# NOTE(review): YetiRank/NDCG treat larger labels as more relevant. If
# 'importance_rank' uses 1 for the most important series, invert it before
# training — confirm the label direction.

# CatBoost requires all rows of one group to be adjacent in a Pool, so order
# the frame by group first. A stable sort keeps the original in-group order.
df_ts = df_ts.sort_values(group_id, kind='stable')

X = df_ts.drop(columns=[target, group_id, 'ts_id'])
y = df_ts[target]
groups = df_ts[group_id]

# Hold out ~20% of whole groups (not rows) for validation; a row-level random
# split would scatter rows of the same group across train and validation.
val_group_ids = pd.Series(groups.unique()).sample(frac=0.2, random_state=42)
is_val = groups.isin(val_group_ids)
assert is_val.any() and not is_val.all(), 'need enough groups for a train/validation split'

# Boolean masks keep X, y and groups row-aligned and preserve the sorted order.
X_train, X_val = X[~is_val], X[is_val]
y_train, y_val = y[~is_val], y[is_val]
groups_train, groups_val = groups[~is_val], groups[is_val]

train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features, group_id=groups_train)
val_pool = Pool(data=X_val, label=y_val, cat_features=cat_features, group_id=groups_val)

model = CatBoostRanker(
    loss_function='YetiRank',
    eval_metric='NDCG',
    iterations=500,
    early_stopping_rounds=30,
    verbose=100,
    random_seed=42
)

model.fit(train_pool, eval_set=val_pool, plot=False)

# Score the validation rows through the Pool used for evaluation.
preds = model.predict(val_pool)

форкастинг как задача ранжирования

In [None]:
import pandas as pd
from catboost import CatBoostRanker, Pool

# df columns: ['series_id', 'lag_1', 'lag_2', 'ma_5', 'seasonal_feature', 'target', 'is_anomaly']
# group_id: 'series_id'

df = pd.read_csv('ts_forecasting_as_ltr.csv')

# 'series_id' is the group key and is dropped from the feature matrix below,
# so it cannot also be listed as a categorical feature (the Pool would not
# find the column). The remaining feature columns listed above are numeric.
cat_features = []
group_id = 'series_id'
target = 'is_anomaly'
# NOTE(review): the column literally named 'target' stays in X as a feature.
# Confirm it is available at prediction time and is not a leak.

# CatBoost requires all rows of one group to be adjacent in a Pool, so order
# the frame by group first. A stable sort keeps the original in-group order.
df = df.sort_values(group_id, kind='stable')

X = df.drop(columns=[target, group_id])
y = df[target]
groups = df[group_id]

# Hold out ~20% of whole series (not rows) for validation; a row-level random
# split would scatter rows of the same series across train and validation.
val_group_ids = pd.Series(groups.unique()).sample(frac=0.2, random_state=42)
is_val = groups.isin(val_group_ids)
assert is_val.any() and not is_val.all(), 'need enough groups for a train/validation split'

# Boolean masks keep X, y and groups row-aligned and preserve the sorted order.
X_train, X_val = X[~is_val], X[is_val]
y_train, y_val = y[~is_val], y[is_val]
groups_train, groups_val = groups[~is_val], groups[is_val]

train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features, group_id=groups_train)
val_pool = Pool(data=X_val, label=y_val, cat_features=cat_features, group_id=groups_val)

model = CatBoostRanker(
    loss_function='PairLogit',
    eval_metric='AUC',
    iterations=1000,
    early_stopping_rounds=50,
    verbose=100,
    random_seed=42
)

model.fit(train_pool, eval_set=val_pool, plot=False)

# Score the validation rows through the Pool used for evaluation.
preds = model.predict(val_pool)

получение топ-N для каждой группы

In [None]:
# Assume a test set is available (e.g. for a submission)
# X_test columns: ['user_id', 'item_id', 'feature1', ...]
X_test = pd.read_csv('test_data.csv')

# Set the grouping key explicitly instead of inheriting `group_id` from
# whichever training cell happened to run last.
group_id = 'user_id'
TOP_N = 20
groups_test = X_test[group_id]

# Feed the model exactly the columns it was trained on, in training order,
# and take the categorical positions from the model itself rather than from
# a `cat_features` list left over from an earlier cell.
feature_cols = list(model.feature_names_)
test_pool = Pool(
    data=X_test[feature_cols],
    cat_features=model.get_cat_feature_indices()
)

# Predictions
preds = model.predict(test_pool)

# Attach the scores to the frame
X_test['pred'] = preds

# Order by group, then by score from highest to lowest
X_test = X_test.sort_values([group_id, 'pred'], ascending=[True, False])

# Keep the TOP_N best-scored rows of every group
top_20_per_group = X_test.groupby(group_id).head(TOP_N)

# Submission: user_id -> space-separated list of item_id
submission = top_20_per_group.groupby(group_id)['item_id'].apply(lambda x: ' '.join(x.astype(str))).reset_index()
submission.columns = [group_id, 'item_id_list']

сабмит


In [None]:
# Long format: one (user_id, item_id) pair per row
submission_long = top_20_per_group.loc[:, ['user_id', 'item_id']].reset_index(drop=True)

# Wide format: one row per user, item ids joined into a space-separated string
submission = (
    top_20_per_group
    .groupby('user_id')['item_id']
    .apply(lambda ids: ' '.join(ids.astype(str)))
    .rename('item_id_list')
    .reset_index()
)

# Only the wide format is written to disk
submission.to_csv('submission.csv', index=False)

# preds1 = model1.predict(test_pool)
# preds2 = model2.predict(test_pool)
# preds3 = model3.predict(test_pool)

# # Усреднение
# X_test['pred'] = (preds1 + preds2 + preds3) / 3


# или дата --- топ несколько 
# # Допустим, df_ts с признаками по дням
# # df_ts: ['date', 'item_id', 'feature1', 'feature2', 'importance_rank']

# X_test['pred'] = model.predict(test_pool)
# X_test = X_test.sort_values(['date', 'pred'], ascending=[True, False])
# top_20_per_date = X_test.groupby('date').head(20)

# submission = top_20_per_date.groupby('date')['item_id'].apply(
#     lambda x: ' '.join(x.astype(str))
# ).reset_index()
# submission.columns = ['date', 'top_20_items']