## Кроссвалидация

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from notebooks.helpers import explore_data_modern, load_latest_params
from catboost import Pool,CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

Настройки отображения

In [2]:
pd.set_option('display.max_columns', 50)
plt.style.use('ggplot') 

Загрузка данных

In [3]:
try:
    X_CROSS = pd.read_parquet('../data/datasets/cross.parquet')
            
    print("✅ Данные загружены!")
    print(f"Cross validation frame: {X_CROSS.shape[0]} строк")
except Exception as e:
    print(f"❌ Ошибка: {e}")

✅ Данные загружены!
Cross validation frame: 1385812 строк


In [4]:
display(explore_data_modern(X_CROSS, 'Train'))


🔍 Анализ датафрейма: Train


Unnamed: 0,Тип данных,Уникальных,Пропусков,% Пропусков,Пример значения
client_id,object,1091884,0,0.0%,1338357431.1640964866
geo_country,object,155,0,0.0%,Russia
is_peak_hour,int64,2,0,0.0%,1
is_weekend,int32,2,0,0.0%,0
visit_day_week,int64,7,0,0.0%,4
visit_season,object,4,0,0.0%,winter
visit_month,int64,8,0,0.0%,12
frequent_visitor,int32,2,0,0.0%,0
brand_tier,object,3,0,0.0%,other
is_returning,int32,2,0,0.0%,0


In [5]:
def cross_validation(features, cat_features, best_params=None):
    print(f"Feature set {features}")
    default_params = {
            'iterations': 500,
            'random_seed': 42,
            'auto_class_weights': 'Balanced',
            'verbose': 0,
            'task_type': 'GPU',
            'devices': '0'
        }
    params = best_params if best_params else default_params
    print(params)
    X = X_CROSS[features]
    y = X_CROSS['target']
    # Настройки кросс-валидации
    n_splits = 5
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Хранение результатов
    auc_scores = []
    iteration = 0
    for train_index, val_index in skf.split(X, y):
        iteration += 1
        # Разбиение данных с учетом стратификации
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
        # Создание Pool объектов (оптимизировано для CatBoost)
        train_pool = Pool(X_train, y_train, cat_features=cat_features)
        val_pool = Pool(X_val, y_val, cat_features=cat_features)
    
        # Обучение модели
        model = CatBoostClassifier(**params)
        model.fit(train_pool, eval_set=val_pool)
    
        # Предсказание и оценка
        y_pred = model.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, y_pred)
        auc_scores.append(auc)
        print(f"Фолд {iteration} ROC-AUC: {auc:.4f}")

    # Итоговые метрики
    print(f"\nСредний AUC: {np.mean(auc_scores):.4f} ± {np.std(auc_scores):.4f}")

In [6]:
loads_params = load_latest_params()

In [31]:
param_features = ['utm_source', 'utm_medium', 'device_brand', 'visit_number', 'utm_campaign', 'utm_keyword']
category_features = ['utm_source', 'utm_medium', 'device_brand', 'utm_campaign', 'utm_keyword']

In [32]:
cross_validation(param_features, category_features, loads_params)

Feature set ['utm_source', 'utm_medium', 'device_brand', 'visit_number', 'utm_campaign', 'utm_keyword']
{'iterations': 672, 'depth': 8, 'learning_rate': 0.08743604703974947, 'l2_leaf_reg': 4, 'random_seed': 42, 'task_type': 'GPU', 'devices': '0', 'auto_class_weights': 'Balanced', 'verbose': 0}
Фолд 1 ROC-AUC: 0.7067
Фолд 2 ROC-AUC: 0.7130
Фолд 3 ROC-AUC: 0.7152
Фолд 4 ROC-AUC: 0.7105
Фолд 5 ROC-AUC: 0.7136

Средний AUC: 0.7118 ± 0.0030


In [23]:
param_features = ['utm_source', 'utm_medium', 'device_brand', 'visit_number', 'utm_campaign', 'has_utm_keyword']
category_features = ['utm_source', 'utm_medium', 'device_brand', 'utm_campaign']

In [24]:
cross_validation(param_features, category_features, loads_params)

Feature set ['utm_source', 'utm_medium', 'device_brand', 'visit_number', 'utm_campaign', 'has_utm_keyword']
{'iterations': 672, 'depth': 8, 'learning_rate': 0.08743604703974947, 'l2_leaf_reg': 4, 'random_seed': 42, 'task_type': 'GPU', 'devices': '0', 'auto_class_weights': 'Balanced', 'verbose': 0}
Фолд 1 ROC-AUC: 0.7066
Фолд 2 ROC-AUC: 0.7132
Фолд 3 ROC-AUC: 0.7145
Фолд 4 ROC-AUC: 0.7100
Фолд 5 ROC-AUC: 0.7134

Средний AUC: 0.7115 ± 0.0029


In [25]:
param_features = ['utm_source', 'utm_medium', 'device_brand', 'is_returning', 'utm_campaign', 'has_utm_keyword']
category_features = ['utm_source', 'utm_medium', 'device_brand', 'utm_campaign']

In [26]:
cross_validation(param_features, category_features, loads_params)

Feature set ['utm_source', 'utm_medium', 'device_brand', 'is_returning', 'utm_campaign', 'has_utm_keyword']
{'iterations': 672, 'depth': 8, 'learning_rate': 0.08743604703974947, 'l2_leaf_reg': 4, 'random_seed': 42, 'task_type': 'GPU', 'devices': '0', 'auto_class_weights': 'Balanced', 'verbose': 0}
Фолд 1 ROC-AUC: 0.7038
Фолд 2 ROC-AUC: 0.7106
Фолд 3 ROC-AUC: 0.7111
Фолд 4 ROC-AUC: 0.7065
Фолд 5 ROC-AUC: 0.7098

Средний AUC: 0.7084 ± 0.0028


In [7]:
param_features = ['utm_source', 'utm_medium', 'device_brand', 'visit_number', 'utm_campaign', 'utm_keyword', 'visit_month']
category_features = ['utm_source', 'utm_medium', 'device_brand', 'utm_campaign', 'utm_keyword']

In [8]:
cross_validation(param_features, category_features, loads_params)

Feature set ['utm_source', 'utm_medium', 'device_brand', 'visit_number', 'utm_campaign', 'utm_keyword', 'visit_month']
{'iterations': 672, 'depth': 8, 'learning_rate': 0.08743604703974947, 'l2_leaf_reg': 4, 'random_seed': 42, 'task_type': 'GPU', 'devices': '0', 'auto_class_weights': 'Balanced', 'verbose': 0}
Фолд 1 ROC-AUC: 0.7331
Фолд 2 ROC-AUC: 0.7350
Фолд 3 ROC-AUC: 0.7354
Фолд 4 ROC-AUC: 0.7322
Фолд 5 ROC-AUC: 0.7356

Средний AUC: 0.7343 ± 0.0013


In [12]:
del X_CROSS