## Финализация

In [21]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from notebooks.helpers import explore_data_modern, load_latest_params
from catboost import Pool,CatBoostClassifier
import json
from datetime import datetime
from sklearn.metrics import roc_auc_score, classification_report

Настройки отображения

In [6]:
# Notebook-wide display configuration: cap the number of DataFrame columns
# rendered at 50 and switch matplotlib to the 'ggplot' style sheet.
pd.options.display.max_columns = 50
plt.style.use('ggplot')

Загрузка данных

In [7]:
# Load the training frame (cross.parquet) and the hold-out frame (test.parquet).
# Every later cell needs both frames, so a load failure is reported and then
# re-raised instead of being swallowed: otherwise the notebook would keep
# running and fail further down with an unrelated NameError.
try:
    X_TRAIN = pd.read_parquet('../data/datasets/cross.parquet')
    X_TEST = pd.read_parquet('../data/datasets/test.parquet')
except Exception as e:
    print(f"❌ Ошибка: {e}")
    raise
else:
    print("✅ Данные загружены!")
    print(f"Train frame: {X_TRAIN.shape[0]} строк")

✅ Данные загружены!
Train frame: 1385812 строк


In [8]:
# Render an overview of the training frame produced by the project helper
# explore_data_modern. Judging by the rendered output it is a per-column table
# (dtype, unique count, missing count / percentage, sample value).
display(explore_data_modern(X_TRAIN, 'Train'))


🔍 Анализ датафрейма: Train


Unnamed: 0,Тип данных,Уникальных,Пропусков,% Пропусков,Пример значения
client_id,object,1091884,0,0.0%,1338357431.1640964866
device_screen_resolution,object,4414,0,0.0%,1792x1120
brand_tier,object,3,0,0.0%,other
is_returning,int32,2,0,0.0%,0
visit_time_minutes,int64,1440,0,0.0%,1114
has_utm_keyword,int32,2,0,0.0%,1
utm_keyword_campaign_fill,object,1164,0,0.0%,qUcotcWimEOQiboVPcCx
utm_keyword_notset_fill,object,1143,0,0.0%,qUcotcWimEOQiboVPcCx
target,int32,2,0,0.0%,0
geo_city,object,2201,0,0.0%,Tula


In [34]:
def finalize_model(features, cat_features, best_params, model_name='',
                   train_df=None, test_df=None):
    """
    Train the final model, evaluate it on the hold-out set and save the artifacts.

    Args:
        features: Names of the columns used as model inputs.
        cat_features: Names of the columns passed to CatBoost as categorical;
            every name must also appear in ``features``. May be ``None``.
        best_params: Keyword arguments forwarded to ``CatBoostClassifier``.
        model_name: Sub-directory of ``../data/models`` that receives the
            artifacts. An empty string writes into ``../data/models`` itself.
        train_df: Training frame containing the feature columns and a
            ``target`` column. Defaults to the notebook-level ``X_TRAIN``.
        test_df: Hold-out frame with the same columns. Defaults to the
            notebook-level ``X_TEST``.

    Returns:
        A ``(final_model, metadata)`` tuple. ``metadata`` is the dict written
        to the params JSON file: features, categorical features, model
        parameters and hold-out performance (ROC-AUC, classification report,
        timestamp).

    Raises:
        ValueError: If ``cat_features`` names a column missing from ``features``.
        KeyError: If a feature or the ``target`` column is absent from a frame.
    """
    # Validate the cheap things first so a typo fails before any directory is
    # created or a long training run is started.
    features = list(features)
    unknown_cat = [name for name in (cat_features or []) if name not in features]
    if unknown_cat:
        raise ValueError(
            f"cat_features not present in features: {unknown_cat}"
        )

    # Fall back to the notebook-level frames when none are passed explicitly.
    train = X_TRAIN if train_df is None else train_df
    test = X_TEST if test_df is None else test_df

    required = features + ['target']
    for frame_name, frame in (('train', train), ('test', test)):
        missing = [name for name in required if name not in frame.columns]
        if missing:
            raise KeyError(f"columns missing from the {frame_name} frame: {missing}")

    # 1. Artifact locations; the timestamp keeps successive runs apart.
    base_dir = os.path.abspath('../data/models')
    model_dir = os.path.join(base_dir, f"{model_name}")
    os.makedirs(model_dir, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_path = os.path.join(model_dir, f"model_{timestamp}.cbm")
    params_path = os.path.join(model_dir, f"params_{timestamp}.json")
    metrics_path = os.path.join(model_dir, f"metrics_{timestamp}.json")

    X_train, y_train = train[features], train['target']
    X_test, y_test = test[features], test['target']

    print("🔧 Финальные параметры модели:")
    for k, v in best_params.items():
        print(f"{k}: {v}")

    # 2. Fit the final model on the whole training frame.
    print("\n🚀 Обучение финальной модели на всех тренировочных данных...")
    final_model = CatBoostClassifier(**best_params)
    final_model.fit(
        Pool(X_train, y_train, cat_features=cat_features),
        plot=True
    )

    # 3. Evaluate on the hold-out frame. Class labels are predicted once and
    #    reused for both the stored and the printed classification report.
    print("\n🧪 Тестирование на отложенной выборке...")
    test_pred = final_model.predict_proba(X_test)[:, 1]
    test_auc = roc_auc_score(y_test, test_pred)
    test_labels = final_model.predict(X_test)

    test_report = classification_report(y_test, test_labels, output_dict=True)

    # 4. Persist the model, its configuration and the metrics.
    final_model.save_model(model_path)

    metadata = {
        'features': features,
        'cat_features': cat_features,
        'params': best_params,
        'performance': {
            'test_auc': float(test_auc),  # plain float instead of numpy.float64 for JSON
            'classification_report': test_report,
            'timestamp': timestamp
        }
    }

    with open(params_path, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=4, ensure_ascii=False)

    with open(metrics_path, 'w', encoding='utf-8') as f:
        json.dump({
            'test_auc': float(test_auc),
            'classification_report': test_report
        }, f, indent=4, ensure_ascii=False)

    # 5. Human-readable summary.
    print("\n📊 Результаты на тестовых данных:")
    print(f"- ROC-AUC: {test_auc:.4f}")
    print("\n📝 Classification Report:")
    print(classification_report(y_test, test_labels))

    print("\n💾 Сохраненные артефакты:")
    print(f"- Модель: {model_path}")
    print(f"- Параметры: {params_path}")
    print(f"- Метрики: {metrics_path}")

    return final_model, metadata


In [35]:
# Hyperparameter dict that finalize_model unpacks into CatBoostClassifier.
# NOTE(review): load_latest_params is imported from notebooks.helpers; presumably
# it reads the most recently saved tuning result — confirm against the helper.
loads_params = load_latest_params()

In [36]:
# Baseline feature set: raw UTM fields, device brand and the visit counter.
param_features = [
    'utm_source',
    'utm_medium',
    'device_brand',
    'visit_number',
    'utm_campaign',
    'utm_keyword',
]
# Every feature except visit_number is handed to CatBoost as categorical.
category_features = [name for name in param_features if name != 'visit_number']

In [37]:
# Train, evaluate and save the baseline variant under ../data/models/base.
finalize_model(
    features=param_features,
    cat_features=category_features,
    best_params=loads_params,
    model_name='base',
)

🔧 Финальные параметры модели:
iterations: 672
depth: 8
learning_rate: 0.08743604703974947
l2_leaf_reg: 4
random_seed: 42
task_type: GPU
devices: 0
auto_class_weights: Balanced
verbose: 0

🚀 Обучение финальной модели на всех тренировочных данных...


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


🧪 Тестирование на отложенной выборке...

📊 Результаты на тестовых данных:
- ROC-AUC: 0.7041

📝 Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.60      0.75    340308
           1       0.03      0.69      0.06      6146

    accuracy                           0.60    346454
   macro avg       0.51      0.64      0.40    346454
weighted avg       0.97      0.60      0.73    346454


💾 Сохраненные артефакты:
- Модель: F:\Projects\tgu\hahaton25_2\sberautopodpiska\data\models\base\model_20250712_134303.cbm
- Параметры: F:\Projects\tgu\hahaton25_2\sberautopodpiska\data\models\base\params_20250712_134303_params.json
- Метрики: F:\Projects\tgu\hahaton25_2\sberautopodpiska\data\models\base\metrics_20250712_134303_metrics.json


(<catboost.core.CatBoostClassifier at 0x1c44b3c16a0>,
 {'features': ['utm_source',
   'utm_medium',
   'device_brand',
   'visit_number',
   'utm_campaign',
   'utm_keyword'],
  'cat_features': ['utm_source',
   'utm_medium',
   'device_brand',
   'utm_campaign',
   'utm_keyword'],
  'params': {'iterations': 672,
   'depth': 8,
   'learning_rate': 0.08743604703974947,
   'l2_leaf_reg': 4,
   'random_seed': 42,
   'task_type': 'GPU',
   'devices': '0',
   'auto_class_weights': 'Balanced',
   'verbose': 0},
  'performance': {'test_auc': 0.7040858148213516,
   'classification_report': {'0': {'precision': 0.9906830636965285,
     'recall': 0.5986665021098534,
     'f1-score': 0.7463289098918409,
     'support': 340308.0},
    '1': {'precision': 0.03004112011476702,
     'recall': 0.688252521965506,
     'f1-score': 0.05756942695964016,
     'support': 6146.0},
    'accuracy': 0.6002557338059309,
    'macro avg': {'precision': 0.5103620919056477,
     'recall': 0.6434595120376797,
     'f1-

In [38]:
# Variant: the raw utm_keyword column is replaced by the has_utm_keyword flag.
param_features = [
    'utm_source',
    'utm_medium',
    'device_brand',
    'visit_number',
    'utm_campaign',
    'has_utm_keyword',
]
# visit_number and the has_utm_keyword flag are not treated as categorical.
category_features = [
    name for name in param_features
    if name not in ('visit_number', 'has_utm_keyword')
]

In [39]:
# Train, evaluate and save the has_utm_keyword variant under ../data/models/has_keyword.
finalize_model(
    features=param_features,
    cat_features=category_features,
    best_params=loads_params,
    model_name='has_keyword',
)

🔧 Финальные параметры модели:
iterations: 672
depth: 8
learning_rate: 0.08743604703974947
l2_leaf_reg: 4
random_seed: 42
task_type: GPU
devices: 0
auto_class_weights: Balanced
verbose: 0

🚀 Обучение финальной модели на всех тренировочных данных...


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


🧪 Тестирование на отложенной выборке...

📊 Результаты на тестовых данных:
- ROC-AUC: 0.7047

📝 Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.61      0.75    340308
           1       0.03      0.68      0.06      6146

    accuracy                           0.61    346454
   macro avg       0.51      0.64      0.41    346454
weighted avg       0.97      0.61      0.74    346454


💾 Сохраненные артефакты:
- Модель: F:\Projects\tgu\hahaton25_2\sberautopodpiska\data\models\has_keyword\model_20250712_134442.cbm
- Параметры: F:\Projects\tgu\hahaton25_2\sberautopodpiska\data\models\has_keyword\params_20250712_134442_params.json
- Метрики: F:\Projects\tgu\hahaton25_2\sberautopodpiska\data\models\has_keyword\metrics_20250712_134442_metrics.json


(<catboost.core.CatBoostClassifier at 0x1c44b3c15e0>,
 {'features': ['utm_source',
   'utm_medium',
   'device_brand',
   'visit_number',
   'utm_campaign',
   'has_utm_keyword'],
  'cat_features': ['utm_source', 'utm_medium', 'device_brand', 'utm_campaign'],
  'params': {'iterations': 672,
   'depth': 8,
   'learning_rate': 0.08743604703974947,
   'l2_leaf_reg': 4,
   'random_seed': 42,
   'task_type': 'GPU',
   'devices': '0',
   'auto_class_weights': 'Balanced',
   'verbose': 0},
  'performance': {'test_auc': 0.7046656957118546,
   'classification_report': {'0': {'precision': 0.9906134136573153,
     'recall': 0.6068972812863641,
     'f1-score': 0.7526717529432558,
     'support': 340308.0},
    '1': {'precision': 0.0303627731671076,
     'recall': 0.6815815164334527,
     'f1-score': 0.05813574258731117,
     'support': 6146.0},
    'accuracy': 0.6082221593631478,
    'macro avg': {'precision': 0.5104880934122115,
     'recall': 0.6442393988599084,
     'f1-score': 0.4054037477652

In [40]:
# Variant: the visit counter is replaced by the is_returning flag.
param_features = [
    'utm_source',
    'utm_medium',
    'device_brand',
    'is_returning',
    'utm_campaign',
    'has_utm_keyword',
]
# The two flags (is_returning, has_utm_keyword) are not treated as categorical.
category_features = [
    name for name in param_features
    if name not in ('is_returning', 'has_utm_keyword')
]

In [41]:
# Train, evaluate and save the is_returning variant under ../data/models/is_returning.
finalize_model(
    features=param_features,
    cat_features=category_features,
    best_params=loads_params,
    model_name='is_returning',
)

🔧 Финальные параметры модели:
iterations: 672
depth: 8
learning_rate: 0.08743604703974947
l2_leaf_reg: 4
random_seed: 42
task_type: GPU
devices: 0
auto_class_weights: Balanced
verbose: 0

🚀 Обучение финальной модели на всех тренировочных данных...


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


🧪 Тестирование на отложенной выборке...

📊 Результаты на тестовых данных:
- ROC-AUC: 0.7004

📝 Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.58      0.73    340308
           1       0.03      0.70      0.06      6146

    accuracy                           0.58    346454
   macro avg       0.51      0.64      0.39    346454
weighted avg       0.97      0.58      0.72    346454


💾 Сохраненные артефакты:
- Модель: F:\Projects\tgu\hahaton25_2\sberautopodpiska\data\models\is_returning\model_20250712_134756.cbm
- Параметры: F:\Projects\tgu\hahaton25_2\sberautopodpiska\data\models\is_returning\params_20250712_134756_params.json
- Метрики: F:\Projects\tgu\hahaton25_2\sberautopodpiska\data\models\is_returning\metrics_20250712_134756_metrics.json


(<catboost.core.CatBoostClassifier at 0x1c44b3c0a40>,
 {'features': ['utm_source',
   'utm_medium',
   'device_brand',
   'is_returning',
   'utm_campaign',
   'has_utm_keyword'],
  'cat_features': ['utm_source', 'utm_medium', 'device_brand', 'utm_campaign'],
  'params': {'iterations': 672,
   'depth': 8,
   'learning_rate': 0.08743604703974947,
   'l2_leaf_reg': 4,
   'random_seed': 42,
   'task_type': 'GPU',
   'devices': '0',
   'auto_class_weights': 'Balanced',
   'verbose': 0},
  'performance': {'test_auc': 0.7003967051500954,
   'classification_report': {'0': {'precision': 0.9907898240871474,
     'recall': 0.5794339245624552,
     'f1-score': 0.7312298475692854,
     'support': 340308.0},
    '1': {'precision': 0.029253569369552684,
     'recall': 0.701757240481614,
     'f1-score': 0.05616580175933221,
     'support': 6146.0},
    'accuracy': 0.5816039070121863,
    'macro avg': {'precision': 0.5100216967283501,
     'recall': 0.6405955825220346,
     'f1-score': 0.393697824664

In [4]:
# Drop the notebook-level references to both frames once all models are saved.
del X_TRAIN
del X_TEST