In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, classification_report
from sklearn.ensemble import IsolationForest
import xgboost as xgb
import catboost as cb
import shap
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Tuple, Dict, Optional
warnings.filterwarnings('ignore')

def _labels_to_1d(y):
    """Flatten labels to a 1-D array; pandas objects are unwrapped via ``.values``."""
    if hasattr(y, 'values'):
        return y.values.ravel()
    return np.array(y).ravel()


def _catboost_member_params(i, seed):
    """Build the CatBoost hyperparameters for the i-th ensemble member."""
    params = {
        'iterations': 1000,
        'depth': 6 + (i % 3),  # cycles through 6, 7, 8
        'learning_rate': 0.05 * (1 + 0.1 * (i - 2)),  # 0.04 .. 0.06 for i in 0..4
        'l2_leaf_reg': 3 + i * 0.5,  # regularization grows with the member index
        'random_seed': seed,
        'verbose': False,
        'eval_metric': 'Logloss',
        'task_type': 'CPU',  # switch to 'GPU' to train on a GPU
    }

    # Alternate the bootstrap scheme between members to diversify the ensemble.
    if i % 2 == 0:
        params['bootstrap_type'] = 'Bayesian'
        params['bagging_temperature'] = 0.5 + i * 0.2  # 0.5, 0.9, 1.3 for i = 0, 2, 4
    else:
        params['bootstrap_type'] = 'Bernoulli'
        # 0.75, 0.85 for i = 1, 3; capped at 1.0 because a sampling rate above
        # 1 is not meaningful when more than five models are requested.
        params['subsample'] = min(0.7 + i * 0.05, 1.0)

    return params


def _report_prediction_diversity(val_predictions):
    """Print and plot how correlated the members' validation predictions are."""
    print("\nDiversity analysis:")
    pred_df = pd.DataFrame(val_predictions)
    corr_matrix = pred_df.corr()

    # Mean pairwise correlation: upper triangle only, diagonal excluded.
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)
    avg_corr = corr_matrix.where(mask).stack().mean()
    print(f"  Average correlation between models: {avg_corr:.3f}")

    # Per-sample spread across models, averaged over all samples.
    pred_std = pred_df.std(axis=1).mean()
    print(f"  Average std of predictions: {pred_std:.3f}")

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: correlation matrix of the predictions.
    sns.heatmap(corr_matrix, annot=True, fmt='.3f', cmap='coolwarm',
                center=0.85, square=True, linewidths=1,
                cbar_kws={"shrink": 0.8}, ax=axes[0])
    axes[0].set_title('Correlation Matrix of Model Predictions')

    # Right panel: histogram of predicted probabilities for each model.
    for name, preds in val_predictions.items():
        axes[1].hist(preds, bins=30, alpha=0.5, label=name)
    axes[1].set_xlabel('Predicted Probability')
    axes[1].set_ylabel('Count')
    axes[1].set_title('Distribution of Predictions by Model')
    axes[1].legend(loc='best', fontsize=8)
    axes[1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()


def train_catboost_ensemble(X_train, y_train, X_val, y_val, n_models=5, use_weights=True):
    """Train several CatBoost classifiers with varied seeds and hyperparameters.

    Each member gets its own seed, depth, learning rate, L2 regularization and
    bootstrap scheme. Members are fitted with early stopping on the validation
    split, scored at a 0.5 probability threshold, and then combined through
    ``ensemble_predict_proba``. Progress, metrics and a diversity report
    (including a matplotlib figure when more than one model was trained) are
    emitted as side effects.

    Args:
        X_train: Training features; must expose ``.shape`` (it is printed).
        y_train: Training labels; flattened to a 1-D array.
        X_val: Validation features, used as the early-stopping ``eval_set``
            and for all reported metrics.
        y_val: Validation labels; flattened to a 1-D array.
            NOTE(review): column 1 of ``predict_proba`` and the default
            ``f1_score`` are used, so binary labels are presumably expected.
        n_models: Number of ensemble members to train. The first five use
            fixed seeds; further members get consecutive seeds after the last.
        use_weights: If True, weight each member by its validation F1-score
            (normalized to sum to 1); otherwise use equal weights.

    Returns:
        A ``(models, weights)`` tuple of dicts keyed by model name
        (``catboost_seed_<seed>``): fitted classifiers and their weights.

    Raises:
        ValueError: If no model was trained successfully.
    """
    models = {}
    val_predictions = {}
    f1_scores = {}

    # Fixed seeds for the first five members; extend deterministically so that
    # n_models > 5 no longer fails with an IndexError.
    seeds = [42, 123, 456, 789, 2024]
    seeds += [seeds[-1] + k for k in range(1, n_models - len(seeds) + 1)]

    y_train = _labels_to_1d(y_train)
    y_val = _labels_to_1d(y_val)

    print(f"\nTraining ensemble of {n_models} CatBoost models with different seeds...")
    print(f"Train shape: {X_train.shape}, y_train shape: {y_train.shape}")
    print(f"Val shape: {X_val.shape}, y_val shape: {y_val.shape}")
    print(f"Unique values in y_train: {np.unique(y_train)}")

    for i in range(n_models):
        model_name = f'catboost_seed_{seeds[i]}'
        print(f"\nTraining {model_name}...")

        try:
            base_params = _catboost_member_params(i, seeds[i])
            cb_model = cb.CatBoostClassifier(**base_params)

            cb_model.fit(
                X_train, y_train,
                eval_set=(X_val, y_val),
                early_stopping_rounds=100,
                verbose=False,
                plot=False
            )

            # Positive-class probabilities, thresholded once for all metrics.
            probs = cb_model.predict_proba(X_val)[:, 1]
            hard_labels = (probs > 0.5).astype(int)
            f1 = f1_score(y_val, hard_labels)
            prec = precision_score(y_val, hard_labels)
            rec = recall_score(y_val, hard_labels)

            # Register the member only once it is fully evaluated, so models,
            # predictions and scores always share the same keys.
            models[model_name] = cb_model
            val_predictions[model_name] = probs
            f1_scores[model_name] = f1

            print(f"  {model_name}:")
            print(f"    F1-score: {f1:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}")
            print(f"    Best iteration: {cb_model.get_best_iteration()}/{base_params['iterations']}")
            print(f"    Bootstrap: {base_params['bootstrap_type']}, Depth: {base_params['depth']}, LR: {base_params['learning_rate']:.3f}")

        except Exception as e:
            # Best effort: report the failing member and keep training the rest.
            print(f"  Error training {model_name}: {str(e)}")
            continue

    if not models:
        raise ValueError("No models were successfully trained!")

    equal_weight = 1 / len(models)

    if use_weights:
        total_f1 = sum(f1_scores.values())
        if total_f1 > 0:
            weights = {name: score / total_f1 for name, score in f1_scores.items()}
            print(f"\nEnsemble weights based on F1-score:")
        else:
            # Every member scored F1 == 0: normalizing would divide by zero.
            weights = {name: equal_weight for name in models}
            print("\nAll F1-scores are zero; falling back to equal weights:")
        for name, weight in weights.items():
            print(f"  {name}: {weight:.4f}")
    else:
        weights = {name: equal_weight for name in models}
        print(f"\nUsing equal weights: {equal_weight:.4f} for each model")

    # Evaluate the combined ensemble on the validation split.
    ensemble_preds = ensemble_predict_proba(models, weights, X_val)
    ensemble_labels = (ensemble_preds > 0.5).astype(int)
    ensemble_f1 = f1_score(y_val, ensemble_labels)
    ensemble_prec = precision_score(y_val, ensemble_labels)
    ensemble_rec = recall_score(y_val, ensemble_labels)

    print(f"\nEnsemble performance:")
    print(f"  F1-score: {ensemble_f1:.4f}")
    print(f"  Precision: {ensemble_prec:.4f}")
    print(f"  Recall: {ensemble_rec:.4f}")

    # Pairwise statistics only make sense with at least two members.
    if len(val_predictions) > 1:
        _report_prediction_diversity(val_predictions)

    return models, weights

def ensemble_predict_proba(models, weights, X):
    """Return the weighted sum of the models' positive-class probabilities.

    Args:
        models: Mapping of model name to a fitted estimator exposing
            ``predict_proba``; column 1 of its output is used.
        weights: Mapping of model name to its weight; must contain every
            key present in ``models``.
        X: Feature matrix; ``len(X)`` sets the length of the result.

    Returns:
        A 1-D float array with one blended score per row of ``X``.
    """
    weighted_columns = (
        weights[model_name] * fitted.predict_proba(X)[:, 1]
        for model_name, fitted in models.items()
    )
    # Start from a zero vector so an empty ensemble still yields one value per row.
    return sum(weighted_columns, np.zeros(len(X)))

In [None]:
# Train the CatBoost ensemble and keep the fitted models with their weights.
# NOTE(review): the test split is passed as the validation set, so early
# stopping and all reported metrics are computed on the same data — confirm
# this is intended, since it makes the reported scores optimistic.
models, weights = train_catboost_ensemble(
    X_train, y_train, X_test, y_test,
    n_models=5,  # number of models in the ensemble
    use_weights=False  # False -> equal weights; True would weight models by F1-score
)

In [None]:
import joblib
# Serialize every trained model to its own file at a relative path, numbered
# model1.pkl, model2.pkl, ... in the iteration order of the `models` dict.
for idx, model in enumerate(models.values(), start=1):
    joblib.dump(model, f"model{idx}.pkl")