6. Ансамбли
    1. Усреднение
    2. Voting
    3. Stacking через линейную регрессию и Ridge

In [33]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import RidgeClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import config

In [34]:
df = pd.read_csv(config.CONFIG['paths']['train_with_folds_fe'])
TARGET_COL = config.CONFIG['validation']['target_column']
N_SPLITS = config.CONFIG['validation']['n_splits']

feature_cols = [c for c in df.columns if c not in [TARGET_COL, 'fold'] and pd.api.types.is_numeric_dtype(df[c])]
print('Фичи', len(feature_cols), 'Строки', len(df))

Фичи 14 Строки 891


# Предсказания

In [35]:
def run_cv_oof(model_class, model_params, df, target_col=TARGET_COL, n_splits=N_SPLITS):
    oof_preds = np.zeros(len(df))
    oof_proba = np.zeros(len(df))
    scores = []
    for fold in range(n_splits):
        train_mask = df['fold'] != fold
        val_mask = df['fold'] == fold
        X_train = df.loc[train_mask, feature_cols]
        y_train = df.loc[train_mask, target_col]
        X_val = df.loc[val_mask, feature_cols]
        y_val = df.loc[val_mask, target_col]

        model = model_class(**model_params)
        model.fit(X_train, y_train)

        oof_preds[val_mask] = model.predict(X_val)
        oof_proba[val_mask] = model.predict_proba(X_val)[:, 1]
        scores.append(accuracy_score(y_val, oof_preds[val_mask]))

    return oof_preds, oof_proba, scores

In [36]:
params_lr = {'max_iter': 3000, 'random_state': 42}
params_rf = {'n_estimators': 200, 'max_depth': 5, 'random_state': 42}
params_xgb = {'n_estimators': 200, 'max_depth': 4, 'random_state': 42, 'eval_metric': 'logloss'}
params_cat = {'iterations': 200, 'depth': 4, 'verbose': 0, 'random_state': 42}

oof_probas = {}
scores_per_model = {}

oof_preds_lr, oof_proba_lr, scores_lr = run_cv_oof(LogisticRegression, params_lr, df)
oof_probas['lr'] = oof_proba_lr
scores_per_model['LogReg'] = (np.mean(scores_lr), np.std(scores_lr))

oof_preds_rf, oof_proba_rf, scores_rf = run_cv_oof(RandomForestClassifier, params_rf, df)
oof_probas['rf'] = oof_proba_rf
scores_per_model['RF'] = (np.mean(scores_rf), np.std(scores_rf))

oof_preds_xgb, oof_proba_xgb, scores_xgb = run_cv_oof(XGBClassifier, params_xgb, df)
oof_probas['xgb'] = oof_proba_xgb
scores_per_model['XGB'] = (np.mean(scores_xgb), np.std(scores_xgb))

oof_preds_cat, oof_proba_cat, scores_cat = run_cv_oof(CatBoostClassifier, params_cat, df)
oof_probas['cat'] = oof_proba_cat
scores_per_model['CatBoost'] = (np.mean(scores_cat), np.std(scores_cat))

for name, (m, s) in scores_per_model.items():
    print(f'{name}: {m:.4f} +- {s:.4f}')

LogReg: 0.8081 +- 0.0114
RF: 0.8317 +- 0.0122
XGB: 0.8024 +- 0.0325
CatBoost: 0.8339 +- 0.0150


# Усреднение

In [37]:
proba_avg = (oof_proba_lr + oof_proba_rf + oof_proba_xgb + oof_proba_cat) / 4
pred_avg = (proba_avg >= 0.5).astype(int)
acc_avg = accuracy_score(df[TARGET_COL], pred_avg)
print('Усреднение OOF accuracy:', round(acc_avg, 4))
scores_per_model['Averaging'] = (acc_avg, 0.0)

Усреднение OOF accuracy: 0.8305


# Voting

In [38]:
estimators_voting = [
    ('lr', LogisticRegression(**params_lr)),
    ('rf', RandomForestClassifier(**params_rf)),
    ('xgb', XGBClassifier(**params_xgb)),
    ('cat', CatBoostClassifier(**params_cat)),
]
voting = VotingClassifier(estimators=estimators_voting, voting='soft')

_, _, scores_vote = run_cv_oof(VotingClassifier, {'estimators': estimators_voting, 'voting': 'soft'}, df)
scores_per_model['Voting'] = (np.mean(scores_vote), np.std(scores_vote))
print('Voting OOF mean accuracy:', round(np.mean(scores_vote), 4), '+-', round(np.std(scores_vote), 4))

Voting OOF mean accuracy: 0.8305 +- 0.0241


In [43]:
results = pd.DataFrame([
    {'method': name, 'mean_acc': m, 'std_acc': s}
    for name, (m, s) in scores_per_model.items()
]).sort_values('mean_acc', ascending=False)
print(results.to_string(index=False))

ensemble_path = config.CONFIG['paths'].get('ensemble_results', config.CONFIG['paths']['checkpoint_dir'] / 'ensemble_results.csv')
results.rename(columns={'method': 'model', 'mean_acc': 'mean_accuracy', 'std_acc': 'std_accuracy'}).to_csv(ensemble_path, index=False)
print('Сохранено:', ensemble_path)

   method  mean_acc  std_acc
 CatBoost  0.833890 0.014952
       RF  0.831655 0.012225
Averaging  0.830527 0.000000
   Voting  0.830494 0.024134
   LogReg  0.808085 0.011380
      XGB  0.802410 0.032515
Сохранено: C:\newTry2\classicMLpractice\ProjectKaggle\checkpoints\ensemble_results.csv
