## 0. Настройки и зависимости


In [3]:
import json
import os
from pathlib import Path
import time
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    roc_auc_score,
    precision_score,
    recall_score,
    confusion_matrix,
    roc_curve,
    auc,
    average_precision_score,
    precision_recall_curve,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings("ignore")
RANDOM_STATE = 42
HW_DIR = Path('.')
ARTIFACTS_DIR = HW_DIR / 'artifacts'
FIG_DIR = ARTIFACTS_DIR / 'figures'
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)
FIG_DIR.mkdir(parents=True, exist_ok=True)

## 1. Загрузка датасета

In [4]:
df = pd.read_csv("S06-hw-dataset-02.csv")
print('Shape:', df.shape)
print('\nHead:')
print(df.head().to_string(index=False))

print('\nInfo:')
print(df.info())

print('\nDescribe:')
print(df.describe().T)

if 'target' not in df.columns:
    raise ValueError('CSV не содержит столбца target')

print('\nTarget distribution:')
print(df['target'].value_counts(dropna=False))
print(df['target'].value_counts(normalize=True))

Shape: (18000, 39)

Head:
 id       f01       f02       f03        f04       f05       f06       f07       f08       f09       f10       f11       f12       f13        f14       f15       f16       f17       f18        f19       f20       f21       f22       f23       f24      f25      f26      f27       f28       f29      f30       f31       f32       f33       f34        f35   x_int_1   x_int_2  target
  1 -0.149235 -2.826966 -0.522901  -4.198449  1.364943  0.815043 -1.195518 -1.932232  2.396353  1.121683 -0.332250  0.303750  2.439315   3.905690 -0.679945 -1.847890 -1.450850 -0.523963  -2.203766  1.717017 -0.467238 -5.418752  5.115531  0.951900 0.085200 0.304588 0.206599  0.293322 -0.159323 0.448015  0.572745  0.149916  0.878392 -0.679733   1.412751  0.421883  9.217167       1
  2 -1.966180 -4.877542  0.268367  -9.607791  0.097149  1.347185 -3.872575 -0.395117  1.710068 -0.298809  0.555178  3.632876  1.551984   2.671995 -4.859814 -3.454798 -0.238638  0.604069  -3.080758  0.489968 -0.

In [5]:
## 3. Подготовка X, y

In [6]:
# Убираем id-колонку
if 'id' in df.columns:
    df = df.drop(columns=['id'])

X = df.drop(columns=['target']).copy()
y = df['target'].copy()

# Приводим нечисловые признаки к числовым, если можно
non_numeric = X.select_dtypes(exclude=[np.number]).columns.tolist()
print('Non-numeric cols:', non_numeric)
if non_numeric:
    for c in non_numeric:
        try:
            X[c] = pd.to_numeric(X[c])
        except Exception:
            print(f"Column {c} is non-numeric and cannot be directly converted; consider encoding.")

# Проверка пропусков
print('\nMissing summary:')
print(X.isna().sum().sort_values(ascending=False).head(10))

Non-numeric cols: []

Missing summary:
f01    0
f02    0
f03    0
f04    0
f05    0
f06    0
f07    0
f08    0
f09    0
f10    0
dtype: int64


## 4. Train/test split

In [7]:
TEST_SIZE = 0.25
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)
print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)

Train shape: (13500, 37) Test shape: (4500, 37)


## 5. Baselines: Dummy + LogisticRegression

In [8]:
results = {}
search_summaries = {}

# Dummy baseline
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_train, y_train)
y_pred_dummy = dummy.predict(X_test)
try:
    y_proba_dummy = dummy.predict_proba(X_test)[:, 1]
except Exception:
    y_proba_dummy = None

metrics_dummy = {
    'accuracy': float(accuracy_score(y_test, y_pred_dummy)),
}
if y_proba_dummy is not None and len(np.unique(y_test)) == 2:
    metrics_dummy['roc_auc'] = float(roc_auc_score(y_test, y_proba_dummy))
else:
    metrics_dummy['roc_auc'] = None

results['Dummy'] = metrics_dummy
print('Dummy metrics:', metrics_dummy)

# LogisticRegression baseline
pipe_lr = Pipeline([
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression(max_iter=2000, random_state=RANDOM_STATE))
])
param_grid_lr = {'lr__C': [0.01, 0.1, 1.0, 10.0]}
scoring = 'roc_auc' if len(np.unique(y)) == 2 else 'f1_macro'

gs_lr = GridSearchCV(pipe_lr, param_grid_lr, scoring=scoring, cv=5, n_jobs=-1)
gs_lr.fit(X_train, y_train)

print('LR best params:', gs_lr.best_params_, 'best score:', gs_lr.best_score_)

best_lr = gs_lr.best_estimator_
y_pred_lr = best_lr.predict(X_test)
try:
    y_proba_lr = best_lr.predict_proba(X_test)[:, 1]
except Exception:
    y_proba_lr = None

metrics_lr = {
    'accuracy': float(accuracy_score(y_test, y_pred_lr)),
}
if len(np.unique(y)) == 2 and y_proba_lr is not None:
    metrics_lr['roc_auc'] = float(roc_auc_score(y_test, y_proba_lr))
else:
    metrics_lr['roc_auc'] = None

# f1
metrics_lr['f1'] = float(f1_score(y_test, y_pred_lr, average='binary' if len(np.unique(y))==2 else 'macro'))
results['LogisticRegression'] = metrics_lr
search_summaries['LogisticRegression'] = {'best_params': gs_lr.best_params_, 'cv_score': gs_lr.best_score_}
print('LogReg metrics:', metrics_lr)

Dummy metrics: {'accuracy': 0.7373333333333333, 'roc_auc': 0.5}
LR best params: {'lr__C': 1.0} best score: 0.803353171075198
LogReg metrics: {'accuracy': 0.8162222222222222, 'roc_auc': 0.8008904412072182, 'f1': 0.5717244950802693}


## 6. Week-6 models: DecisionTree, RandomForest, GradientBoosting

In [9]:
# 6.1 Decision Tree
param_grid_dt = {
    'max_depth': [3, 5, 8, None],
    'min_samples_leaf': [1, 5, 10],
    # 'ccp_alpha': [0.0, 0.001, 0.01]
}
dt = DecisionTreeClassifier(random_state=RANDOM_STATE)
gs_dt = GridSearchCV(dt, param_grid_dt, scoring=scoring, cv=5, n_jobs=-1)
gs_dt.fit(X_train, y_train)
print('DT best:', gs_dt.best_params_, gs_dt.best_score_)
best_dt = gs_dt.best_estimator_
y_pred_dt = best_dt.predict(X_test)
try:
    y_proba_dt = best_dt.predict_proba(X_test)[:, 1]
except Exception:
    y_proba_dt = None

metrics_dt = {'accuracy': float(accuracy_score(y_test, y_pred_dt))}
metrics_dt['f1'] = float(f1_score(y_test, y_pred_dt, average='binary' if len(np.unique(y))==2 else 'macro'))
metrics_dt['roc_auc'] = float(roc_auc_score(y_test, y_proba_dt)) if (y_proba_dt is not None and len(np.unique(y))==2) else None
results['DecisionTree'] = metrics_dt
search_summaries['DecisionTree'] = {'best_params': gs_dt.best_params_, 'cv_score': gs_dt.best_score_}
print('DecisionTree metrics:', metrics_dt)

# 6.2 Random Forest
param_grid_rf = {
    'n_estimators': [100],
    'max_depth': [5, 8, None],
    'max_features': ['sqrt', 0.5],
    'min_samples_leaf': [1, 5]
}
rf = RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1)
gs_rf = GridSearchCV(rf, param_grid_rf, scoring=scoring, cv=4, n_jobs=-1)
gs_rf.fit(X_train, y_train)
print('RF best:', gs_rf.best_params_, gs_rf.best_score_)
best_rf = gs_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)
try:
    y_proba_rf = best_rf.predict_proba(X_test)[:, 1]
except Exception:
    y_proba_rf = None

metrics_rf = {'accuracy': float(accuracy_score(y_test, y_pred_rf))}
metrics_rf['f1'] = float(f1_score(y_test, y_pred_rf, average='binary' if len(np.unique(y))==2 else 'macro'))
metrics_rf['roc_auc'] = float(roc_auc_score(y_test, y_proba_rf)) if (y_proba_rf is not None and len(np.unique(y))==2) else None
results['RandomForest'] = metrics_rf
search_summaries['RandomForest'] = {'best_params': gs_rf.best_params_, 'cv_score': gs_rf.best_score_}
print('RandomForest metrics:', metrics_rf)

# 6.3 Gradient Boosting (sklearn)
param_grid_gb = {
    'n_estimators': [100],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5]
}
gb = GradientBoostingClassifier(random_state=RANDOM_STATE)
gs_gb = GridSearchCV(gb, param_grid_gb, scoring=scoring, cv=4, n_jobs=-1)
gs_gb.fit(X_train, y_train)
print('GB best:', gs_gb.best_params_, gs_gb.best_score_)
best_gb = gs_gb.best_estimator_
y_pred_gb = best_gb.predict(X_test)
try:
    y_proba_gb = best_gb.predict_proba(X_test)[:, 1]
except Exception:
    y_proba_gb = None

metrics_gb = {'accuracy': float(accuracy_score(y_test, y_pred_gb))}
metrics_gb['f1'] = float(f1_score(y_test, y_pred_gb, average='binary' if len(np.unique(y))==2 else 'macro'))
metrics_gb['roc_auc'] = float(roc_auc_score(y_test, y_proba_gb)) if (y_proba_gb is not None and len(np.unique(y))==2) else None
results['GradientBoosting'] = metrics_gb
search_summaries['GradientBoosting'] = {'best_params': gs_gb.best_params_, 'cv_score': gs_gb.best_score_}
print('GradientBoosting metrics:', metrics_gb)

DT best: {'max_depth': 8, 'min_samples_leaf': 10} 0.8295161796490411
DecisionTree metrics: {'accuracy': 0.826, 'f1': 0.6244604316546762, 'roc_auc': 0.829829780441809}
RF best: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 100} 0.9232905214065784
RandomForest metrics: {'accuracy': 0.8893333333333333, 'f1': 0.75199203187251, 'roc_auc': 0.9266073175184528}
GB best: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100} 0.92187810312377
GradientBoosting metrics: {'accuracy': 0.8957777777777778, 'f1': 0.7784600850259802, 'roc_auc': 0.9221071752396046}


## 7. Stacking — собираем несколько сильных моделей

In [10]:
try:
    estimators = [
        ('rf', best_rf),
        ('gb', best_gb),
        ('dt', best_dt),
    ]
    stack = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(), n_jobs=-1)
    stack.fit(X_train, y_train)
    y_pred_stack = stack.predict(X_test)
    try:
        y_proba_stack = stack.predict_proba(X_test)[:, 1]
    except Exception:
        y_proba_stack = None
    metrics_stack = {'accuracy': float(accuracy_score(y_test, y_pred_stack))}
    metrics_stack['f1'] = float(f1_score(y_test, y_pred_stack, average='binary' if len(np.unique(y))==2 else 'macro'))
    metrics_stack['roc_auc'] = float(roc_auc_score(y_test, y_proba_stack)) if (y_proba_stack is not None and len(np.unique(y))==2) else None
    results['Stacking'] = metrics_stack
    print('Stacking metrics:', metrics_stack)
except Exception as exc:
    print('Stacking failed or skipped:', exc)

Stacking metrics: {'accuracy': 0.9064444444444445, 'f1': 0.8101037437979252, 'roc_auc': 0.927999253418517}


## 8. Сводная таблица результатов и выбор лучшей модели

In [11]:
from sklearn.model_selection import cross_val_score

# Сформируем DataFrame результатов
results_df = pd.DataFrame([{'model': k, **v} for k, v in results.items()]).set_index('model')
print('\nResults (test metrics):')
print(results_df)

# Выбираем метрику для отбора (консистентно с тем, что использовалось в CV)
if len(np.unique(y)) == 2:
    metric_for_selection = 'roc_auc'
else:
    metric_for_selection = 'f1'

# Соберём CV-оценки на train для всех кандидатов
model_cv_scores = {}

# Dummy: cross_val_score на train
try:
    model_cv_scores['Dummy'] = float(cross_val_score(dummy, X_train, y_train, cv=5, scoring=scoring, n_jobs=-1).mean())
except Exception:
    model_cv_scores['Dummy'] = None

# Для моделей, где мы использовали GridSearchCV
model_cv_scores['LogisticRegression'] = float(gs_lr.best_score_) if 'gs_lr' in locals() else None
model_cv_scores['DecisionTree'] = float(gs_dt.best_score_) if 'gs_dt' in locals() else None
model_cv_scores['RandomForest'] = float(gs_rf.best_score_) if 'gs_rf' in locals() else None
model_cv_scores['GradientBoosting'] = float(gs_gb.best_score_) if 'gs_gb' in locals() else None

# Stacking
if 'gs_rf' in locals() and 'gs_gb' in locals() and 'gs_dt' in locals():
    try:
        # Создаём новые экземляры с лучшими параметрами
        rf_cv = RandomForestClassifier(**gs_rf.best_params_, random_state=RANDOM_STATE, n_jobs=-1)
    except Exception:
        rf_cv = RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1)
    try:
        gb_cv = GradientBoostingClassifier(**gs_gb.best_params_, random_state=RANDOM_STATE)
    except Exception:
        gb_cv = GradientBoostingClassifier(random_state=RANDOM_STATE)
    try:
        dt_cv = DecisionTreeClassifier(**gs_dt.best_params_, random_state=RANDOM_STATE)
    except Exception:
        dt_cv = DecisionTreeClassifier(random_state=RANDOM_STATE)

    try:
        stack_cv_cl = StackingClassifier(
            estimators=[('rf', rf_cv), ('gb', gb_cv), ('dt', dt_cv)],
            final_estimator=LogisticRegression(max_iter=2000),
            n_jobs=-1,
            cv=5  # StackingClassifier умеет использовать CV для построения мета-признаков
        )
        model_cv_scores['Stacking'] = float(cross_val_score(stack_cv_cl, X_train, y_train, cv=5, scoring=scoring, n_jobs=-1).mean())
    except Exception:
        model_cv_scores['Stacking'] = None
else:
    model_cv_scores['Stacking'] = None

print('\nCV scores on train (used for model selection):')
for k, v in model_cv_scores.items():
    print(f'  {k}: {v}')

# Выбираем модель с наилучшим CV-score
valid_scores = {k: v for k, v in model_cv_scores.items() if v is not None}
if not valid_scores:
    raise RuntimeError("Не удалось получить CV-оценки для выбора лучшей модели — проверьте доступность gs_* или cross_val_score.")
best_model_name = max(valid_scores, key=valid_scores.get)
print(f"\nSelected best model by CV on train: {best_model_name}")

# Получаем объект модели для дальнейшей оценки на тесте.
# Если это модель из GridSearch — используем её best_estimator_, иначе создаём/обучаем подходящую модель.
model_map = {
    'LogisticRegression': gs_lr.best_estimator_ if 'gs_lr' in locals() else None,
    'DecisionTree': gs_dt.best_estimator_ if 'gs_dt' in locals() else None,
    'RandomForest': gs_rf.best_estimator_ if 'gs_rf' in locals() else None,
    'GradientBoosting': gs_gb.best_estimator_ if 'gs_gb' in locals() else None,
}
if best_model_name == 'Dummy':
    best_model = dummy
    best_model.fit(X_train, y_train)
elif best_model_name == 'Stacking':
    # берем ранее созданный stack_cv_cl (unfitted) и обучаем на train
    stack_cv_cl.fit(X_train, y_train)
    best_model = stack_cv_cl
else:
    best_model = model_map.get(best_model_name)
    if best_model is None:
        raise RuntimeError(f"Best model {best_model_name} not available in model_map.")
    # best_model у best_estimator_ для GridSearch уже обучен на X_train
    # но можно заново вызвать fit для гарантии
    # best_model.fit(X_train, y_train)

print(f"Final model object used for test evaluation: {best_model}")

y_pred_best = best_model.predict(X_test)
try:
    y_proba_best = best_model.predict_proba(X_test)[:, 1]
except Exception:
    y_proba_best = None

metrics_best = {
    'accuracy': float(accuracy_score(y_test, y_pred_best)),
    'f1': float(f1_score(y_test, y_pred_best, average='binary' if len(np.unique(y))==2 else 'macro')),
}
if len(np.unique(y)) == 2 and y_proba_best is not None:
    metrics_best['roc_auc'] = float(roc_auc_score(y_test, y_proba_best))
else:
    metrics_best['roc_auc'] = None

# Обновим results и meta для отчёта (только для лучшей модели - тестовые метрики)
results[best_model_name] = metrics_best
best_meta = {
    'best_model_name': best_model_name,
    'selection_metric': metric_for_selection,
    'cv_score': valid_scores[best_model_name],
    'test_metrics': metrics_best,
    'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
}
print('Selected best model test metrics:', metrics_best)



Results (test metrics):
                    accuracy   roc_auc        f1
model                                           
Dummy               0.737333  0.500000       NaN
LogisticRegression  0.816222  0.800890  0.571724
DecisionTree        0.826000  0.829830  0.624460
RandomForest        0.889333  0.926607  0.751992
GradientBoosting    0.895778  0.922107  0.778460
Stacking            0.906444  0.927999  0.810104

CV scores on train (used for model selection):
  Dummy: 0.5
  LogisticRegression: 0.803353171075198
  DecisionTree: 0.8295161796490411
  RandomForest: 0.9232905214065784
  GradientBoosting: 0.92187810312377
  Stacking: 0.9274067577724585

Selected best model by CV on train: Stacking
Final model object used for test evaluation: StackingClassifier(cv=5,
                   estimators=[('rf',
                                RandomForestClassifier(n_jobs=-1,
                                                       random_state=42)),
                               ('gb',
            

## 9. Permutation importance для лучшей модели

In [12]:
if best_model is not None:
    print('Computing permutation importance for best model:', best_model_name)
    try:
        r = permutation_importance(best_model, X_test, y_test, n_repeats=30, random_state=RANDOM_STATE, n_jobs=-1)
        imp_df = pd.DataFrame({
            'feature': X.columns,
            'importance_mean': r.importances_mean,
            'importance_std': r.importances_std
        }).sort_values('importance_mean', ascending=False).reset_index(drop=True)
        print('\nTop features (permutation importance):')
        print(imp_df.head(15).to_string(index=False))
        imp_df.to_csv(ARTIFACTS_DIR / 'feature_importance.csv', index=False)
        plt.figure(figsize=(8,6))
        plt.barh(imp_df['feature'].head(15)[::-1], imp_df['importance_mean'].head(15)[::-1])
        plt.xlabel('Permutation importance (mean)')
        plt.title(f'Permutation importance (best model: {best_model_name})')
        plt.tight_layout()
        plt.savefig(FIG_DIR / 'permutation_importance.png', dpi=150)
        plt.close()
    except Exception as exc:
        print('Permutation importance failed:', exc)
else:
    print('No best model available for permutation importance')

if 'best_rf' in locals() and best_rf is not None:
    try:
        fi = best_rf.feature_importances_
        fi_df = pd.DataFrame({'feature': X.columns, 'importance': fi}).sort_values('importance', ascending=False).reset_index(drop=True)
        fi_df.to_csv(ARTIFACTS_DIR / 'rf_feature_importance.csv', index=False)
        plt.figure(figsize=(8,6))
        plt.barh(fi_df['feature'].head(15)[::-1], fi_df['importance'].head(15)[::-1])
        plt.xlabel('Feature importance (RF)')
        plt.title('Random Forest feature importances (top 15)')
        plt.tight_layout()
        plt.savefig(FIG_DIR / 'rf_feature_importance.png', dpi=150)
        plt.close()
    except Exception as exc:
        print('RF feature importance failed:', exc)


Computing permutation importance for best model: Stacking

Top features (permutation importance):
feature  importance_mean  importance_std
    f16         0.084711        0.004335
    f01         0.037459        0.003220
    f23         0.018526        0.001976
    f07         0.014637        0.002024
    f30         0.013985        0.002195
    f08         0.013889        0.001631
    f12         0.013726        0.001882
    f19         0.013459        0.001677
    f15         0.011993        0.001470
    f29         0.011874        0.002427
    f18         0.011422        0.001966
    f02         0.010156        0.002085
    f13         0.009452        0.001814
    f05         0.009348        0.002151
    f34         0.009119        0.001498


## 10. Сохранение артефактов: метрики, summary поиска, модель, метаданные

In [13]:
# metrics_test.json
with open(ARTIFACTS_DIR / 'metrics_test.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

# search_summaries.json
with open(ARTIFACTS_DIR / 'search_summaries.json', 'w', encoding='utf-8') as f:
    json.dump(search_summaries, f, ensure_ascii=False, indent=2)

# best_model.joblib + metadata
if best_model is not None:
    joblib.dump(best_model, ARTIFACTS_DIR / 'best_model.joblib')
    best_meta = {
        'best_model_name': best_model_name,
        'selection_metric': metric_for_selection,
        'test_metrics': results.get(best_model_name, {}),
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
    }
    with open(ARTIFACTS_DIR / 'best_model_meta.json', 'w', encoding='utf-8') as f:
        json.dump(best_meta, f, ensure_ascii=False, indent=2)

print('Artifacts written to', ARTIFACTS_DIR)

Artifacts written to artifacts


## 11. ROC / PR и confusion matrix сохранение

In [14]:
if len(np.unique(y)) == 2:
    # plot ROC for best and for others
    plt.figure(figsize=(6,6))
    for name, model in [('LogReg', best_lr), ('DT', best_dt), ('RF', best_rf), ('GB', best_gb)]:
        try:
            proba = model.predict_proba(X_test)[:,1]
            fpr, tpr, _ = roc_curve(y_test, proba)
            roc_auc = auc(fpr, tpr)
            plt.plot(fpr, tpr, label=f"{name} (AUC={roc_auc:.3f})")
        except Exception:
            pass
    # Include stacking curve if available
    if 'stack' in locals():
        try:
            proba = stack.predict_proba(X_test)[:,1]
            fpr, tpr, _ = roc_curve(y_test, proba)
            roc_auc = auc(fpr, tpr)
            plt.plot(fpr, tpr, label=f"Stacking (AUC={roc_auc:.3f})", linewidth=2, linestyle='--')
        except Exception:
            pass

    plt.plot([0,1],[0,1], linestyle=':', color='grey')
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title('ROC curves')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(FIG_DIR / 'roc_curves_all.png', dpi=150)
    plt.close()

    # PR curve for best model
    try:
        proba_best = (best_model.predict_proba(X_test)[:,1]) if best_model is not None else None
        if proba_best is not None:
            precision, recall, _ = precision_recall_curve(y_test, proba_best)
            ap = average_precision_score(y_test, proba_best)
            plt.figure(figsize=(6,6))
            plt.plot(recall, precision, label=f'Best (AP={ap:.3f})')
            plt.xlabel('Recall')
            plt.ylabel('Precision')
            plt.title('Precision-Recall curve')
            plt.legend()
            plt.grid(True)
            plt.tight_layout()
            plt.savefig(FIG_DIR / 'pr_curve_best.png', dpi=150)
            plt.close()
    except Exception:
        pass

    # confusion matrix for best model
    try:
        if best_model is not None:
            y_pred_best = best_model.predict(X_test)
            cm = confusion_matrix(y_test, y_pred_best)
            plt.figure(figsize=(4,4))
            plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
            plt.title('Confusion matrix (best)')
            plt.colorbar()
            tick_marks = np.arange(len(np.unique(y)))
            plt.xticks(tick_marks, tick_marks)
            plt.yticks(tick_marks, tick_marks)
            plt.xlabel('Predicted')
            plt.ylabel('True')
            for i in range(cm.shape[0]):
                for j in range(cm.shape[1]):
                    plt.text(j, i, cm[i,j], ha='center', va='center', color='white' if cm[i,j]>cm.max()/2 else 'black')
            plt.tight_layout()
            plt.savefig(FIG_DIR / 'confusion_matrix_best.png', dpi=150)
            plt.close()
    except Exception:
        pass

print('Figures saved to', FIG_DIR)

Figures saved to artifacts\figures


## 12. Генерация краткого отчёта report.md (в HW06/report.md)

In [16]:
print("\nGenerating report.md ...")
report_lines = []
report_lines.append("# HW06 – Report\n")
# 1. Dataset
report_lines.append("## 1. Dataset\n")
report_lines.append(f"- Какой датасет выбран: `S06-hw-dataset-02.csv`\n")
report_lines.append(f"- Размер: {df.shape[0]} строк, {X.shape[1]} признаков\n")
report_lines.append("- Целевая переменная: `target`\n")
for val, cnt in df["target"].value_counts().items():
    pct = df["target"].value_counts(normalize=True)[val]
    report_lines.append(f"  - класс {val}: {cnt} ({pct:.2%})\n")
report_lines.append("- Признаки: все числовые (при необходимости некоторые были приведены к числовому типу).\n\n")

# 2. Protocol
report_lines.append("## 2. Protocol\n")
report_lines.append(f"- Разбиение: train/test = {1-TEST_SIZE:.2f}/{TEST_SIZE:.2f}, random_state={RANDOM_STATE}\n")
report_lines.append("- Подбор: GridSearchCV на train (см. search_summaries.json); CV-оценки использованы для выбора модели, тест использован один раз для финальной оценки.\n")
report_lines.append(f"- Метрики: accuracy, F1, ROC-AUC (для бинарных). Для CV использовался scoring='{scoring}'.\n\n")

# 3. Models
report_lines.append("## 3. Models\n")
report_lines.append("- DummyClassifier (most_frequent) — baseline\n")
report_lines.append("- LogisticRegression (Pipeline: StandardScaler + LogisticRegression), C подобран через GridSearchCV\n")
report_lines.append("- DecisionTreeClassifier — подбор max_depth + min_samples_leaf через GridSearchCV\n")
report_lines.append("- RandomForestClassifier — подбор max_depth/max_features/min_samples_leaf через GridSearchCV\n")
report_lines.append("- GradientBoostingClassifier — подбор learning_rate/max_depth через GridSearchCV\n")
report_lines.append("- StackingClassifier (опционально) — обучён на train, оценён на test\n\n")

# 4. Results
report_lines.append("## 4. Results\n")
report_lines.append("Таблица финальных метрик на test:\n\n")
# add markdown table
try:
    md_table = results_df.to_markdown()
    report_lines.append(md_table + "\n\n")
except Exception:
    report_lines.append(str(results_df) + "\n\n")

report_lines.append(f"- Победитель (по CV, затем по тестовой метрике): **{best_model_name}** \n\n")

# 5. Analysis
report_lines.append("## 5. Analysis\n")
report_lines.append("- Устойчивость: при желании запустите несколько прогонов с разными random_state (опционально, не включено в код по умолчанию).\n")
report_lines.append("- Ошибки: confusion matrix сохранена в artifacts/figures/confusion_matrix_best.png (если best_model поддерживает predict).\n")
report_lines.append("- Интерпретация: permutation importance сохранён в artifacts/feature_importance.csv и artifacts/figures/permutation_importance.png (если был успешно рассчитан).\n\n")

# 6. Conclusion
report_lines.append("## 6. Conclusion\n")
report_lines.append("- Ансамбли (RF/GB/Stacking) обычно дают лучший баланс bias/variance по сравнению с одиночными деревьями или линейными моделями на этих данных.\n")
report_lines.append("- Для честного эксперимента подбор гиперпараметров выполняйте только на train, используйте CV, и тест применяйте один раз для финальной оценки.\n")
report_lines.append("- Сохраняйте модели и метаданные (best_model.joblib, best_model_meta.json) для воспроизводимости.\n")
report_lines.append("- Для дисбалансных задач дополнительно смотрите PR-кривую и average_precision_score.\n")

report_path = HW_DIR / "report.md"
with open(report_path, "w", encoding="utf-8") as f:
    f.writelines(report_lines)
print("Wrote report to", report_path)


Generating report.md ...
Wrote report to report.md
