In [1]:
from pathlib import Path
import os, json
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score,
    confusion_matrix, ConfusionMatrixDisplay,
    RocCurveDisplay, PrecisionRecallDisplay
)

from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier

from sklearn.inspection import permutation_importance
import joblib

RANDOM_STATE = 42
DATA_PATH = Path("S06-hw-dataset-04.csv")

ARTIFACTS_DIR = Path("artifacts")
FIG_DIR = ARTIFACTS_DIR / "figures"
ARTIFACTS_DIR.mkdir(exist_ok=True)
FIG_DIR.mkdir(parents=True, exist_ok=True)


In [2]:
# Я загружаю датасет
df = pd.read_csv(DATA_PATH)

print("Shape:", df.shape)
display(df.head())

# Я отделяю признаки и таргет и исключаю id из признаков
X = df.drop(columns=["id", "target"])
y = df["target"].astype(int)

print("X shape:", X.shape)
print("y distribution:\n", y.value_counts())
print("y share:\n", y.value_counts(normalize=True).round(4))


Shape: (25000, 62)


Unnamed: 0,id,f01,f02,f03,f04,f05,f06,f07,f08,f09,...,f52,f53,f54,f55,f56,f57,f58,f59,f60,target
0,1,-1.25021,1.423474,-0.225004,-4.023138,-0.832729,-0.550874,1.77209,2.76169,-0.69875,...,10.938269,0.501178,1.600001,0.314212,1.209735,1.355697,-5.338924,1.153944,-0.153934,0
1,2,0.074328,0.376429,0.212831,-0.502074,2.017405,0.625496,1.943785,1.24203,-0.52409,...,7.775262,-4.550195,6.272586,-0.932162,-0.228543,1.73522,-3.827828,0.292165,0.27372,0
2,3,0.638481,0.060968,0.74676,2.479653,-0.292858,-0.078139,-2.918423,-0.013186,1.009135,...,-4.448447,-9.593179,-3.093519,0.029321,0.605511,0.829103,-0.085985,2.891408,0.766221,0
3,4,1.712916,-1.350969,-0.256473,1.622074,-0.445141,0.911932,-3.440345,1.505192,-1.104348,...,-1.619072,-3.237479,-5.474038,-1.582475,0.198137,3.823409,0.880395,1.14861,0.136732,0
4,5,0.905676,-0.206545,-0.068806,4.086026,-1.010045,-0.772644,-4.207688,2.506104,1.589143,...,-2.396844,-10.540129,-5.532811,-1.231203,0.000119,4.298572,-1.558235,0.924673,0.111668,0


X shape: (25000, 60)
y distribution:
 target
0    23770
1     1230
Name: count, dtype: int64
y share:
 target
0    0.9508
1    0.0492
Name: proportion, dtype: float64


In [3]:
# Я делаю train/test split со stratify (из-за дисбаланса классов это важно)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.30,
    random_state=RANDOM_STATE,
    stratify=y
)

print("Train:", X_train.shape, "Test:", X_test.shape)
print("Train target share:\n", y_train.value_counts(normalize=True).round(4))
print("Test  target share:\n", y_test.value_counts(normalize=True).round(4))


Train: (17500, 60) Test: (7500, 60)
Train target share:
 target
0    0.9508
1    0.0492
Name: proportion, dtype: float64
Test  target share:
 target
0    0.9508
1    0.0492
Name: proportion, dtype: float64


In [4]:
# 5. Масштабирование числовых признаков
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [5]:
def evaluate_binary(model, X_tr, y_tr, X_te, y_te):
    model.fit(X_tr, y_tr)

    pred_tr = model.predict(X_tr)
    pred_te = model.predict(X_te)

    proba_tr = model.predict_proba(X_tr)[:, 1]
    proba_te = model.predict_proba(X_te)[:, 1]

    out = {
        "acc_train": accuracy_score(y_tr, pred_tr),
        "f1_train": f1_score(y_tr, pred_tr),              # binary F1 по классу 1
        "roc_auc_train": roc_auc_score(y_tr, proba_tr),
        "acc_test": accuracy_score(y_te, pred_te),
        "f1_test": f1_score(y_te, pred_te),
        "roc_auc_test": roc_auc_score(y_te, proba_te),
        "y_pred_test": pred_te,
        "y_proba_test": proba_te
    }
    return out

def save_cm(y_true, y_pred, title, path):
    cm = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(cm)
    disp.plot(values_format="d")
    plt.title(title)
    plt.tight_layout()
    plt.savefig(path, dpi=120)
    plt.show()

def save_roc_pr(y_true, y_proba, prefix):
    RocCurveDisplay.from_predictions(y_true, y_proba)
    plt.title(f"ROC — {prefix}")
    plt.tight_layout()
    plt.savefig(FIG_DIR / f"{prefix}_roc.png", dpi=120)
    plt.show()

    PrecisionRecallDisplay.from_predictions(y_true, y_proba)
    plt.title(f"PR — {prefix}")
    plt.tight_layout()
    plt.savefig(FIG_DIR / f"{prefix}_pr.png", dpi=120)
    plt.show()


In [6]:
results = []

# Dummy (most_frequent) — “наивный” baseline под дисбаланс
dummy_mf = DummyClassifier(strategy="most_frequent", random_state=RANDOM_STATE)
r = evaluate_binary(dummy_mf, X_train, y_train, X_test, y_test)
results.append(("DummyMostFrequent", r))
print("baseline DummyMostFrequent:", {k:v for k,v in r.items() if k.endswith("test")})

# Dummy (stratified)
dummy_st = DummyClassifier(strategy="stratified", random_state=RANDOM_STATE)
r = evaluate_binary(dummy_st, X_train, y_train, X_test, y_test)
results.append(("DummyStratified", r))
print("baseline DummyStratified:", {k:v for k,v in r.items() if k.endswith("test")})

# LogisticRegression baseline (из S05) — со scaler
logreg = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=5000))
])

r = evaluate_binary(logreg, X_train, y_train, X_test, y_test)
results.append(("LogReg", r))
print("baseline LogReg:", {k:v for k,v in r.items() if k.endswith("test")})


baseline DummyMostFrequent: {'acc_test': 0.9508, 'f1_test': 0.0, 'roc_auc_test': 0.5, 'y_pred_test': array([0, 0, 0, ..., 0, 0, 0], shape=(7500,)), 'y_proba_test': array([0., 0., 0., ..., 0., 0., 0.], shape=(7500,))}
baseline DummyStratified: {'acc_test': 0.9073333333333333, 'f1_test': 0.05184174624829468, 'roc_auc_test': 0.5015551017941815, 'y_pred_test': array([0, 0, 0, ..., 0, 0, 0], shape=(7500,)), 'y_proba_test': array([0., 0., 0., ..., 0., 0., 0.], shape=(7500,))}
baseline LogReg: {'acc_test': 0.962, 'f1_test': 0.39490445859872614, 'roc_auc_test': 0.8380223908816006, 'y_pred_test': array([0, 0, 0, ..., 0, 0, 0], shape=(7500,)), 'y_proba_test': array([0.01214669, 0.1460244 , 0.05760279, ..., 0.23148538, 0.00718406,
       0.16705708], shape=(7500,))}


In [7]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

search_summaries = {}

def run_search(name, estimator, param_grid):
    search = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        scoring="roc_auc",
        cv=cv,
        n_jobs=-1
    )
    search.fit(X_train, y_train)

    search_summaries[name] = {
        "best_params": search.best_params_,
        "best_cv_score_roc_auc": float(search.best_score_),
        "cv": "StratifiedKFold(n_splits=5, shuffle=True, random_state=42)",
        "scoring": "roc_auc"
    }

    return search.best_estimator_

# DecisionTree
tree_best = run_search(
    "DecisionTree",
    DecisionTreeClassifier(random_state=RANDOM_STATE),
    {
        "max_depth": [3, 5, 7, 10, None],
        "min_samples_leaf": [1, 5, 10, 20, 50],
        "class_weight": [None, "balanced"]
    }
)

r = evaluate_binary(tree_best, X_train, y_train, X_test, y_test)
results.append(("DecisionTree", r))


In [None]:
# RandomForest
rf_best = run_search(
    "RandomForest",
    RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1),
    {
        "n_estimators": [200, 500],
        "max_depth": [None, 10, 20],
        "min_samples_leaf": [1, 5, 10],
        "class_weight": [None, "balanced"]
    }
)

r = evaluate_binary(rf_best, X_train, y_train, X_test, y_test)
results.append(("RandomForest", r))


In [None]:
# Boosting (HistGradientBoosting)
hgb_best = run_search(
    "HistGradientBoosting",
    HistGradientBoostingClassifier(random_state=RANDOM_STATE),
    {
        "max_depth": [3, 5, None],
        "learning_rate": [0.03, 0.05, 0.1],
        "max_iter": [200, 500]
    }
)

r = evaluate_binary(hgb_best, X_train, y_train, X_test, y_test)
results.append(("HistGradientBoosting", r))

print("Я закончил подбор гиперпараметров на train через CV.")
display(search_summaries)

In [None]:
rows = []
for name, r in results:
    rows.append({
        "model": name,
        "acc_test": r["acc_test"],
        "f1_test": r["f1_test"],
        "roc_auc_test": r["roc_auc_test"]
    })

results_df = pd.DataFrame(rows).sort_values("roc_auc_test", ascending=False)
display(results_df)

best_model_name = results_df.iloc[0]["model"]
print("Я выбрал лучшую модель по ROC-AUC:", best_model_name)


In [None]:
best_map = {
    "DummyMostFrequent": dummy_mf,
    "DummyStratified": dummy_st,
    "LogReg": logreg,
    "DecisionTree": tree_best,
    "RandomForest": rf_best,
    "HistGradientBoosting": hgb_best
}

best_model = best_map[best_model_name]
best_res = dict(results)[best_model_name]  # берём сохранённые предикты/вероятности

save_cm(y_test, best_res["y_pred_test"], f"Confusion matrix — {best_model_name}", FIG_DIR / "best_confusion_matrix.png")
save_roc_pr(y_test, best_res["y_proba_test"], f"best_{best_model_name}")


In [None]:
# Я интерпретирую лучшую модель через permutation importance
perm = permutation_importance(
    best_model, X_test, y_test,
    n_repeats=10,
    random_state=RANDOM_STATE,
    scoring="roc_auc"
)

imp = pd.DataFrame({
    "feature": X.columns,
    "importance_mean": perm.importances_mean
}).sort_values("importance_mean", ascending=False)

top_n = 15
display(imp.head(top_n))

plt.figure(figsize=(8, 5))
plt.barh(imp.head(top_n)["feature"][::-1], imp.head(top_n)["importance_mean"][::-1])
plt.title(f"Permutation importance (top-{top_n}) — {best_model_name}")
plt.tight_layout()
plt.savefig(FIG_DIR / "best_permutation_importance.png", dpi=120)
plt.show()


In [None]:
# 1) metrics_test.csv
results_df.to_csv(ARTIFACTS_DIR / "metrics_test.csv", index=False)

# 2) search_summaries.json
with open(ARTIFACTS_DIR / "search_summaries.json", "w", encoding="utf-8") as f:
    json.dump(search_summaries, f, ensure_ascii=False, indent=2)

# 3) best_model.joblib
joblib.dump(best_model, ARTIFACTS_DIR / "best_model.joblib")

# 4) best_model_meta.json
meta = {
    "dataset": str(DATA_PATH.name),
    "random_state": RANDOM_STATE,
    "test_size": 0.30,
    "stratify": True,
    "best_model_name": best_model_name,
    "best_model_params": getattr(best_model, "get_params", lambda: {})()
}
with open(ARTIFACTS_DIR / "best_model_meta.json", "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)

print("Я сохранил артефакты в папку artifacts/.")


In [None]:
# Я формирую report.md по шаблону (заголовки не меняю)
dataset_rows = df.shape[0]
dataset_cols = df.shape[1]

t_counts = y.value_counts()
t_share = y.value_counts(normalize=True)

report = f"""# HW06 – Report

> Файл: `homeworks/HW06/report.md`

## 1. Dataset

- Какой датасет выбран: `{DATA_PATH.name}`
- Размер: ({dataset_rows}, {dataset_cols})
- Целевая переменная: `target` (0: {t_counts[0]} / {t_share[0]:.4f}, 1: {t_counts[1]} / {t_share[1]:.4f})
- Признаки: 60 числовых (`f01`…`f60`)

## 2. Protocol

- Разбиение: train/test = 70/30, `random_state={RANDOM_STATE}`, `stratify=y`
- Подбор: GridSearchCV + StratifiedKFold(5) на train, оптимизировала ROC-AUC
- Метрики: accuracy, F1 (для положительного класса 1), ROC-AUC (важно при дисбалансе)

## 3. Models

- DummyClassifier (most_frequent и stratified)
- LogisticRegression (pipeline со StandardScaler)
- DecisionTreeClassifier (max_depth, min_samples_leaf, class_weight)
- RandomForestClassifier (n_estimators, max_depth, min_samples_leaf, class_weight)
- HistGradientBoostingClassifier (max_depth, learning_rate, max_iter)

## 4. Results

{results_df.to_markdown(index=False)}

Победитель по ROC-AUC: **{best_model_name}**

## 5. Analysis

- Устойчивость: (сюда добавлю 5 прогонов с разными random_state для 1–2 моделей)
- Ошибки: confusion matrix сохранена в `artifacts/figures/best_confusion_matrix.png`
- Интерпретация: permutation importance сохранена в `artifacts/figures/best_permutation_importance.png`

## 6. Conclusion

- (3–6 тезисов)
"""

Path("report.md").write_text(report, encoding="utf-8")
print("Я записал report.md (проверьте и допишите раздел 5–6).")
