# Stroke Prediction – Modeling
EDA 전처리를 기반으로 **stroke(0/1)** 분류 모델을 학습한다.  
클래스 불균형을 고려해 `class_weight='balanced'`를 사용하고, **ROC-AUC/PR-AUC·Recall** 중심으로 평가한다.  
필요 시 임곗값(threshold)을 조정해 **Recall 목표**를 만족시킨다.

### 주요 단계
1. 데이터 로드 & 학습/검증 데이터 분리
2. 전처리 파이프라인 구성 (One-Hot Encoding + Scaling)
3. 베이스라인 모델 (Logistic Regression)
4. 성능 평가 (ROC-AUC, Recall 중심)
5. 시각화 (혼동행렬, ROC/PR Curve)
6. 추가 모델 비교 (RandomForest, XGBoost)

In [3]:
# 라이브러리
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    precision_recall_curve, roc_curve, confusion_matrix, average_precision_score
)

# 경로/폴더
DATA_PATH = "../data/stroke_clean.csv"   # 01_eda.ipynb에서 저장한 파일
FIG_DIR   = "../figures"
os.makedirs(FIG_DIR, exist_ok=True)

# 1) 데이터 로드 (클린 CSV 필수)
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"클린 데이터가 없습니다: {DATA_PATH} (01_eda.ipynb에서 저장하세요)")
df = pd.read_csv(DATA_PATH)

# 2) 피처/타깃 및 컬럼 정의
target = 'stroke'
num_cols = ['age', 'avg_glucose_level', 'bmi', 'hypertension', 'heart_disease']  # 이진 포함
cat_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

X = df.drop(columns=[target])
y = df[target].astype(int)

# 3) 학습/검증 분리(계층화)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 4) 전처리 파이프라인
numeric_tf = Pipeline(steps=[('scaler', StandardScaler())])
categorical_tf = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocess = ColumnTransformer(
    transformers=[
        ('num', numeric_tf, num_cols),
        ('cat', categorical_tf, cat_cols)
    ]
)

# 5) 모델: 로지스틱(불균형 가중치)
clf = Pipeline(steps=[
    ('prep', preprocess),
    ('logit', LogisticRegression(max_iter=1000, class_weight='balanced'))
])
clf.fit(X_train, y_train)

# 6) 기본 임곗값(0.5) 성능
y_pred  = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

acc  = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec  = recall_score(y_test, y_pred)
f1   = f1_score(y_test, y_pred)
roc  = roc_auc_score(y_test, y_proba)
pr_auc = average_precision_score(y_test, y_proba)

print(f"[thr=0.5] Acc={acc:.3f}  Prec={prec:.3f}  Rec={rec:.3f}  F1={f1:.3f}  ROC-AUC={roc:.3f}  PR-AUC={pr_auc:.3f}")

# ROC 곡선 저장
fpr, tpr, _ = roc_curve(y_test, y_proba)
plt.figure()
plt.plot(fpr, tpr, label=f'ROC-AUC={roc:.3f}')
plt.plot([0,1],[0,1],'--',linewidth=1)
plt.xlabel('False Positive Rate'); plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Logistic (balanced)')
plt.legend(loc='lower right'); plt.tight_layout()
plt.savefig(f"{FIG_DIR}/roc_logit.png", bbox_inches='tight', dpi=150)
plt.close()

# PR 곡선 저장
prec_arr, rec_arr, thr_arr = precision_recall_curve(y_test, y_proba)
plt.figure()
plt.plot(rec_arr, prec_arr, label=f'PR-AUC={pr_auc:.3f}')
plt.xlabel('Recall'); plt.ylabel('Precision')
plt.title('Precision-Recall Curve - Logistic (balanced)')
plt.legend(loc='lower left'); plt.tight_layout()
plt.savefig(f"{FIG_DIR}/pr_logit.png", bbox_inches='tight', dpi=150)
plt.close()

# 혼동행렬 저장
def save_confmat(y_true, y_hat, title, outfile):
    cm = confusion_matrix(y_true, y_hat)
    plt.figure()
    plt.imshow(cm, cmap='Blues')
    plt.title(title); plt.xlabel('Predicted'); plt.ylabel('Actual')
    for (i,j), v in np.ndenumerate(cm):
        plt.text(j, i, f"{v}", ha='center', va='center')
    plt.tight_layout()
    plt.savefig(outfile, bbox_inches='tight', dpi=150)
    plt.close()

save_confmat(y_test, y_pred, "Confusion Matrix (thr=0.5)", f"{FIG_DIR}/confmat_logit_default.png")

# 7) 임곗값 튜닝 — 목표 Recall 맞추기
target_recall = 0.70
best_thr = 0.5
idx = np.argmax(rec_arr >= target_recall)
if idx > 0 and idx <= len(thr_arr):
    best_thr = thr_arr[idx-1]

y_pred_tuned = (y_proba >= best_thr).astype(int)
acc_t  = accuracy_score(y_test, y_pred_tuned)
prec_t = precision_score(y_test, y_pred_tuned, zero_division=0)
rec_t  = recall_score(y_test, y_pred_tuned)
f1_t   = f1_score(y_test, y_pred_tuned)

print(f"[tuned ≥{target_recall:.2f}] thr={best_thr:.3f} | Acc={acc_t:.3f}  Prec={prec_t:.3f}  Rec={rec_t:.3f}  F1={f1_t:.3f}")

save_confmat(y_test, y_pred_tuned, f"Confusion Matrix (thr={best_thr:.3f})", f"{FIG_DIR}/confmat_logit_tuned.png")

# 8) 요약파일 저장
summary = pd.DataFrame([
    {"Model":"Logistic (balanced)", "Threshold":0.50,     "ROC-AUC":roc, "PR-AUC":pr_auc,
     "Recall":rec, "Precision":prec, "F1":f1, "Accuracy":acc},
    {"Model":"Logistic (balanced)", "Threshold":best_thr, "ROC-AUC":roc, "PR-AUC":pr_auc,
     "Recall":rec_t, "Precision":prec_t, "F1":f1_t, "Accuracy":acc_t}
]).round(3)
summary.to_csv(f"{FIG_DIR}/metrics_logit.csv", index=False)

print("\nSaved:")
print(f"- ROC: {FIG_DIR}/roc_logit.png")
print(f"- PR : {FIG_DIR}/pr_logit.png")
print(f"- CM : {FIG_DIR}/confmat_logit_default.png, {FIG_DIR}/confmat_logit_tuned.png")
print(f"- CSV: {FIG_DIR}/metrics_logit.csv")


[0.5] Acc=0.738  Prec=0.134  Rec=0.800  F1=0.230  ROC-AUC=0.839  PR-AUC=0.259
[tuned ≥0.70] thr=0.500 | Acc=0.738  Prec=0.134  Rec=0.800  F1=0.230

Saved:
- ROC: ../figures/roc_logit.png
- PR : ../figures/pr_logit.png
- CM : ../figures/confmat_logit_default.png, ../figures/confmat_logit_tuned.png
- CSV: ../figures/metrics_logit.csv


## 모델링 결과 요약 — Logistic Regression (class_weight=balanced)

### 데이터/전처리
- BMI 결측값 중앙값 대체, `gender='Other'` 제거  
- 범주형 원-핫, 수치형 표준화  
- 분할: Stratified 80/20

### 지표 (실행 결과)
- **Accuracy**: 0.738  
- **Precision**: 0.134  
- **Recall**: 0.800  
- **F1-score**: 0.230  
- **ROC-AUC**: 0.839  
- **PR-AUC**: 0.259  

### 산출물
- 혼동행렬: `../figures/confmat_logit_default.png`, `../figures/confmat_logit_tuned.png`  
- ROC 곡선: `../figures/roc_logit.png`  
- PR 곡선: `../figures/pr_logit.png`  
- 지표 CSV: `../figures/metrics_logit.csv`

### 해석
- 불균형 가중치로 **재현율(Recall) 0.80** 확보(소수 클래스 미탐지 감소).  
- **정밀도(Precision) 0.13**로 오탐이 많음 → RF/XGBoost, SMOTE 등으로 개선 여지.  
- ROC-AUC 0.839로 분별력 양호.
