## 0. 사전 준비(모듈 import 및 데이터 불러오기)

In [2]:
%pip install xgboost --quiet

import numpy as np
import pandas as pd
from typing import Dict, List, Tuple
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    average_precision_score, confusion_matrix, classification_report
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Single seed shared by the CV splitter and every model for reproducibility.
RANDOM_STATE = 42
CV = StratifiedKFold(shuffle=True, n_splits=5, random_state=RANDOM_STATE)

# Enable the XGBoost candidate only when the package can actually be imported.
try:
    from xgboost import XGBClassifier
except Exception:
    USE_XGB = False
else:
    USE_XGB = True

# ---------------------------
# CSV 로딩 유틸 (./data 폴더)
# ---------------------------
def load_csv_from_data(filename: str | None = None, data_dir: str = "data") -> pd.DataFrame:
    """
    프로젝트 루트(노트북 실행 경로 기준)의 ./data 폴더에서 CSV를 읽어옵니다.
    - filename이 None이면 data 폴더 내 첫 번째 .csv를 자동 탐색합니다.
    """
    data_path = Path.cwd() / data_dir
    assert data_path.exists(), f"[ERROR] 폴더가 없습니다: {data_path}"

    if filename is None:
        csv_files = sorted(data_path.glob("*.csv"))
        assert len(csv_files) > 0, f"[ERROR] {data_path}에 .csv 파일이 없습니다."
        file_path = csv_files[0]
    else:
        file_path = data_path / filename
        assert file_path.exists(), f"[ERROR] 파일이 없습니다: {file_path}"

    # 인코딩/구분자 이슈에 대비한 안전 로딩
    try:
        df = pd.read_csv(file_path)
    except UnicodeDecodeError:
        df = pd.read_csv(file_path, encoding="utf-8-sig")
    except Exception as e:
        # 콤마 이외 구분자일 가능성
        try:
            df = pd.read_csv(file_path, sep=";")
        except Exception:
            raise e

    print(f"[OK] Loaded: {file_path}  shape={df.shape}")
    return df

# Auto-discover the dataset: with no filename, the first CSV found in ./data is loaded.
df = load_csv_from_data()

# Sanity check: preview the first rows and list the column names.
display(df.head())
print(df.columns.tolist())


Note: you may need to restart the kernel to use updated packages.
[OK] Loaded: c:\Users\Playdata\Desktop\개별 프로젝트들\dan2\workspace\SKN18-2nd-1Team\1-analytics\data\Customer-Churn-Records.csv  shape=(10000, 18)


c:\Users\Playdata\Desktop\개별 프로젝트들\dan2\workspace\SKN18-2nd-1Team\.venv\Scripts\python.exe: No module named pip


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Complain,Satisfaction Score,Card Type,Point Earned
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1,1,2,DIAMOND,464
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,1,3,DIAMOND,456
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1,1,3,DIAMOND,377
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0,0,5,GOLD,350
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0,0,5,GOLD,425


['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Exited', 'Complain', 'Satisfaction Score', 'Card Type', 'Point Earned']


## 1. EDA 기반 피처 설정

In [3]:
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    data = df.copy()

    # 컬럼 명에 공백/대문자 등 정리 (선택)
    # 여기서는 원본 이름 유지. 접근 시 대괄호 인덱싱 사용: data['Satisfaction Score']

    # Age Group
    data['Age_Group'] = pd.cut(
        data['Age'],
        bins=[18, 30, 40, 50, 100],
        labels=['18-30', '31-40', '41-50', '51+'],
        include_lowest=True
    )

    # Senior Flag
    data['Senior_Flag'] = (data['Age'] >= 45).astype(int)

    # 독일 플래그 + 독일&고잔액 상호작용
    data['Germany_Flag'] = (data['Geography'] == 'Germany').astype(int)
    median_balance = data['Balance'].median()
    data['Germany_HighBalance'] = ((data['Geography'] == 'Germany') & (data['Balance'] > median_balance)).astype(int)

    # 잔액/상품수 (참여도 지표)
    data['Balance_per_Product'] = data['Balance'] / (data['NumOfProducts'].replace(0, np.nan))
    data['Balance_per_Product'] = data['Balance_per_Product'].fillna(0.0)

    # 비활동 & 상품1개 (고위험)
    data['LowActive_LowProduct'] = ((data['IsActiveMember'] == 0) & (data['NumOfProducts'] == 1)).astype(int)

    # 불만 & 활동 여부 결합
    # data['Complain_Active'] = data['Complain'].astype(str) + "_" + data['IsActiveMember'].astype(str)

    # 만족도 구간 (Low/Medium/High)
    data['Satisfaction_Level'] = pd.cut(
        data['Satisfaction Score'],
        bins=[0, 2, 4, 5],
        labels=['Low', 'Medium', 'High'],
        include_lowest=True
    )

    #complain 제거
    data = data.drop(columns=['Complain'])

    return data


## 2. 피처 세트 정의 & 전처리 파이프라인

In [4]:
def get_feature_groups() -> Dict[str, List[str]]:
    """Return the model input columns grouped by preprocessing treatment.

    Returns:
        A dict with three keys: ``'numeric'`` (raw + engineered continuous
        columns), ``'binary'`` (raw + engineered 0/1 flags) and ``'onehot'``
        (categorical columns). A fresh dict is built on every call.
    """
    groups: Dict[str, List[str]] = {
        # Raw columns first ...
        'numeric': ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts'],
        'binary': ['IsActiveMember'],  # 0/1
        'onehot': ['Geography', 'Age_Group', 'Satisfaction_Level'],
    }
    # ... then the engineered columns, appended after the raw ones.
    groups['numeric'].append('Balance_per_Product')
    groups['binary'].extend(
        ['Senior_Flag', 'Germany_Flag', 'Germany_HighBalance', 'LowActive_LowProduct']
    )
    return groups

def make_preprocessor(feature_groups: Dict[str, List[str]]) -> ColumnTransformer:
    """Build the ColumnTransformer that prepares model inputs.

    Args:
        feature_groups: Mapping with the keys ``'numeric'``, ``'binary'`` and
            ``'onehot'``, each holding a list of column names.

    Returns:
        An unfitted ColumnTransformer that standard-scales the numeric
        columns, passes the binary columns through unchanged and one-hot
        encodes the categorical columns. Empty groups are skipped and any
        column not listed is dropped.
    """
    numeric_cols = feature_groups['numeric']
    binary_cols = feature_groups['binary']
    categorical_cols = feature_groups['onehot']

    # (name, factory, columns): the factory defers building a transformer
    # until the group is known to be non-empty.
    candidates = (
        ("num", lambda: Pipeline(steps=[("scaler", StandardScaler())]), numeric_cols),
        # Binary flags need no scaling, so they are passed through.
        ("bin", lambda: "passthrough", binary_cols),
        ("oh", lambda: OneHotEncoder(handle_unknown='ignore', drop=None), categorical_cols),
    )
    steps = [(label, build(), cols) for label, build, cols in candidates if cols]

    return ColumnTransformer(transformers=steps, remainder='drop', sparse_threshold=0.3)


## 3. 모델 후보군들
- 로지스틱 회귀
- 랜덤포레스트
- XGBoost

In [5]:
def get_model_zoo(class_weight_balanced: bool = True) -> Dict[str, object]:
    """Create the unfitted candidate classifiers keyed by short name.

    Args:
        class_weight_balanced: When true, logistic regression and random
            forest are created with ``class_weight='balanced'``; otherwise
            with ``class_weight=None``.

    Returns:
        A dict with ``'logreg'`` and ``'rf'``, plus ``'xgb'`` when the
        module-level ``USE_XGB`` flag is true.
    """
    class_weight = 'balanced' if class_weight_balanced else None
    models = {}

    # Logistic regression: strong, interpretable baseline.
    models['logreg'] = LogisticRegression(
        max_iter=1000,
        class_weight=class_weight,
        random_state=RANDOM_STATE,
        n_jobs=None
    )

    # Random forest: captures non-linearities and interactions.
    models['rf'] = RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        class_weight=class_weight,
        random_state=RANDOM_STATE,
        n_jobs=-1
    )

    # XGBoost: added only when the import succeeded.
    # NOTE(review): class_weight_balanced is not applied to this model —
    # confirm whether an imbalance setting is wanted here as well.
    if USE_XGB:
        models['xgb'] = XGBClassifier(
            n_estimators=400,
            max_depth=4,
            learning_rate=0.05,
            subsample=0.9,
            colsample_bytree=0.9,
            reg_lambda=1.0,
            objective='binary:logistic',
            eval_metric='auc',
            random_state=RANDOM_STATE,
            n_jobs=-1
        )
    return models


## 4. 교차검증 평가

In [6]:
# Metrics collected for every CV fold. Each entry maps a report name to the
# identically named scorer string; "average_precision" is the PR-AUC.
SCORING = {
    metric: metric
    for metric in ("roc_auc", "f1", "precision", "recall", "average_precision")
}

def evaluate_models_cv(X: pd.DataFrame, y: pd.Series,
                       preproc: ColumnTransformer,
                       models: Dict[str, object],
                       cv=CV, scoring=SCORING) -> pd.DataFrame:
    """Cross-validate every candidate model and summarise the scores.

    Each model is wrapped in a ``preprocess -> model`` pipeline and scored
    with ``cross_validate``.

    Args:
        X: Feature frame.
        y: Target labels.
        preproc: Preprocessing transformer used as the first pipeline step.
        models: Mapping of model name to estimator.
        cv: Cross-validation splitter (defaults to the module-level ``CV``).
        scoring: Dict of metric name to scorer (defaults to ``SCORING``).

    Returns:
        One row per model with ``mean_<metric>`` / ``std_<metric>`` columns,
        sorted best-first by ``mean_roc_auc`` when that metric was computed,
        otherwise by the mean of the first metric in ``scoring``. An empty
        ``models`` mapping yields an empty frame with the same columns.
    """
    metric_names = list(scoring.keys())
    # Fix the column layout up front so an empty result is still well-formed.
    columns = ["model"]
    for key in metric_names:
        columns += [f"mean_{key}", f"std_{key}"]

    rows = []
    for name, clf in models.items():
        pipe = Pipeline(steps=[("preprocess", preproc), ("model", clf)])
        cv_res = cross_validate(pipe, X, y, cv=cv, scoring=scoring, n_jobs=-1, return_train_score=False)
        row = {"model": name}
        for key in metric_names:
            fold_scores = cv_res[f"test_{key}"]
            row[f"mean_{key}"] = np.mean(fold_scores)
            row[f"std_{key}"] = np.std(fold_scores)
        rows.append(row)

    report = pd.DataFrame(rows, columns=columns)
    if report.empty or len(columns) == 1:
        # Nothing to rank (no models or no metrics).
        return report

    # Rank by ROC-AUC when available; otherwise fall back to the first metric
    # so custom `scoring` dicts do not fail on a missing column.
    sort_key = "mean_roc_auc" if "mean_roc_auc" in report.columns else columns[1]
    return report.sort_values(sort_key, ascending=False).reset_index(drop=True)


## 5. 최종 학습 & 결과 리포트

In [7]:
def train_final_and_report(X, y, preproc, clf, test_size=0.2, random_state=42):
    """Fit ``preprocess -> model`` on a stratified train split and print test metrics.

    Args:
        X: Feature frame.
        y: Target labels; also used to stratify the split.
        preproc: Preprocessing transformer used as the first pipeline step.
        clf: Estimator used as the final pipeline step.
        test_size: Fraction of the data held out for testing.
        random_state: Seed for the train/test split.

    Returns:
        The fitted pipeline.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import (
        accuracy_score, precision_score, recall_score, f1_score,
        roc_auc_score, average_precision_score, confusion_matrix, classification_report
    )

    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    pipe = Pipeline(steps=[("preprocess", preproc), ("model", clf)])
    pipe.fit(X_tr, y_tr)

    # Hard predictions plus a continuous score when the model offers one.
    y_pred = pipe.predict(X_te)
    estimator = pipe.named_steps['model']
    if hasattr(estimator, "predict_proba"):
        y_proba = pipe.predict_proba(X_te)[:, 1]
    elif hasattr(estimator, "decision_function"):
        # Without probabilities, the decision score is used for the ranking metrics.
        y_proba = pipe.decision_function(X_te)
    else:
        y_proba = None

    # Label-based metrics first, then the score-based ones if available.
    label_scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1", f1_score),
    )
    metrics = {label: scorer(y_te, y_pred) for label, scorer in label_scorers}
    if y_proba is not None:
        for label, scorer in (("ROC_AUC", roc_auc_score), ("PR_AUC", average_precision_score)):
            metrics[label] = scorer(y_te, y_proba)

    print("\n[TEST METRICS]")
    for k, v in metrics.items():
        print(f"{k:>10}: {v:.4f}")

    for heading, body in (
        ("\n[CONFUSION MATRIX]", confusion_matrix(y_te, y_pred)),
        ("\n[CLASSIFICATION REPORT]", classification_report(y_te, y_pred, digits=4)),
    ):
        print(heading)
        print(body)

    return pipe



## 6. 실제 작동 파트

In [8]:
# 1) Feature engineering on the loaded frame
data_fe = engineer_features(df)

# 2) Split target / inputs ('Exited' is the label column)
y = data_fe['Exited'].astype(int)

# Columns used as model inputs, grouped by preprocessing treatment
fg = get_feature_groups()
use_cols = fg['numeric'] + fg['binary'] + fg['onehot']

X = data_fe[use_cols].copy()

# 3) Preprocessing transformer
preproc = make_preprocessor(fg)

# 4) Candidate models & cross-validated evaluation
models = get_model_zoo(class_weight_balanced=True)
cv_report = evaluate_models_cv(X, y, preproc, models, cv=CV)
print(cv_report)

# 5) Take the top row of the CV report (sorted best-first), then run the final fit/report
best_name = cv_report.loc[0, 'model']
best_model = models[best_name]
final_pipe = train_final_and_report(X, y, preproc, best_model, test_size=0.2)


    model  mean_roc_auc  std_roc_auc   mean_f1    std_f1  mean_precision  \
0     xgb      0.861067     0.010362  0.589432  0.014393        0.750672   
1      rf      0.839562     0.007619  0.566805  0.013964        0.739924   
2  logreg      0.791242     0.008582  0.496446  0.007632        0.385136   

   std_precision  mean_recall  std_recall  mean_average_precision  \
0       0.022181     0.485292    0.011730                0.700921   
1       0.026589     0.459755    0.015329                0.656982   
2       0.001898     0.698737    0.025726                0.564875   

   std_average_precision  
0               0.013794  
1               0.016661  
2               0.010821  

[TEST METRICS]
  Accuracy: 0.8690
 Precision: 0.7607
    Recall: 0.5221
        F1: 0.6192
   ROC_AUC: 0.8723
    PR_AUC: 0.7230

[CONFUSION MATRIX]
[[1525   67]
 [ 195  213]]

[CLASSIFICATION REPORT]
              precision    recall  f1-score   support

           0     0.8866    0.9579    0.9209      1592