In [1]:
pip install openpyxl

Note: you may need to restart the kernel to use updated packages.


# 고객 이탈 예측 모델 개발 과정 (Machine Learning Pipeline)

```
이 문서는 이커머스 고객 이탈 예측 프로젝트의 전체 머신러닝 파이프라인을 정리한 문서입니다.  
모델 비교 및 선정, 하이퍼파라미터 튜닝, 최종 선택 모델 선정까지의 흐름을 코드와 함께 정리합니다.
```

In [4]:
## 1. 라이브러리 불러오기 (`from` / `import`)

import pandas as pd
import numpy as np

import joblib

# 데이터 분할 및 검증
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

# 데이터 전처리
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# 평가 지표
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report
)

# 분류 모델
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier,
    BaggingClassifier,
    ExtraTreesClassifier
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# 부스팅 모델
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [5]:
import warnings
warnings.filterwarnings("ignore", message="Could not find the number of physical cores")

2. 데이터 전처리

```
    결측치 처리 (mean, dropna 등)
    범주형 인코딩 (pd.get_dummies, LabelEncoder)
    클래스 불균형 처리 (SMOTE)
    피처 스케일링 (StandardScaler)
```

In [6]:
## 2. 데이터 전처리

# 1. 데이터 로드
file_path = 'E Commerce Dataset2.xlsx'
df = pd.read_excel(file_path)

df.drop(columns=['CustomerID'], inplace=True)

# 3. 결측치 처리 - 수치형 평균으로 대체
num_cols_to_impute = [
    'Tenure', 'WarehouseToHome', 'HourSpendOnApp', 'OrderAmountHikeFromlastYear',
    'CouponUsed', 'OrderCount', 'DaySinceLastOrder'
]
for col in num_cols_to_impute:
    df[col] = df[col].fillna(df[col].mean())

# 4. 이상치 제거 (IQR 기준)
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
numeric_cols.remove('Churn')
for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    df = df[(df[col] >= lower) & (df[col] <= upper)]

# 5. 타겟 분리 및 범주형 인코딩
y = df['Churn']
X = df.drop(columns=['Churn'])
cat_cols = X.select_dtypes(include='object').columns.tolist()
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

# 7. Train/Test 분할
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# 8. SMOTE 적용 (Train에만)
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# 9. 스케일링 (Logistic 등 안정화 목적)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)


# 3. 주요 12개 머신러닝 모델 학습 및 비교

```
Logistic Regression
Decision Tree
Random Forest
Gradient Boosting
AdaBoost
Bagging
ExtraTrees
K-Nearest Neighbors
SVC
Naive Bayes
XGBoost
LightGBM
```

In [7]:
## 3. 주요 12개 머신러닝 모델 학습 및 비교

# 4. 모델 정의
models = {
    "LogisticRegression": LogisticRegression(max_iter=3000, random_state=42),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=42),
    "Bagging": BaggingClassifier(n_estimators=100, random_state=42),
    "ExtraTrees": ExtraTreesClassifier(n_estimators=100, random_state=42),
    "KNN": KNeighborsClassifier(),
    "SVC": SVC(probability=True, random_state=42),
    "GaussianNB": GaussianNB(),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42),
    "LightGBM": LGBMClassifier(random_state=42),
}

# ✅ 5. 모델 학습 및 평가
for name, model in models.items():
    print(f"\n================== {name} ==================")

    # Logistic, SVC 등 스케일링된 데이터 사용
    if name in ["LogisticRegression", "KNN", "SVC", "GaussianNB"]:
        model.fit(X_train_scaled, y_train_res)
        y_pred = model.predict(X_test_scaled)
        y_proba = model.predict_proba(X_test_scaled)[:, 1] if hasattr(model, "predict_proba") else y_pred
    else:
        model.fit(X_train_res, y_train_res)
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else y_pred

    train_acc = model.score(X_train_scaled if name in ["LogisticRegression", "KNN", "SVC", "GaussianNB"] else X_train_res, y_train_res)
    test_acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)
    delta = round(train_acc - test_acc, 4)

    print(f"Train Accuracy: {train_acc:.4f}")
    print(f"Test Accuracy:  {test_acc:.4f}")
    print(f"Precision:      {precision:.4f}")
    print(f"Recall:         {recall:.4f}")
    print(f"F1 Score:       {f1:.4f}")
    print(f"AUC:            {auc:.4f}")
    if delta > 0.1:
        print(f"⚠️ Train-Test 차이: {delta} → 과적합 가능")

    cv_f1 = cross_val_score(model,
                            X_train_scaled if name in ["LogisticRegression", "KNN", "SVC", "GaussianNB"] else X_train_res,
                            y_train_res, cv=5, scoring='f1')
    print(f"CV 평균 F1: {cv_f1.mean():.4f}, 표준편차: {cv_f1.std():.4f}")
    if cv_f1.std() > 0.05:
        print("⚠️ 교차검증 변동성 존재")

    print("\n[분류 리포트]")
    print(classification_report(y_test, y_pred))


Train Accuracy: 0.8579
Test Accuracy:  0.8293
Precision:      0.5503
Recall:         0.6739
F1 Score:       0.6059
AUC:            0.8548
CV 평균 F1: 0.8396, 표준편차: 0.0723
⚠️ 교차검증 변동성 존재

[분류 리포트]
              precision    recall  f1-score   support

           0       0.92      0.87      0.89       571
           1       0.55      0.67      0.61       138

    accuracy                           0.83       709
   macro avg       0.73      0.77      0.75       709
weighted avg       0.85      0.83      0.84       709


Train Accuracy: 1.0000
Test Accuracy:  0.8829
Precision:      0.6730
Recall:         0.7754
F1 Score:       0.7205
AUC:            0.8421
⚠️ Train-Test 차이: 0.1171 → 과적합 가능
CV 평균 F1: 0.9031, 표준편차: 0.0313

[분류 리포트]
              precision    recall  f1-score   support

           0       0.94      0.91      0.93       571
           1       0.67      0.78      0.72       138

    accuracy                           0.88       709
   macro avg       0.81      0.84      0.82   

  File "C:\Users\gobok\anaconda3\envs\gym-churn\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "C:\Users\gobok\anaconda3\envs\gym-churn\lib\subprocess.py", line 503, in run
    with Popen(*popenargs, **kwargs) as process:
  File "C:\Users\gobok\anaconda3\envs\gym-churn\lib\subprocess.py", line 971, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\gobok\anaconda3\envs\gym-churn\lib\subprocess.py", line 1456, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,


[LightGBM] [Info] Number of positive: 1827, number of negative: 1826
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000189 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2080
[LightGBM] [Info] Number of data points in the train set: 3653, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500137 -> initscore=0.000547
[LightGBM] [Info] Start training from score 0.000547
[LightGBM] [Info] Number of positive: 1827, number of negative: 1826
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000198 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2073
[LightGBM] [Info] Number of data points in the train set: 3653, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500137 -> initscore=0.000547
[LightGBM] [Info] Start training from score 0.000547
[LightGBM] [Info] Numb

In [8]:
# 하이퍼파라미터 - GridSearchCV( XGBoost )

# 1. 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# 2. XGBoost 분류기 정의
xgb = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1
)

# 3. 파라미터 후보
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# 4. GridSearchCV로 튜닝
grid = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    verbose=2,
    n_jobs=-1
)
grid.fit(X_train, y_train)

# 5. 최적 모델 저장
best_model = grid.best_estimator_
joblib.dump(best_model, "xgb_best_model.pkl")

# 6. 평가 (Train/Test)
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)
y_test_proba = best_model.predict_proba(X_test)[:, 1]

# 7. 성능 출력
print("🎯 Best Parameters:", grid.best_params_)
print("✅ Train Accuracy:", accuracy_score(y_train, y_train_pred))
print("✅ Test Accuracy :", accuracy_score(y_test, y_test_pred))
print("✅ Precision     :", precision_score(y_test, y_test_pred))
print("✅ Recall        :", recall_score(y_test, y_test_pred))
print("✅ F1 Score      :", f1_score(y_test, y_test_pred))
print("✅ AUC Score     :", roc_auc_score(y_test, y_test_proba))

# 8. 과적합 판단
train_f1 = f1_score(y_train, y_train_pred)
test_f1 = f1_score(y_test, y_test_pred)
print("⚠️ 과적합 여부 (Train - Test F1):", round(train_f1 - test_f1, 4))

# 9. 상세 분류 리포트
print("\n[분류 리포트]\n", classification_report(y_test, y_test_pred))




Fitting 5 folds for each of 72 candidates, totalling 360 fits
🎯 Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}
✅ Train Accuracy: 1.0
✅ Test Accuracy : 0.9746121297602257
✅ Precision     : 0.9838709677419355
✅ Recall        : 0.8840579710144928
✅ F1 Score      : 0.9312977099236642
✅ AUC Score     : 0.9846188989568264
⚠️ 과적합 여부 (Train - Test F1): 0.0687

[분류 리포트]
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       571
           1       0.98      0.88      0.93       138

    accuracy                           0.97       709
   macro avg       0.98      0.94      0.96       709
weighted avg       0.97      0.97      0.97       709



In [None]:
# 모델 정의
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)

# 학습 (SMOTE 적용된 데이터)
xgb_model.fit(X_train_res, y_train_res)

# 예측
y_pred = xgb_model.predict(X_test)
y_proba = xgb_model.predict_proba(X_test)[:, 1]

# 평가 지표
train_acc = xgb_model.score(X_train_res, y_train_res)
test_acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)
delta = round(train_acc - test_acc, 4)

# 출력
print("✅ XGBoost 평가 결과")
print(f"Train Accuracy: {train_acc:.4f}")
print(f"Test Accuracy : {test_acc:.4f}")
print(f"Precision     : {precision:.4f}")
print(f"Recall        : {recall:.4f}")
print(f"F1 Score      : {f1:.4f}")
print(f"AUC Score     : {auc:.4f}")
print(f"⚠️ 과적합 여부 (Train - Test F1): {round(f1_score(y_train_res, xgb_model.predict(X_train_res)) - f1, 4)}")

# 교차검증
cv_f1 = cross_val_score(xgb_model, X_train_res, y_train_res, cv=5, scoring='f1')
print(f"CV 평균 F1: {cv_f1.mean():.4f}, 표준편차: {cv_f1.std():.4f}")
if cv_f1.std() > 0.05:
    print("⚠️ 교차검증 변동성 존재")

# 분류 리포트
print("\n[분류 리포트]")
print(classification_report(y_test, y_pred))

# 모델 저장 경로 지정
save_path = "./models"
os.makedirs(save_path, exist_ok=True)

# 모델 저장
joblib.dump(best_model, os.path.join(save_path, "xgboost_best_model.pkl"))