# 신용카드 고객 이탈 예측 - 베이스라인 모델

이 노트북은 BankChurners.csv 데이터를 활용하여 간단한 이탈 예측(Churn Prediction) 베이스라인 모델을 구축합니다.

## 목차
1. 필요한 라이브러리 임포트
2. 데이터 불러오기 및 확인
3. 데이터 전처리
4. 학습/테스트 데이터 분리
5. 베이스라인 모델 학습
6. 모델 예측 및 평가

In [30]:
# 1. 필요한 라이브러리 임포트
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report


In [31]:
# 2. 데이터 불러오기 및 확인
df = pd.read_csv('../data/raw/BankChurners.csv')
print('Shape:', df.shape)
df.head()

Shape: (10127, 23)


Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,9.3e-05,0.99991
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,5.7e-05,0.99994
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,3418.0,0,3418.0,2.594,1887,20,2.333,0.0,2.1e-05,0.99998
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,...,3313.0,2517,796.0,1.405,1171,20,2.333,0.76,0.000134,0.99987
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,...,4716.0,0,4716.0,2.175,816,28,2.5,0.0,2.2e-05,0.99998


In [None]:
# 3. 데이터 전처리 (간단)
# 타겟 컬럼: 'Attrition_Flag' (이탈: 'Attrited Customer', 유지: 'Existing Customer')
df = df.copy()
df['Attrition_Binary'] = (df['Attrition_Flag'] == 'Attrited Customer').astype(int)

# 불필요한 컬럼 제거 (고객번호 등)
drop_cols = [col for col in ['CLIENTNUM', 'Attrition_Flag', "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2", "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1"] if col in df.columns]
df = df.drop(columns=drop_cols)

# 간단한 결측치 처리 (최빈값/평균 대체)
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].fillna(df[col].mode()[0])
for col in df.select_dtypes(include=np.number).columns:
    df[col] = df[col].fillna(df[col].mean())

# 범주형 변수 원-핫 인코딩
df = pd.get_dummies(df, drop_first=True)

In [33]:
df.shape


(10127, 20)

In [12]:
# 4. 학습/테스트 데이터 분리
X = df.drop('Attrition_Binary', axis=1)
y = df['Attrition_Binary']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)

Train shape: (8101, 32) Test shape: (2026, 32)


In [13]:
# 5. 베이스라인 모델 학습 (로지스틱 회귀)
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,1000


In [1]:
# 6. 모델 예측 및 평가
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

acc = accuracy_score(y_test, y_pred)
roc = roc_auc_score(y_test, y_proba)
print(f"정확도: {acc:.4f}")
print(f"ROC-AUC: {roc:.4f}")
print("\n분류 리포트:")
print(classification_report(y_test, y_pred))

NameError: name 'model' is not defined

## 간단한 결과 해석
- 위 결과는 로지스틱 회귀를 활용한 이탈 예측의 베이스라인입니다.
- 정확도(Accuracy)와 ROC-AUC, 분류 리포트를 참고해 모델의 기본 성능을 확인할 수 있습니다.
- 더 나은 성능을 위해 추가적인 전처리, 피처 엔지니어링, 다양한 모델 실험이 필요합니다.