## 필요 라이브러리 설치

In [None]:
# Install the Nanum font package (the matplotlib setup in this notebook selects NanumBarunGothic).
!sudo apt-get install -y fonts-nanum

# Rebuild the system font cache.
!sudo fc-cache -fv

# Delete matplotlib's cache directory.
# NOTE(review): presumably this forces matplotlib to re-scan fonts; a runtime restart may be needed — confirm.
!rm ~/.cache/matplotlib -rf

In [None]:
# Run this cell when working in Colab: installs the catboost package.
!pip install catboost

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score, accuracy_score, recall_score
from sklearn.linear_model import LogisticRegression
from collections import Counter
import matplotlib.pyplot as plt
# Make NanumBarunGothic the default matplotlib font family.
plt.rc('font', family='NanumBarunGothic')

In [None]:
# Ignore warnings: installs an 'ignore' filter that matches every warning.
import warnings
warnings.filterwarnings('ignore')

## 데이터 읽어오기

In [None]:
# Load the training CSV from the given path into a DataFrame.
df = pd.read_csv("/content/cell2celltrain.csv")

In [None]:
# Display the loaded DataFrame.
df

## 데이터 전처리

In [None]:
# Display only the object-dtype columns.
df.select_dtypes(include='object')

### 범주형 컬럼 확인

In [None]:
# List the names of the object-dtype columns.
df.select_dtypes(include='object').columns

### 결측치 및 미사용 컬럼 제거

In [None]:
# Drop every row that contains a missing value, then renumber the index from 0.
df = df.dropna().reset_index(drop=True)

In [None]:
# Remove the CustomerID column in place.
df.drop(columns=['CustomerID'], inplace=True)

In [None]:
# Display the DataFrame in its current state.
df

In [None]:
# Names of the columns that are numeric at this point; used later as the set of columns to scale.
# Columns that are still object dtype here (they are encoded in a later cell) are not included.
num_cols = df.select_dtypes(include='number').columns.tolist()

In [None]:
# Display the numeric column names.
num_cols

### 범주형 컬럼 인코딩

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# 1. Columns to remove: none are removed in this cell.

# 2. Binary encoding: 'Yes' -> 1 and 'No' -> 0 for every column listed below.
#    Series.map leaves any value that is not a key of the mapping as NaN.
binary_cols = [
    'Churn', 'ChildrenInHH', 'HandsetRefurbished', 'HandsetWebCapable', 'TruckOwner', 'RVOwner',
    'BuysViaMailOrder', 'RespondsToMailOffers', 'OptOutMailings',
    'NonUSTravel', 'OwnsComputer', 'HasCreditCard', 'NewCellphoneUser', 'NotNewCellphoneUser',
    'OwnsMotorcycle', 'MadeCallToRetentionTeam'
]
yes_no_codes = {'Yes': 1, 'No': 0}
encoded_columns = {name: df[name].map(yes_no_codes) for name in binary_cols}

# 2-1. Homeownership: 'Known' -> 1, 'Unknown' -> 0.
encoded_columns['Homeownership'] = df['Homeownership'].map({'Known': 1, 'Unknown': 0})

# assign() overwrites the existing columns, so the column order is unchanged.
df = df.assign(**encoded_columns)

# 3. Label encoding for the string-valued categorical columns.
label_cols = ['ServiceArea', 'PrizmCode', 'Occupation']
le = LabelEncoder()
# One shared encoder is re-fitted for each column in turn; values are cast to
# str first so every entry is encoded as text.
df = df.assign(**{
    name: le.fit_transform(df[name].astype(str)) for name in label_cols
})

# 4. HandsetPrice: replace the "Unknown" placeholder with 0, then cast the whole column to float.
df['HandsetPrice'] = df['HandsetPrice'].replace('Unknown', 0).astype(float)

# 5. CreditRating: keep only the text before the first '-' and cast it to int.
# NOTE(review): presumably the values look like '<number>-<label>' — confirm against the data.
df['CreditRating'] = df['CreditRating'].str.split('-').str[0].astype(int)

# 6. MaritalStatus: an 'Unknown' value is resolved from AgeHH2.
def process_marital(row):
    """Return the marital status for one row, resolving 'Unknown' entries.

    A status other than 'Unknown' is returned unchanged. For 'Unknown',
    the result is 'Yes' when row['AgeHH2'] is not 0, otherwise -1
    (marital status undetermined).
    """
    status = row['MaritalStatus']
    if status != 'Unknown':
        return status
    return 'Yes' if row['AgeHH2'] != 0 else -1

# Resolve 'Unknown' marital status row by row, then encode: 'Yes' -> 1, 'No' -> 0, -1 stays -1.
df['MaritalStatus'] = df.apply(process_marital, axis=1)
df['MaritalStatus'] = df['MaritalStatus'].map({'Yes': 1, 'No': 0, -1: -1})

# 7. Null handling: no missing-value handling is performed in this cell.
# NOTE(review): rows containing NaN were already removed by the earlier df.dropna() call,
# so a further df.dropna() / fillna step is only needed if that changes.

# Preprocessing complete.


### 타겟 라벨 확보

In [None]:
# Separate the target column (Churn) from the feature matrix.
y = df['Churn']
X = df.drop(columns='Churn')

### 학습 / 테스트 데이터 분할

In [None]:
# Hold out 20% of the rows as the test set; the split is stratified on y and
# uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

### 데이터 스케일링


In [None]:
from sklearn.preprocessing import RobustScaler

# Fit the scaler on the training split only, then reuse it for both splits.
# NOTE(review): num_cols was captured before the categorical encoding, so the
# columns encoded afterwards are passed through unscaled — confirm this is intended.
scaler = RobustScaler()
scaler.fit(X_train[num_cols])


def _scale_numeric(frame):
    """Return the num_cols of `frame` transformed by `scaler`, keeping the row index."""
    return pd.DataFrame(
        scaler.transform(frame[num_cols]),
        columns=num_cols,
        index=frame.index
    )


X_train_scaled_num = _scale_numeric(X_train)
X_test_scaled_num = _scale_numeric(X_test)

# Every column that is not in num_cols is kept as-is.
X_train_rest, X_test_rest = X_train.drop(columns=num_cols), X_test.drop(columns=num_cols)

# Reassemble: scaled numeric columns first, then the untouched remainder.
X_train = pd.concat([X_train_scaled_num, X_train_rest], axis=1)
X_test = pd.concat([X_test_scaled_num, X_test_rest], axis=1)

## 머신 러닝 모델 학습

In [None]:
# Class weights: total / (2 * class_count) for each of the two classes.
counter = Counter(y_train)
num_neg, num_pos = counter[0], counter[1]
total = num_pos + num_neg
weight_for_0, weight_for_1 = (total / (2 * count) for count in (num_neg, num_pos))
class_weights = {0: weight_for_0, 1: weight_for_1}

# Score store: metric name -> {"<model>_<split>": score}.
metrics_dict = {metric: {} for metric in ('F1-Score', 'Accuracy', 'Recall')}

def evaluate_model(name, model, X_train, y_train, X_test, y_test):
    """Score a fitted model on both splits and record the results in metrics_dict.

    For each split, predictions from model.predict are scored with macro F1,
    accuracy and macro recall. Each score is stored under the key
    '<name>_train' or '<name>_test' in the matching metrics_dict entry.
    Nothing is returned.
    """
    for split, features, target in (('train', X_train, y_train), ('test', X_test, y_test)):
        predicted = model.predict(features)
        key = f'{name}_{split}'
        metrics_dict['F1-Score'][key] = f1_score(target, predicted, average='macro')
        metrics_dict['Accuracy'][key] = accuracy_score(target, predicted)
        metrics_dict['Recall'][key] = recall_score(target, predicted, average='macro')

# Build the three classifiers; each is configured with class weighting.
model_lgb = lgb.LGBMClassifier(class_weight='balanced', verbosity=-1, random_state=42)
model_cat = CatBoostClassifier(
    class_weights=[weight_for_0, weight_for_1],
    random_state=42,
    task_type='CPU',
    verbose=0
)
model_lr = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)

# Fit each model on the training split and record its train/test scores.
for label, estimator in (
    ('LightGBM', model_lgb),
    ('CatBoost', model_cat),
    ('Logistic', model_lr),
):
    estimator.fit(X_train, y_train)
    evaluate_model(label, estimator, X_train, y_train, X_test, y_test)

# Summary table: one row per metric, one column per model/split key.
metrics_df = pd.DataFrame(metrics_dict).T.sort_index()
metrics_df


Unnamed: 0,LightGBM_train,LightGBM_test,XGBoost_train,XGBoost_test,CatBoost_train,CatBoost_test,RandomForest_train,RandomForest_test,Logistic_train,Logistic_test
Accuracy,0.698877,0.629685,0.724278,0.631494,0.757971,0.643754,0.56079,0.552909,0.589005,0.589087
F1-Score,0.674962,0.602057,0.700277,0.600104,0.734307,0.609225,0.549444,0.542257,0.558532,0.559277
Recall,0.713211,0.632631,0.738153,0.626121,0.771295,0.632188,0.603121,0.59671,0.583244,0.584745


## 소프트 보팅 앙상블로 결과 예측

In [None]:
import itertools
from sklearn.metrics import accuracy_score, f1_score, recall_score
import numpy as np
import pandas as pd

# 1. Base models for the ensemble, as display label -> notebook variable name.
#    model_xgb and model_rf are not created by any cell visible in this
#    notebook, so each model is looked up by name and skipped when it is not
#    defined, instead of failing the whole cell with a NameError.
#    A TabNet model is intentionally left out (it is not trained here and
#    would need X.values as input).
candidate_model_vars = {
    'LGBM': 'model_lgb',
    'XGBoost': 'model_xgb',
    'CatBoost': 'model_cat',
    'RF': 'model_rf',
    'Logistic': 'model_lr',
}
available_models = {
    label: globals()[var_name]
    for label, var_name in candidate_model_vars.items()
    if var_name in globals()
}
skipped_labels = [label for label in candidate_model_vars if label not in available_models]
if skipped_labels:
    print('Skipping models that are not defined: ' + ', '.join(skipped_labels))
if len(available_models) < 2:
    raise RuntimeError('Soft voting needs at least two fitted models.')

# Class-1 probability from every available model (train / test).
model_prob_dict_train = {
    label: model.predict_proba(X_train)[:, 1] for label, model in available_models.items()
}
model_prob_dict_test = {
    label: model.predict_proba(X_test)[:, 1] for label, model in available_models.items()
}


def _score_ensemble(y_true, member_probas):
    """Average the member probabilities, threshold at 0.5 and score the result.

    Returns a tuple (accuracy, macro F1, macro recall). F1 uses
    average='macro' so it is on the same scale as the macro recall computed
    here and the macro F1 reported for the individual models.
    """
    avg_proba = np.mean(member_probas, axis=0)
    preds = (avg_proba > 0.5).astype(int)
    return (
        accuracy_score(y_true, preds),
        f1_score(y_true, preds, average='macro'),
        recall_score(y_true, preds, average='macro'),
    )


# 2. One result row per model combination.
results = []

# 3. Every combination of two or more models.
for r in range(2, len(model_prob_dict_test) + 1):
    for combo in itertools.combinations(model_prob_dict_test, r):
        acc_train, f1_train, recall_train = _score_ensemble(
            y_train, [model_prob_dict_train[m] for m in combo]
        )
        acc_test, f1_test, recall_test = _score_ensemble(
            y_test, [model_prob_dict_test[m] for m in combo]
        )

        results.append({
            'Models': '+'.join(combo),
            'Train_Accuracy': acc_train,
            'Train_F1': f1_train,
            'Train_Recall': recall_train,
            'Test_Accuracy': acc_test,
            'Test_F1': f1_test,
            'Test_Recall': recall_test
        })

# 4. Collect into a DataFrame, best test F1 first.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='Test_F1', ascending=False).reset_index(drop=True)

In [None]:
# Display the combinations ranked by test recall (results_df itself is not reassigned).
results_df.sort_values(by='Test_Recall', ascending=False).reset_index(drop=True)

Unnamed: 0,Models,Train_Accuracy,Train_F1,Train_Recall,Test_Accuracy,Test_F1,Test_Recall
0,LGBM+CatBoost+Logistic,0.713198,0.601974,0.72645,0.63883,0.502905,0.638617
1,LGBM+CatBoost+RF+Logistic,0.696239,0.586893,0.713412,0.633102,0.503873,0.638388
2,LGBM+XGBoost+CatBoost+RF,0.721188,0.616061,0.739167,0.635916,0.50254,0.637837
3,LGBM+CatBoost,0.73664,0.630759,0.751306,0.641745,0.50063,0.637401
4,LGBM+XGBoost+CatBoost,0.735509,0.629344,0.750094,0.641142,0.50021,0.636978
5,LGBM+XGBoost+CatBoost+Logistic+TabNet,0.717469,0.601284,0.725423,0.643955,0.498372,0.636112
6,LGBM+CatBoost+RF,0.71649,0.611727,0.735508,0.632298,0.500614,0.635723
7,LGBM+XGBoost+CatBoost+RF+Logistic,0.706565,0.598204,0.723432,0.632298,0.500205,0.635407
8,LGBM+XGBoost+CatBoost+RF+Logistic+TabNet,0.705736,0.591004,0.716756,0.638127,0.498398,0.635077
9,LGBM+XGBoost+CatBoost+Logistic,0.7185,0.60899,0.732607,0.63682,0.498473,0.634897


## 소프트 보팅으로 혼동행렬, ROC 그래프 그리기

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, ConfusionMatrixDisplay,
    roc_curve, auc
)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Soft-voting members, as (estimator key, notebook variable name).
# model_xgb and model_rf are not created by any cell visible in this notebook,
# so each member is looked up by name and left out when it is not defined,
# instead of failing the whole cell with a NameError.
candidate_estimators = [
    ('lgb', 'model_lgb'),
    ('xgb', 'model_xgb'),
    ('cat', 'model_cat'),
    ('rf', 'model_rf'),
    ('lr', 'model_lr'),
]
voting_estimators = [
    (key, globals()[var_name])
    for key, var_name in candidate_estimators
    if var_name in globals()
]
if not voting_estimators:
    raise RuntimeError('No base models are defined for the voting classifier.')

# Build the soft-voting model.
voting_clf = VotingClassifier(
    estimators=voting_estimators,
    voting='soft'
)

# Fit and predict.
# NOTE(review): VotingClassifier.fit presumably re-fits clones of the members
# rather than reusing the already fitted models — confirm.
voting_clf.fit(X_train, y_train)
y_pred = voting_clf.predict(X_test)
y_proba = voting_clf.predict_proba(X_test)[:, 1]  # positive-class probability

# Evaluation metrics (macro-averaged where an average applies).
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='macro')
rec = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# Print the metrics as a table.
result_df = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score'],
    'Score': [acc, prec, rec, f1]
})
print(result_df)

# Confusion matrix of the soft-voting predictions on the test split.
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(cm)
disp.plot(cmap='Blues')
# Title the axes that the display just drew on.
disp.ax_.set_title("Confusion Matrix - Soft Voting")
plt.show()

# ROC curve of the soft-voting positive-class probabilities on the test split.
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC Curve (AUC = {roc_auc:.2f})')
# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve - Soft Voting Classifier')
ax.legend(loc='lower right')
ax.grid(True)
plt.show()

## SHAP

In [None]:
import shap

# Build a SHAP explainer for the fitted LightGBM model.
# NOTE(review): shap.Explainer presumably selects the tree algorithm for this model — confirm.
explainer_lgb = shap.Explainer(model_lgb)

# Compute SHAP values on the test features.
shap_values_lgb = explainer_lgb(X_test)

# SHAP summary plot of the computed values against the test features.
shap.summary_plot(shap_values_lgb, X_test)