In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import statsmodels.api as sm

data = pd.read_csv(
        './data/train.csv',
        encoding='cp949',
)

import pre_data as eda
# 상위 디렉토리를 경로에 추가 (test.ipynb 파일 기준으로)
preprocessed_data = eda.preprocessing(data)
preprocessed_data

# 여러 분류 모델을 비교하기 위한 성능 지표 계산 함수
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_proba)
    
    return accuracy, precision, recall, f1, roc_auc

# 모델 리스트
models = [
    ('LogisticRegression', LogisticRegression()),
    ('DecisionTreeClassifier', DecisionTreeClassifier()),
    ('RandomForestClassifier', RandomForestClassifier()),
    ('GradientBoostingClassifier', GradientBoostingClassifier()),
    ('XGBClassifier', XGBClassifier()),
    ('LGBMClassifier', LGBMClassifier())
]

# 2. 데이터 준비
# 데이터셋 불러오기 및 전처리 (예시)
# X = ...  # feature 데이터
# y = ...  # 타겟 데이터
object_columns = preprocessed_data.select_dtypes(include=['object'])
convert_data, _ = eda.convert_category_into_integer(preprocessed_data, object_columns)

# 타겟 변수 및 독립 변수 설정
X = convert_data.drop('Churn', axis=1).astype(float)
y = convert_data['Churn'].astype(float)  # 이진 분류

# 데이터 스케일링 (특성값의 범위를 조정)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. 모델 성능 평가 및 결과 저장
results = []

for name, model in models:
    accuracy, precision, recall, f1, roc_auc = evaluate_model(model, X_train, X_test, y_train, y_test)
    results.append({
        'Model': name,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc
    })

# 결과를 데이터프레임으로 변환
results_df = pd.DataFrame(results)

# 4. 결과 출력
print(results_df)

# 결과 데이터프레임을 보기 좋게 정렬하기
results_df = results_df.set_index('Model')
results_df = results_df.sort_values(by='accuracy', ascending=False)
results_df


nan-value filled
MonthlyRevenue : 50887, 삭제된 수 : 1
MonthlyMinutes : 50883, 삭제된 수 : 4
TotalRecurringCharge : 50874, 삭제된 수 : 9
DirectorAssistedCalls : 50874, 삭제된 수 : 0
OverageMinutes : 50870, 삭제된 수 : 4
RoamingCalls : 50862, 삭제된 수 : 8
PercChangeMinutes : 50860, 삭제된 수 : 2
DroppedCalls : 50856, 삭제된 수 : 4
BlockedCalls : 50852, 삭제된 수 : 4
UnansweredCalls : 50842, 삭제된 수 : 10
CustomerCareCalls : 50840, 삭제된 수 : 2
ThreewayCalls : 50836, 삭제된 수 : 4
ReceivedCalls : 50830, 삭제된 수 : 6
OutboundCalls : 50827, 삭제된 수 : 3
InboundCalls : 50823, 삭제된 수 : 4
PeakCallsInOut : 50821, 삭제된 수 : 2
OffPeakCallsInOut : 50806, 삭제된 수 : 15
DroppedBlockedCalls : 50792, 삭제된 수 : 14
CallForwardingCalls : 50779, 삭제된 수 : 13
CallWaitingCalls : 50779, 삭제된 수 : 0
MonthsInService : 50778, 삭제된 수 : 1
UniqueSubs : 50777, 삭제된 수 : 1
ActiveSubs : 50777, 삭제된 수 : 0
Handsets : 50773, 삭제된 수 : 4
HandsetModels : 50773, 삭제된 수 : 0
CurrentEquipmentDays : 50768, 삭제된 수 : 5
RetentionCalls : 50634, 삭제된 수 : 134
RetentionOffersAccepted : 50634, 삭제된 수 : 0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Number of positive: 11644, number of negative: 28852
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002174 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5198
[LightGBM] [Info] Number of data points in the train set: 40496, number of used features: 56
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.287535 -> initscore=-0.907388
[LightGBM] [Info] Start training from score -0.907388
                        Model  accuracy  precision    recall        f1  \
0          LogisticRegression  0.709728   0.306818  0.009294  0.018042   
1      DecisionTreeClassifier  0.622914   0.350866  0.369707  0.360040   
2      RandomForestClassifier  0.719111   0.571765  0.083649  0.145946   
3  GradientBoostingClassifier  0.722074   0.659649  0.064716  0.117868   
4               XGBClassifier  0.708840   0.481887  0.196902  0.279570  

Unnamed: 0_level_0,accuracy,precision,recall,f1,roc_auc
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LGBMClassifier,0.723556,0.585484,0.124957,0.205957,0.674597
GradientBoostingClassifier,0.722074,0.659649,0.064716,0.117868,0.667976
RandomForestClassifier,0.719111,0.571765,0.083649,0.145946,0.653867
LogisticRegression,0.709728,0.306818,0.009294,0.018042,0.591143
XGBClassifier,0.70884,0.481887,0.196902,0.27957,0.656029
DecisionTreeClassifier,0.622914,0.350866,0.369707,0.36004,0.54725


Collecting shap
  Downloading shap-0.46.0-cp39-cp39-win_amd64.whl.metadata (25 kB)
Collecting slicer==0.0.8 (from shap)
  Downloading slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Collecting numba (from shap)
  Downloading numba-0.60.0-cp39-cp39-win_amd64.whl.metadata (2.8 kB)
Collecting llvmlite<0.44,>=0.43.0dev0 (from numba->shap)
  Downloading llvmlite-0.43.0-cp39-cp39-win_amd64.whl.metadata (4.9 kB)
Downloading shap-0.46.0-cp39-cp39-win_amd64.whl (456 kB)
Downloading slicer-0.0.8-py3-none-any.whl (15 kB)
Downloading numba-0.60.0-cp39-cp39-win_amd64.whl (2.7 MB)
   ---------------------------------------- 0.0/2.7 MB ? eta -:--:--
   ---------------------------------------- 2.7/2.7 MB 52.7 MB/s eta 0:00:00
Downloading llvmlite-0.43.0-cp39-cp39-win_amd64.whl (28.1 MB)
   ---------------------------------------- 0.0/28.1 MB ? eta -:--:--
   ----- ---------------------------------- 4.2/28.1 MB 20.9 MB/s eta 0:00:02
   -------------- ------------------------- 10.2/28.1 MB 24.5 MB/s eta

In [3]:
pip install imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.12.3-py3-none-any.whl.metadata (8.3 kB)
Downloading imbalanced_learn-0.12.3-py3-none-any.whl (258 kB)
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.12.3
Note: you may need to restart the kernel to use updated packages.
