# 기본 데이터

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

import pre_data as eda
import statsmodels.api as sm

# Metric helper used to compare several classification models.
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Args:
        model: Unfitted estimator exposing ``fit``, ``predict`` and
            ``predict_proba``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels. The target is expected to be binary: only the
            second ``predict_proba`` column is used and the metrics are
            called with their default averaging.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, roc_auc)`` computed on the
        test split.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Second column of predict_proba; used as the ranking score for ROC AUC.
    y_proba = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 returns the same 0.0 as the default "warn" setting but
    # without emitting UndefinedMetricWarning when a model predicts no
    # positive samples at all.
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y_test, y_proba)

    return accuracy, precision, recall, f1, roc_auc

# Shared seed: the split was already seeded, but the tree-based models were
# not, so their scores changed from run to run.
RANDOM_STATE = 0

# Model list (library defaults apart from the seed).
# LogisticRegression is left unseeded: its default solver is deterministic.
models = [
    ('LogisticRegression', LogisticRegression()),
    ('DecisionTreeClassifier', DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ('RandomForestClassifier', RandomForestClassifier(random_state=RANDOM_STATE)),
    ('GradientBoostingClassifier', GradientBoostingClassifier(random_state=RANDOM_STATE)),
    ('XGBClassifier', XGBClassifier(random_state=RANDOM_STATE)),
    ('LGBMClassifier', LGBMClassifier(random_state=RANDOM_STATE))
]

# 2. Data preparation: load the dataset and preprocess it.
data = pd.read_csv(
        './data/train.csv',
        encoding='cp949',
)

# Baseline preprocessing: drop every row that contains a missing value.
preprocessed_data = data.dropna()

# Object-dtype columns are handed to the project helper.
# NOTE(review): presumably it integer-encodes them — confirm in pre_data.
object_columns = preprocessed_data.select_dtypes(include=['object'])
convert_data, _ = eda.convert_category_into_integer(preprocessed_data, object_columns)

# Target and feature setup
X = convert_data.drop('Churn', axis=1).astype(float)
y = convert_data['Churn'].astype(float)  # binary classification

# Stratified train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# Scaling: fit on the training split only, then apply to the test split.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

# Reset indices so the labels line up positionally with the scaled frames
# (which were rebuilt from arrays and therefore have a fresh 0..n-1 index).
X_train = X_train_scaled.reset_index(drop=True)
X_test = X_test_scaled.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)


# 3. Evaluate every model and collect the results
results = []

for name, model in models:
    accuracy, precision, recall, f1, roc_auc = evaluate_model(model, X_train, X_test, y_train, y_test)
    results.append({
        'Model': name,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc
    })

# Convert the results to a DataFrame
results_df = pd.DataFrame(results)

# 4. Show the results, one row per model, best accuracy first
results_df = results_df.set_index('Model')
results_df = results_df.sort_values(by='accuracy', ascending=False)
display(results_df)

[LightGBM] [Info] Number of positive: 11396, number of negative: 28405
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007064 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5525
[LightGBM] [Info] Number of data points in the train set: 39801, number of used features: 57
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.286324 -> initscore=-0.913303
[LightGBM] [Info] Start training from score -0.913303


Model,accuracy,precision,recall,f1,roc_auc
LGBMClassifier,0.723445,0.58319,0.11934,0.198135,0.6847
RandomForestClassifier,0.721937,0.610811,0.079326,0.140416,0.659522
GradientBoostingClassifier,0.721536,0.631757,0.065637,0.118919,0.676753
XGBClassifier,0.716913,0.514414,0.200421,0.288457,0.66467
LogisticRegression,0.713094,0.483516,0.030888,0.058067,0.618681
DecisionTreeClassifier,0.620943,0.348639,0.373113,0.360461,0.546737


# 이상치 제거, 결측치 보완 및 제거

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

import pre_data as eda
import statsmodels.api as sm


# Metric helper used to compare several classification models.
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Args:
        model: Unfitted estimator exposing ``fit``, ``predict`` and
            ``predict_proba``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels. The target is expected to be binary: only the
            second ``predict_proba`` column is used and the metrics are
            called with their default averaging.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, roc_auc)`` computed on the
        test split.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Second column of predict_proba; used as the ranking score for ROC AUC.
    y_proba = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 returns the same 0.0 as the default "warn" setting but
    # without emitting UndefinedMetricWarning when a model predicts no
    # positive samples at all.
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y_test, y_proba)

    return accuracy, precision, recall, f1, roc_auc

# Shared seed: the split was already seeded, but the tree-based models were
# not, so their scores changed from run to run.
RANDOM_STATE = 0

# Model list (library defaults apart from the seed).
# LogisticRegression is left unseeded: its default solver is deterministic.
models = [
    ('LogisticRegression', LogisticRegression()),
    ('DecisionTreeClassifier', DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ('RandomForestClassifier', RandomForestClassifier(random_state=RANDOM_STATE)),
    ('GradientBoostingClassifier', GradientBoostingClassifier(random_state=RANDOM_STATE)),
    ('XGBClassifier', XGBClassifier(random_state=RANDOM_STATE)),
    ('LGBMClassifier', LGBMClassifier(random_state=RANDOM_STATE))
]

# 2. Data preparation: load the dataset and preprocess it.
data = pd.read_csv(
        './data/train.csv',
        encoding='cp949',
)

# Project preprocessing step.
# NOTE(review): presumably outlier removal plus missing-value
# imputation/removal — confirm in pre_data.preprocessing.
preprocessed_data = eda.preprocessing(data)

# Object-dtype columns are handed to the project helper.
# NOTE(review): presumably it integer-encodes them — confirm in pre_data.
object_columns = preprocessed_data.select_dtypes(include=['object'])
convert_data, _ = eda.convert_category_into_integer(preprocessed_data, object_columns)

# Target and feature setup
X = convert_data.drop('Churn', axis=1).astype(float)
y = convert_data['Churn'].astype(float)  # binary classification

# Stratified train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# Scaling: fit on the training split only, then apply to the test split.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

# Reset indices so the labels line up positionally with the scaled frames
# (which were rebuilt from arrays and therefore have a fresh 0..n-1 index).
X_train = X_train_scaled.reset_index(drop=True)
X_test = X_test_scaled.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)


# 3. Evaluate every model and collect the results
results = []

for name, model in models:
    accuracy, precision, recall, f1, roc_auc = evaluate_model(model, X_train, X_test, y_train, y_test)
    results.append({
        'Model': name,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc
    })

# Convert the results to a DataFrame
results_df = pd.DataFrame(results)

# 4. Show the results, one row per model, best accuracy first
results_df = results_df.set_index('Model')
results_df = results_df.sort_values(by='accuracy', ascending=False)
display(results_df)

[LightGBM] [Info] Number of positive: 11639, number of negative: 28857
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007026 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5250
[LightGBM] [Info] Number of data points in the train set: 40496, number of used features: 56
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.287411 -> initscore=-0.907991
[LightGBM] [Info] Start training from score -0.907991


Model,accuracy,precision,recall,f1,roc_auc
LGBMClassifier,0.725432,0.609428,0.124399,0.206621,0.679989
GradientBoostingClassifier,0.721383,0.642173,0.069072,0.124729,0.67385
XGBClassifier,0.721185,0.539367,0.204811,0.296887,0.657613
RandomForestClassifier,0.717432,0.565684,0.072509,0.128541,0.656411
LogisticRegression,0.71042,0.45339,0.03677,0.068023,0.613373
DecisionTreeClassifier,0.623704,0.349297,0.358419,0.353799,0.54456


# 상관관계 높은 컬럼 병합

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

import pre_data as eda
import statsmodels.api as sm


# Metric helper used to compare several classification models.
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Args:
        model: Unfitted estimator exposing ``fit``, ``predict`` and
            ``predict_proba``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels. The target is expected to be binary: only the
            second ``predict_proba`` column is used and the metrics are
            called with their default averaging.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, roc_auc)`` computed on the
        test split.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Second column of predict_proba; used as the ranking score for ROC AUC.
    y_proba = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 returns the same 0.0 as the default "warn" setting but
    # without emitting UndefinedMetricWarning when a model predicts no
    # positive samples at all.
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y_test, y_proba)

    return accuracy, precision, recall, f1, roc_auc

# Shared seed: the split was already seeded, but the tree-based models were
# not, so their scores changed from run to run.
RANDOM_STATE = 0

# Model list (library defaults apart from the seed).
# LogisticRegression is left unseeded: its default solver is deterministic.
models = [
    ('LogisticRegression', LogisticRegression()),
    ('DecisionTreeClassifier', DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ('RandomForestClassifier', RandomForestClassifier(random_state=RANDOM_STATE)),
    ('GradientBoostingClassifier', GradientBoostingClassifier(random_state=RANDOM_STATE)),
    ('XGBClassifier', XGBClassifier(random_state=RANDOM_STATE)),
    ('LGBMClassifier', LGBMClassifier(random_state=RANDOM_STATE))
]

# 2. Data preparation: load the dataset and preprocess it.
data = pd.read_csv(
        './data/train.csv',
        encoding='cp949',
)

# Project preprocessing step.
# NOTE(review): presumably outlier removal plus missing-value
# imputation/removal — confirm in pre_data.preprocessing.
preprocessed_data = eda.preprocessing(data)

# Object-dtype columns are handed to the project helper.
# NOTE(review): presumably it integer-encodes them — confirm in pre_data.
object_columns = preprocessed_data.select_dtypes(include=['object'])
convert_data, _ = eda.convert_category_into_integer(preprocessed_data, object_columns)

# Target and feature setup
X = convert_data.drop('Churn', axis=1).astype(float)
y = convert_data['Churn'].astype(float)  # binary classification

# Stratified train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# Merge correlated columns with the project helper.
# NOTE(review): presumably PCA-based, with 0.1 as the correlation
# threshold — confirm in pre_data.pca_merge_correlated_columns.
X_train_pca, X_test_pca = eda.pca_merge_correlated_columns(X_train, X_test, 0.1)

# Scaling: fit on the training split only, then apply to the test split.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_pca), columns=X_train_pca.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test_pca), columns=X_test_pca.columns)

# Reset indices so the labels line up positionally with the scaled frames
# (which were rebuilt from arrays and therefore have a fresh 0..n-1 index).
X_train = X_train_scaled.reset_index(drop=True)
X_test = X_test_scaled.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)


# 3. Evaluate every model and collect the results
results = []

for name, model in models:
    accuracy, precision, recall, f1, roc_auc = evaluate_model(model, X_train, X_test, y_train, y_test)
    results.append({
        'Model': name,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc
    })

# Convert the results to a DataFrame
results_df = pd.DataFrame(results)

# 4. Show the results, one row per model, best accuracy first
results_df = results_df.set_index('Model')
results_df = results_df.sort_values(by='accuracy', ascending=False)
display(results_df)

[LightGBM] [Info] Number of positive: 11639, number of negative: 28857
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000750 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1365
[LightGBM] [Info] Number of data points in the train set: 40496, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.287411 -> initscore=-0.907991
[LightGBM] [Info] Start training from score -0.907991


Model,accuracy,precision,recall,f1,roc_auc
LGBMClassifier,0.720099,0.588785,0.086598,0.150989,0.638848
GradientBoostingClassifier,0.716938,0.58871,0.050172,0.092464,0.638272
LogisticRegression,0.712,0.463415,0.013058,0.025401,0.583056
XGBClassifier,0.710519,0.487148,0.13677,0.213577,0.62117
RandomForestClassifier,0.709531,0.477698,0.114089,0.184189,0.61234
DecisionTreeClassifier,0.612148,0.33179,0.344674,0.338109,0.532351


# XGBoost가 선택한 4개 컬럼으로 측정한 모델

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

import pre_data as eda
import statsmodels.api as sm


# Metric helper used to compare several classification models.
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Args:
        model: Unfitted estimator exposing ``fit``, ``predict`` and
            ``predict_proba``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels. The target is expected to be binary: only the
            second ``predict_proba`` column is used and the metrics are
            called with their default averaging.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, roc_auc)`` computed on the
        test split.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Second column of predict_proba; used as the ranking score for ROC AUC.
    y_proba = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 returns the same 0.0 as the default "warn" setting but
    # without emitting UndefinedMetricWarning when a model predicts no
    # positive samples at all.
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y_test, y_proba)

    return accuracy, precision, recall, f1, roc_auc

# Shared seed: the split was already seeded, but the tree-based models were
# not, so their scores changed from run to run.
RANDOM_STATE = 0

# Model list (library defaults apart from the seed).
# LogisticRegression is left unseeded: its default solver is deterministic.
models = [
    ('LogisticRegression', LogisticRegression()),
    ('DecisionTreeClassifier', DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ('RandomForestClassifier', RandomForestClassifier(random_state=RANDOM_STATE)),
    ('GradientBoostingClassifier', GradientBoostingClassifier(random_state=RANDOM_STATE)),
    ('XGBClassifier', XGBClassifier(random_state=RANDOM_STATE)),
    ('LGBMClassifier', LGBMClassifier(random_state=RANDOM_STATE))
]

# 2. Data preparation: load the dataset and preprocess it.
data = pd.read_csv(
        './data/train.csv',
        encoding='cp949',
)

# Project preprocessing step.
# NOTE(review): presumably outlier removal plus missing-value
# imputation/removal — confirm in pre_data.preprocessing.
preprocessed_data = eda.preprocessing(data)

# Object-dtype columns are handed to the project helper.
# NOTE(review): presumably it integer-encodes them — confirm in pre_data.
object_columns = preprocessed_data.select_dtypes(include=['object'])
convert_data, _ = eda.convert_category_into_integer(preprocessed_data, object_columns)

# Target and feature setup
X = convert_data.drop('Churn', axis=1).astype(float)
# Keep only the four features used for this experiment.
X = X[['CurrentEquipmentDays', 'MonthsInService', 'HandsetRefurbished', 'AdjustmentsToCreditRating']]
y = convert_data['Churn'].astype(float)  # binary classification

# Stratified train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# Merge correlated columns with the project helper.
# NOTE(review): presumably PCA-based, with 0.1 as the correlation
# threshold, so the selected features may be reduced further — confirm in
# pre_data.pca_merge_correlated_columns.
X_train_pca, X_test_pca = eda.pca_merge_correlated_columns(X_train, X_test, 0.1)

# Scaling: fit on the training split only, then apply to the test split.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_pca), columns=X_train_pca.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test_pca), columns=X_test_pca.columns)

# Reset indices so the labels line up positionally with the scaled frames
# (which were rebuilt from arrays and therefore have a fresh 0..n-1 index).
X_train = X_train_scaled.reset_index(drop=True)
X_test = X_test_scaled.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)


# 3. Evaluate every model and collect the results
results = []

for name, model in models:
    accuracy, precision, recall, f1, roc_auc = evaluate_model(model, X_train, X_test, y_train, y_test)
    results.append({
        'Model': name,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc
    })

# Convert the results to a DataFrame
results_df = pd.DataFrame(results)

# 4. Show the results, one row per model, best accuracy first
results_df = results_df.set_index('Model')
results_df = results_df.sort_values(by='accuracy', ascending=False)
display(results_df)

[LightGBM] [Info] Number of positive: 11639, number of negative: 28857
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000494 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 270
[LightGBM] [Info] Number of data points in the train set: 40496, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.287411 -> initscore=-0.907991
[LightGBM] [Info] Start training from score -0.907991


Model,accuracy,precision,recall,f1,roc_auc
LogisticRegression,0.711704,0.32,0.002749,0.005451,0.57524
GradientBoostingClassifier,0.711506,0.176471,0.001031,0.00205,0.604493
LGBMClassifier,0.711506,0.296296,0.002749,0.005448,0.601675
XGBClassifier,0.710519,0.367089,0.009966,0.019404,0.602371
DecisionTreeClassifier,0.661926,0.329114,0.169759,0.223985,0.550176
RandomForestClassifier,0.656593,0.33135,0.191409,0.242649,0.561594


# LGBM이 선택한 컬럼

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

import pre_data as eda
import statsmodels.api as sm


# Metric helper used to compare several classification models.
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Args:
        model: Unfitted estimator exposing ``fit``, ``predict`` and
            ``predict_proba``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels. The target is expected to be binary: only the
            second ``predict_proba`` column is used and the metrics are
            called with their default averaging.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, roc_auc)`` computed on the
        test split.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Second column of predict_proba; used as the ranking score for ROC AUC.
    y_proba = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 returns the same 0.0 as the default "warn" setting but
    # without emitting UndefinedMetricWarning when a model predicts no
    # positive samples at all.
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y_test, y_proba)

    return accuracy, precision, recall, f1, roc_auc

# Shared seed: the split was already seeded, but the tree-based models were
# not, so their scores changed from run to run.
RANDOM_STATE = 0

# Model list (library defaults apart from the seed).
# LogisticRegression is left unseeded: its default solver is deterministic.
models = [
    ('LogisticRegression', LogisticRegression()),
    ('DecisionTreeClassifier', DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ('RandomForestClassifier', RandomForestClassifier(random_state=RANDOM_STATE)),
    ('GradientBoostingClassifier', GradientBoostingClassifier(random_state=RANDOM_STATE)),
    ('XGBClassifier', XGBClassifier(random_state=RANDOM_STATE)),
    ('LGBMClassifier', LGBMClassifier(random_state=RANDOM_STATE))
]

# 2. Data preparation: load the dataset and preprocess it.
data = pd.read_csv(
        './data/train.csv',
        encoding='cp949',
)

# Project preprocessing step.
# NOTE(review): presumably outlier removal plus missing-value
# imputation/removal — confirm in pre_data.preprocessing.
preprocessed_data = eda.preprocessing(data)

# Object-dtype columns are handed to the project helper.
# NOTE(review): presumably it integer-encodes them — confirm in pre_data.
object_columns = preprocessed_data.select_dtypes(include=['object'])
convert_data, _ = eda.convert_category_into_integer(preprocessed_data, object_columns)

# Target and feature setup
X = convert_data.drop('Churn', axis=1).astype(float)
# Keep only the eight features used for this experiment.
X = X[[
    'PercChangeMinutes',
    'ServiceArea',
    'PercChangeRevenues',
    'MonthlyMinutes',
    'CurrentEquipmentDays',
    'MonthlyRevenue',
    'OffPeakCallsInOut',
    'MonthsInService',
]]
y = convert_data['Churn'].astype(float)  # binary classification

# Stratified train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# Merge correlated columns with the project helper.
# NOTE(review): presumably PCA-based, with 0.1 as the correlation
# threshold, so the selected features may be reduced further — confirm in
# pre_data.pca_merge_correlated_columns.
X_train_pca, X_test_pca = eda.pca_merge_correlated_columns(X_train, X_test, 0.1)

# Scaling: fit on the training split only, then apply to the test split.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_pca), columns=X_train_pca.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test_pca), columns=X_test_pca.columns)

# Reset indices so the labels line up positionally with the scaled frames
# (which were rebuilt from arrays and therefore have a fresh 0..n-1 index).
X_train = X_train_scaled.reset_index(drop=True)
X_test = X_test_scaled.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)


# 3. Evaluate every model and collect the results
results = []

for name, model in models:
    accuracy, precision, recall, f1, roc_auc = evaluate_model(model, X_train, X_test, y_train, y_test)
    results.append({
        'Model': name,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc
    })

# Convert the results to a DataFrame
results_df = pd.DataFrame(results)

# 4. Show the results, one row per model, best accuracy first
results_df = results_df.set_index('Model')
results_df = results_df.sort_values(by='accuracy', ascending=False)
display(results_df)

[LightGBM] [Info] Number of positive: 11639, number of negative: 28857
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000253 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1019
[LightGBM] [Info] Number of data points in the train set: 40496, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.287411 -> initscore=-0.907991
[LightGBM] [Info] Start training from score -0.907991


Model,accuracy,precision,recall,f1,roc_auc
LGBMClassifier,0.717136,0.563889,0.069759,0.124159,0.620406
GradientBoostingClassifier,0.715457,0.571429,0.039863,0.074526,0.622673
LogisticRegression,0.712,0.285714,0.001375,0.002736,0.569801
XGBClassifier,0.709728,0.477237,0.104467,0.171412,0.598869
RandomForestClassifier,0.702321,0.443844,0.141237,0.214286,0.58605
DecisionTreeClassifier,0.610173,0.323459,0.32646,0.324953,0.525531


# Decision Tree 선택 2컬럼 모델

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

import pre_data as eda
import statsmodels.api as sm


# Metric helper used to compare several classification models.
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Args:
        model: Unfitted estimator exposing ``fit``, ``predict`` and
            ``predict_proba``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels. The target is expected to be binary: only the
            second ``predict_proba`` column is used and the metrics are
            called with their default averaging.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, roc_auc)`` computed on the
        test split.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Second column of predict_proba; used as the ranking score for ROC AUC.
    y_proba = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 returns the same 0.0 as the default "warn" setting but
    # without emitting UndefinedMetricWarning when a model predicts no
    # positive samples at all.
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y_test, y_proba)

    return accuracy, precision, recall, f1, roc_auc

# Shared seed: the split was already seeded, but the tree-based models were
# not, so their scores changed from run to run.
RANDOM_STATE = 0

# Model list (library defaults apart from the seed).
# LogisticRegression is left unseeded: its default solver is deterministic.
models = [
    ('LogisticRegression', LogisticRegression()),
    ('DecisionTreeClassifier', DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ('RandomForestClassifier', RandomForestClassifier(random_state=RANDOM_STATE)),
    ('GradientBoostingClassifier', GradientBoostingClassifier(random_state=RANDOM_STATE)),
    ('XGBClassifier', XGBClassifier(random_state=RANDOM_STATE)),
    ('LGBMClassifier', LGBMClassifier(random_state=RANDOM_STATE))
]

# 2. Data preparation: load the dataset and preprocess it.
data = pd.read_csv(
        './data/train.csv',
        encoding='cp949',
)

# Project preprocessing step.
# NOTE(review): presumably outlier removal plus missing-value
# imputation/removal — confirm in pre_data.preprocessing.
preprocessed_data = eda.preprocessing(data)

# Object-dtype columns are handed to the project helper.
# NOTE(review): presumably it integer-encodes them — confirm in pre_data.
object_columns = preprocessed_data.select_dtypes(include=['object'])
convert_data, _ = eda.convert_category_into_integer(preprocessed_data, object_columns)

# Target and feature setup
X = convert_data.drop('Churn', axis=1).astype(float)
# Keep only the two features used for this experiment.
X = X[['CurrentEquipmentDays', 'MonthsInService']]
y = convert_data['Churn'].astype(float)  # binary classification

# Stratified train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# Merge correlated columns with the project helper.
# NOTE(review): presumably PCA-based, with 0.1 as the correlation
# threshold, so the selected features may be reduced further — confirm in
# pre_data.pca_merge_correlated_columns.
X_train_pca, X_test_pca = eda.pca_merge_correlated_columns(X_train, X_test, 0.1)

# Scaling: fit on the training split only, then apply to the test split.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_pca), columns=X_train_pca.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test_pca), columns=X_test_pca.columns)

# Reset indices so the labels line up positionally with the scaled frames
# (which were rebuilt from arrays and therefore have a fresh 0..n-1 index).
X_train = X_train_scaled.reset_index(drop=True)
X_test = X_test_scaled.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)


# 3. Evaluate every model and collect the results
results = []

for name, model in models:
    accuracy, precision, recall, f1, roc_auc = evaluate_model(model, X_train, X_test, y_train, y_test)
    results.append({
        'Model': name,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc
    })

# Convert the results to a DataFrame
results_df = pd.DataFrame(results)

# 4. Show the results, one row per model, best accuracy first
results_df = results_df.set_index('Model')
results_df = results_df.sort_values(by='accuracy', ascending=False)
display(results_df)

[LightGBM] [Info] Number of positive: 11639, number of negative: 28857
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000058 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 40496, number of used features: 1
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.287411 -> initscore=-0.907991
[LightGBM] [Info] Start training from score -0.907991


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model,accuracy,precision,recall,f1,roc_auc
LGBMClassifier,0.712593,0.0,0.0,0.0,0.596655
GradientBoostingClassifier,0.712198,0.166667,0.000344,0.000686,0.596958
XGBClassifier,0.712099,0.467532,0.012371,0.024104,0.599453
LogisticRegression,0.711506,0.210526,0.001375,0.002731,0.567557
DecisionTreeClassifier,0.668642,0.324941,0.141924,0.19756,0.554786
RandomForestClassifier,0.663605,0.332884,0.169759,0.224852,0.565857
