# 모델들을 한 파일에 담아서 봅시다

### 모델 순서 
1. 로지스틱
2. 의사결정트리
3. 랜덤포레스트
4. 그라이언트 부스팅
5. XGBoost

*다른 것들은 생략하고 일단 모델들과 점수만 봅시다.*

In [23]:
#데이터 용 pandas
import pandas as pd
import numpy as np
#데이터 시각화
import matplotlib.pyplot as plt
import seaborn as sns
#모델 split용도
from sklearn.model_selection import train_test_split, GridSearchCV
#의사결정 트리
from sklearn.tree import DecisionTreeClassifier
#랜덤포레스트
from sklearn.ensemble import RandomForestClassifier
#그레디언트 부스팅
from sklearn.ensemble import GradientBoostingClassifier
#XGboost
import xgboost as xgb
#모델
from sklearn.preprocessing import StandardScaler, LabelEncoder
#로지스틱회귀
from sklearn.linear_model import LogisticRegression
#모델 점수 확인
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix


In [16]:
data = pd.read_csv("Model_Preprocess.csv")

In [17]:
data.columns

Index(['Complains', 'Charge Amount', 'Distinct Called Numbers', 'Tariff Plan',
       'Status', 'Customer Value', 'Churn', 'Usage Index'],
      dtype='object')

In [18]:
data['Usage Index'].describe()

count    3.030000e+03
mean     8.911097e-17
std      2.289222e+00
min     -2.986032e+00
25%     -1.878309e+00
50%     -5.465197e-01
75%      1.579408e+00
max      5.888933e+00
Name: Usage Index, dtype: float64

In [19]:

# 그래프에서 한글이 깨지면 사용합니다.
plt.rcParams['font.family'] ='Malgun Gothic'
plt.rcParams['axes.unicode_minus'] =False

In [20]:
# 타겟 변수와 피쳐 변수로 분할
X = data.drop(columns=['Churn'])
y = data['Churn']

### 1. 로지스틱

Best Parameters: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}  
Accuracy: 0.91

In [24]:
#로지스틱회귀
target_column = 'Churn'

# 범주형 변수 인코딩
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    if column != target_column:
        label_encoders[column] = LabelEncoder()
        data[column] = label_encoders[column].fit_transform(data[column])

# 타겟 변수 인코딩
label_encoders[target_column] = LabelEncoder()
data[target_column] = label_encoders[target_column].fit_transform(data[target_column])

# 특징 변수와 타겟 변수 분리
X = data.drop(columns=[target_column])
y = data[target_column]

# 학습 데이터와 테스트 데이터로 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=4)

# 특징 스케일링
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 로지스틱 회귀 모델 및 하이퍼파라미터 튜닝
param_grid = {
    'C': [0.1, 1, 10, 100],  # 정규화 강도
    'penalty': ['l1', 'l2'],  # 페널티 유형
    'solver': ['liblinear']  # 최적화 알고리즘
}

grid_search = GridSearchCV(LogisticRegression(random_state=4), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# 최적의 모델 선택
best_model = grid_search.best_estimator_

# 모델 예측
y_pred = best_model.predict(X_test)

# 모델 평가
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Best Parameters: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
Accuracy: 0.91
Confusion Matrix:
[[775   4]
 [ 77  53]]
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.99      0.95       779
           1       0.93      0.41      0.57       130

    accuracy                           0.91       909
   macro avg       0.92      0.70      0.76       909
weighted avg       0.91      0.91      0.90       909



### 2. 의사결정 트리 

Best Hyperparameters: {'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 10}   
Accuracy: 0.911991199119912

In [25]:
# 타겟 변수와 피쳐 변수로 분할
X = data.drop(columns=['Churn'])
y = data['Churn']

# 학습 데이터와 테스트 데이터로 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=4)

# 의사결정 트리 분류모델 초기화
dt_classifier = DecisionTreeClassifier(random_state=4)

# 하이퍼파라미터 그리드 설정
param_grid = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2', None]
}

# GridSearchCV를 사용한 하이퍼파라미터 튜닝
grid_search = GridSearchCV(estimator=dt_classifier, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

# 모델 학습
grid_search.fit(X_train, y_train)

# 최적의 하이퍼파라미터 출력
best_params = grid_search.best_params_

# 최적의 하이퍼파라미터로 모델 생성 및 학습
best_dt_classifier = DecisionTreeClassifier(**best_params, random_state=4)
best_dt_classifier.fit(X_train, y_train)

# 예측 수행
y_pred = best_dt_classifier.predict(X_test)

# 모델 평가
accuracy_dt = accuracy_score(y_test, y_pred)
report_dt = classification_report(y_test, y_pred)

print(f'Best Hyperparameters: {best_params}')
print(f'Accuracy: {accuracy_dt}')
print('Classification Report:')
print(report_dt)


Fitting 5 folds for each of 144 candidates, totalling 720 fits
Best Hyperparameters: {'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 10}
Accuracy: 0.911991199119912
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95       779
           1       0.77      0.55      0.64       130

    accuracy                           0.91       909
   macro avg       0.85      0.76      0.79       909
weighted avg       0.91      0.91      0.91       909





### 3. 랜덤 포레스트

Best Hyperparameters: {'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}  
Accuracy: 0.935093509350935

In [27]:

# 타겟 변수와 피쳐 변수로 분할
X = data.drop(columns=['Churn'])
y = data['Churn']


# # 랜덤 포레스트 분류기 초기화
rf_classifier = RandomForestClassifier(random_state=4)

# 하이퍼파라미터 그리드 설정
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# # GridSearchCV를 사용한 하이퍼파라미터 튜닝
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

# 모델 학습
grid_search.fit(X_train, y_train)

# 최적의 하이퍼파라미터 출력
best_params = grid_search.best_params_

# 최적의 하이퍼파라미터로 모델 생성 및 학습
best_rf_classifier = RandomForestClassifier(**best_params, random_state=4)
best_rf_classifier.fit(X_train, y_train)

# 예측 수행
y_pred = best_rf_classifier.predict(X_test)

# 모델 평가
accuracy_rf = accuracy_score(y_test, y_pred)
report_rf = classification_report(y_test, y_pred)

print(f'Best Hyperparameters: {best_params}')
print(f'Accuracy: {accuracy_rf}')
print('Classification Report:')
print(report_rf)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits


  warn(
  warn(


Best Hyperparameters: {'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Accuracy: 0.935093509350935
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.98      0.96       779
           1       0.83      0.68      0.75       130

    accuracy                           0.94       909
   macro avg       0.89      0.83      0.86       909
weighted avg       0.93      0.94      0.93       909



### 4. 그라이언트 부스팅

Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 5, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}  
Accuracy: 0.9284928492849285

In [28]:
# 타겟 변수와 피쳐 변수로 분할
X = data.drop(columns=['Churn'])
y = data['Churn']

# 그라디언트 부스팅 분류기 초기화
gb_classifier = GradientBoostingClassifier(random_state=4)

# 하이퍼파라미터 그리드 설정
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# GridSearchCV를 사용한 하이퍼파라미터 튜닝
grid_search = GridSearchCV(estimator=gb_classifier, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

# 모델 학습
grid_search.fit(X_train, y_train)

# 최적의 하이퍼파라미터 출력
best_params = grid_search.best_params_

# 최적의 하이퍼파라미터로 모델 생성 및 학습
best_gb_classifier = GradientBoostingClassifier(**best_params, random_state=4)
best_gb_classifier.fit(X_train, y_train)

# 예측 수행
y_pred = best_gb_classifier.predict(X_test)

# 모델 평가
accuracy_gb = accuracy_score(y_test, y_pred)
report_gb = classification_report(y_test, y_pred)

print(f'Best Hyperparameters: {best_params}')
print(f'Accuracy: {accuracy_gb}')
print('Classification Report:')

print(report_gb)

Fitting 5 folds for each of 729 candidates, totalling 3645 fits




Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 5, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
Accuracy: 0.9284928492849285
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       779
           1       0.79      0.68      0.73       130

    accuracy                           0.93       909
   macro avg       0.87      0.83      0.85       909
weighted avg       0.93      0.93      0.93       909





### 5. XGBoosting

XGBoosting은 최적의 하이퍼 파라미터 찾는 과정이 아직 없습니다.

XGBoost Accuracy: 0.9130913091309131

In [29]:

# XGBoost 분류기 초기화 및 학습
xgb_classifier = xgb.XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.2,
    max_depth=10,
    n_estimators=100,
    min_child_weight=4,
    gamma=10,
    colsample_bytree=0.75,
    random_state=4
)
xgb_classifier.fit(X_train, y_train)

# 예측 수행
y_pred = xgb_classifier.predict(X_test)

# 모델 평가
accuracy_xgb = accuracy_score(y_test, y_pred)
report_xgb = classification_report(y_test, y_pred)

print(f'XGBoost Accuracy: {accuracy_xgb}')
print('XGBoost Classification Report:')
print(report_xgb)

XGBoost Accuracy: 0.9130913091309131
XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.99      0.95       779
           1       0.93      0.42      0.58       130

    accuracy                           0.91       909
   macro avg       0.92      0.71      0.77       909
weighted avg       0.91      0.91      0.90       909

