# 주제선정: ABC 다국적 은행의 가입고객 이탈예측

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.font_manager as fm
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# font_path = '/Library/Fonts/Arial Unicode.ttf'      # mac 폰트 추가
font_path = 'C:\\Windows\\Fonts\\malgun.TTF'          # 한글 폰트를 적용하여 시각화
font = fm.FontProperties(fname=font_path).get_name()
matplotlib.rc('font', family=font)

# warnings 무시하기
import warnings
warnings.filterwarnings('ignore')

# 새 csv 파일을 호출
df = pd.read_csv("./data/Transformed_Bank_Customer_Churn_Prediction.csv")
df

Unnamed: 0,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,619,0,1,42,2,0.00,1,1,1,101348.88,1
1,608,2,1,41,1,83807.86,1,0,1,112542.58,0
2,502,0,1,42,8,159660.80,3,1,0,113931.57,1
3,699,0,1,39,1,0.00,2,0,0,93826.63,0
4,850,2,1,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,0,0,39,5,0.00,2,1,0,96270.64,0
9996,516,0,0,35,10,57369.61,1,1,1,101699.77,0
9997,709,0,1,36,7,0.00,1,0,1,42085.58,1
9998,772,1,0,42,3,75075.31,2,1,0,92888.52,1


---

- SVC

In [2]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 데이터 준비
X = df.drop('churn', axis=1)
y = df['churn']

# 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standard 정규화
standard_x = StandardScaler()
X_train_standard = standard_x.fit_transform(X_train)
X_test_standard = standard_x.transform(X_test)

# MinMax 정규화
minmax_x = MinMaxScaler()
X_train_minmax = minmax_x.fit_transform(X_train)
X_test_minmax = minmax_x.transform(X_test)

# 모델 학습 및 예측
svr_rbf = SVC(kernel='rbf', C=1.0)
svr_poly = SVC(kernel='poly', C=1.0)

In [3]:
# 원본 데이터

# 학습
svr_rbf.fit(X_train, y_train)
svr_poly.fit(X_train, y_train)

# 예측
rbf_pred = svr_rbf.predict(X_test)
poly_pred = svr_poly.predict(X_test)

# 정확도 (Accuracy)
rbf_accuracy = accuracy_score(y_test, rbf_pred)
poly_accuracy = accuracy_score(y_test, poly_pred)

# 정밀도 (Precision)
rbf_precision = precision_score(y_test, rbf_pred)
poly_precision = precision_score(y_test, poly_pred)

# 재현율 (Recall)
rbf_recall = recall_score(y_test, rbf_pred)
poly_recall = recall_score(y_test, poly_pred)

# F1 Score
rbf_f1 = f1_score(y_test, rbf_pred)
poly_f1 = f1_score(y_test, poly_pred)

# 모델 평가
print(f'정확도: {rbf_accuracy * 100:.4f} %')
print(f"정밀도: {rbf_precision * 100:.4f} %")
print(f'재현율: {rbf_recall * 100:.4f} %')
print(f"F1 Score Rate: {rbf_f1 * 100:.4f} %")

print(f'정확도: {poly_accuracy * 100:.4f} %')
print(f"정밀도: {poly_precision * 100:.4f} %")
print(f'재현율: {poly_recall * 100:.4f} %')
print(f"F1 Score Rate: {poly_f1 * 100:.4f} %")

정확도: 80.5333 %
정밀도: 0.0000 %
재현율: 0.0000 %
F1 Score Rate: 0.0000 %
정확도: 80.5333 %
정밀도: 0.0000 %
재현율: 0.0000 %
F1 Score Rate: 0.0000 %


In [4]:
# Standard 정규화

# 학습
svr_rbf.fit(X_train_standard, y_train)
svr_poly.fit(X_train_standard, y_train)

# 예측
rbf_pred = svr_rbf.predict(X_test_standard)
poly_pred = svr_poly.predict(X_test_standard)

# 정확도 (Accuracy)
rbf_accuracy = accuracy_score(y_test, rbf_pred)
poly_accuracy = accuracy_score(y_test, poly_pred)

# 정밀도 (Precision)
rbf_precision = precision_score(y_test, rbf_pred)
poly_precision = precision_score(y_test, poly_pred)

# 재현율 (Recall)
rbf_recall = recall_score(y_test, rbf_pred)
poly_recall = recall_score(y_test, poly_pred)

# F1 Score
rbf_f1 = f1_score(y_test, rbf_pred)
poly_f1 = f1_score(y_test, poly_pred)

# 모델 평가
print(f'정확도: {rbf_accuracy * 100:.4f} %')
print(f"정밀도: {rbf_precision * 100:.4f} %")
print(f'재현율: {rbf_recall * 100:.4f} %')
print(f"F1 Score Rate: {rbf_f1 * 100:.4f} %")

print(f'정확도: {poly_accuracy * 100:.4f} %')
print(f"정밀도: {poly_precision * 100:.4f} %")
print(f'재현율: {poly_recall * 100:.4f} %')
print(f"F1 Score Rate: {poly_f1 * 100:.4f} %")

정확도: 86.3000 %
정밀도: 83.1418 %
재현율: 37.1575 %
F1 Score Rate: 51.3609 %
정확도: 85.8667 %
정밀도: 87.3832 %
재현율: 32.0205 %
F1 Score Rate: 46.8672 %


In [5]:
# MinMax 정규화

# 학습
svr_rbf.fit(X_train_minmax, y_train)
svr_poly.fit(X_train_minmax, y_train)

# 예측
rbf_pred = svr_rbf.predict(X_test_minmax)
poly_pred = svr_poly.predict(X_test_minmax)

# 정확도 (Accuracy)
rbf_accuracy = accuracy_score(y_test, rbf_pred)
poly_accuracy = accuracy_score(y_test, poly_pred)

# 정밀도 (Precision)
rbf_precision = precision_score(y_test, rbf_pred)
poly_precision = precision_score(y_test, poly_pred)

# 재현율 (Recall)
rbf_recall = recall_score(y_test, rbf_pred)
poly_recall = recall_score(y_test, poly_pred)

# F1 Score
rbf_f1 = f1_score(y_test, rbf_pred)
poly_f1 = f1_score(y_test, poly_pred)

# 모델 평가
print(f'정확도: {rbf_accuracy * 100:.4f} %')
print(f"정밀도: {rbf_precision * 100:.4f} %")
print(f'재현율: {rbf_recall * 100:.4f} %')
print(f"F1 Score Rate: {rbf_f1 * 100:.4f} %")

print(f'정확도: {poly_accuracy * 100:.4f} %')
print(f"정밀도: {poly_precision * 100:.4f} %")
print(f'재현율: {poly_recall * 100:.4f} %')
print(f"F1 Score Rate: {poly_f1 * 100:.4f} %")

정확도: 85.7000 %
정밀도: 86.7299 %
재현율: 31.3356 %
F1 Score Rate: 46.0377 %
정확도: 85.8000 %
정밀도: 85.5856 %
재현율: 32.5342 %
F1 Score Rate: 47.1464 %


- Naive Bayes

In [6]:
from sklearn.naive_bayes import GaussianNB

# 모델 학습 및 예측
gnb = GaussianNB()

In [7]:
# 원본 데이터

# 학습
gnb.fit(X_train, y_train)

# 예측
gnb_pred = gnb.predict(X_test)

# 정확도 (Accuracy)
gnb_accuracy = accuracy_score(y_test, gnb_pred)

# 정밀도 (Precision)
gnb_precision = precision_score(y_test, gnb_pred)

# 재현율 (Recall)
gnb_recall = recall_score(y_test, gnb_pred)

# F1 Score
gnb_f1 = f1_score(y_test, gnb_pred)

# 모델 평가
print(f'정확도: {gnb_accuracy * 100:.4f} %')
print(f"정밀도: {gnb_precision * 100:.4f} %")
print(f'재현율: {gnb_recall * 100:.4f} %')
print(f"F1 Score Rate: {gnb_f1 * 100:.4f} %")

정확도: 79.2000 %
정밀도: 34.3750 %
재현율: 7.5342 %
F1 Score Rate: 12.3596 %


In [8]:
# Standard 정규화

# 학습
gnb.fit(X_train_standard, y_train)

# 예측
gnb_pred = gnb.predict(X_test_standard)

# 정확도 (Accuracy)
gnb_accuracy = accuracy_score(y_test, gnb_pred)

# 정밀도 (Precision)
gnb_precision = precision_score(y_test, gnb_pred)

# 재현율 (Recall)
gnb_recall = recall_score(y_test, gnb_pred)

# F1 Score
gnb_f1 = f1_score(y_test, gnb_pred)

# 모델 평가
print(f'정확도: {gnb_accuracy * 100:.4f} %')
print(f"정밀도: {gnb_precision * 100:.4f} %")
print(f'재현율: {gnb_recall * 100:.4f} %')
print(f"F1 Score Rate: {gnb_f1 * 100:.4f} %")

정확도: 83.4333 %
정밀도: 71.0145 %
재현율: 25.1712 %
F1 Score Rate: 37.1681 %


In [9]:
# MinMax 정규화

# 학습
gnb.fit(X_train_minmax, y_train)

# 예측
gnb_pred = gnb.predict(X_test_minmax)

# 정확도 (Accuracy)
gnb_accuracy = accuracy_score(y_test, gnb_pred)

# 정밀도 (Precision)
gnb_precision = precision_score(y_test, gnb_pred)

# 재현율 (Recall)
gnb_recall = recall_score(y_test, gnb_pred)

# F1 Score
gnb_f1 = f1_score(y_test, gnb_pred)

# 모델 평가
print(f'정확도: {gnb_accuracy * 100:.4f} %')
print(f"정밀도: {gnb_precision * 100:.4f} %")
print(f'재현율: {gnb_recall * 100:.4f} %')
print(f"F1 Score Rate: {gnb_f1 * 100:.4f} %")

정확도: 83.4333 %
정밀도: 71.0145 %
재현율: 25.1712 %
F1 Score Rate: 37.1681 %


- AdaBoost

In [10]:
from sklearn.ensemble import AdaBoostClassifier

# 모델 학습 및 예측
abc = AdaBoostClassifier()

In [11]:
# 원본 데이터

# 학습
abc.fit(X_train, y_train)

# 예측
abc_pred = abc.predict(X_test)

# 정확도 (Accuracy)
abc_accuracy = accuracy_score(y_test, abc_pred)

# 정밀도 (Precision)
abc_precision = precision_score(y_test, abc_pred)

# 재현율 (Recall)
abc_recall = recall_score(y_test, abc_pred)

# F1 Score
abc_f1 = f1_score(y_test, abc_pred)

# 모델 평가
print(f'정확도: {abc_accuracy * 100:.4f} %')
print(f"정밀도: {abc_precision * 100:.4f} %")
print(f'재현율: {abc_recall * 100:.4f} %')
print(f"F1 Score Rate: {abc_f1 * 100:.4f} %")

정확도: 86.0667 %
정밀도: 73.0556 %
재현율: 45.0342 %
F1 Score Rate: 55.7203 %


In [12]:
# Standard 정규화

# 학습
abc.fit(X_train_standard, y_train)

# 예측
abc_pred = abc.predict(X_test_standard)

# 정확도 (Accuracy)
abc_accuracy = accuracy_score(y_test, abc_pred)

# 정밀도 (Precision)
abc_precision = precision_score(y_test, abc_pred)

# 재현율 (Recall)
abc_recall = recall_score(y_test, abc_pred)

# F1 Score
abc_f1 = f1_score(y_test, abc_pred)

# 모델 평가
print(f'정확도: {abc_accuracy * 100:.4f} %')
print(f"정밀도: {abc_precision * 100:.4f} %")
print(f'재현율: {abc_recall * 100:.4f} %')
print(f"F1 Score Rate: {abc_f1 * 100:.4f} %")

정확도: 86.0667 %
정밀도: 73.0556 %
재현율: 45.0342 %
F1 Score Rate: 55.7203 %


In [13]:
# MinMax 정규화

# 학습
abc.fit(X_train_minmax, y_train)

# 예측
abc_pred = abc.predict(X_test_minmax)

# 정확도 (Accuracy)
abc_accuracy = accuracy_score(y_test, abc_pred)

# 정밀도 (Precision)
abc_precision = precision_score(y_test, abc_pred)

# 재현율 (Recall)
abc_recall = recall_score(y_test, abc_pred)

# F1 Score
abc_f1 = f1_score(y_test, abc_pred)

# 모델 평가
print(f'정확도: {abc_accuracy * 100:.4f} %')
print(f"정밀도: {abc_precision * 100:.4f} %")
print(f'재현율: {abc_recall * 100:.4f} %')
print(f"F1 Score Rate: {abc_f1 * 100:.4f} %")

정확도: 86.0667 %
정밀도: 73.0556 %
재현율: 45.0342 %
F1 Score Rate: 55.7203 %


- SVR

In [14]:
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, root_mean_squared_error, mean_squared_log_error, root_mean_squared_log_error, r2_score

# 데이터 준비
X = df.drop('churn', axis=1)
y = df['churn']

# 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 모델 학습 및 예측
svr_rbf = SVR(kernel='rbf', C=1.0, epsilon=0.1)
svr_poly = SVR(kernel='poly', C=1.0, epsilon=0.1)

# 학습
svr_rbf.fit(X_train, y_train)
svr_poly.fit(X_train, y_train)

# 예측
rbf_pred = svr_rbf.predict(X_test)
poly_pred = svr_poly.predict(X_test)

# 회귀 평가 지표 계산
rbf_mse = mean_squared_error(y_test, rbf_pred)
rbf_rmse = root_mean_squared_error(y_test, rbf_pred)
rbf_msle = mean_squared_log_error(y_test, rbf_pred)
rbf_rmsle = root_mean_squared_log_error(y_test, rbf_pred)
rbf_r2 = r2_score(y_test, rbf_pred)

poly_mse = mean_squared_error(y_test, poly_pred)
poly_rmse = root_mean_squared_error(y_test, poly_pred)
poly_msle = mean_squared_log_error(y_test, poly_pred)
poly_rmsle = root_mean_squared_log_error(y_test, poly_pred)
poly_r2 = r2_score(y_test, poly_pred)

print("RBF SVR:")
print(f"MSE: {rbf_mse}")
print(f"RMSE: {rbf_rmse}")
print(f"MSLE: {rbf_mse}")
print(f"RMSLE: {rbf_rmse}")
print(f"R2 Score: {rbf_r2}")

print("\nPolynomial SVR:")
print(f"MSE: {poly_mse}")
print(f"RMSE: {poly_rmse}")
print(f"MSLE: {poly_mse}")
print(f"RMSLE: {poly_rmse}")
print(f"R2 Score: {poly_r2}")

RBF SVR:
MSE: 0.16572333406883208
RMSE: 0.40709130925239867
MSLE: 0.16572333406883208
RMSLE: 0.40709130925239867
R2 Score: -0.05710078261042839

Polynomial SVR:
MSE: 0.16573336904810623
RMSE: 0.4071036342850629
MSLE: 0.16573336904810623
RMSLE: 0.4071036342850629
R2 Score: -0.057164792814566656
