In [43]:
import pandas as pd
# Location of the raw dataset, relative to the notebook's working directory.
path = 'data/datasets.csv'

# Read the CSV into the DataFrame every later cell works on.
data = pd.read_csv(filepath_or_buffer=path)

In [44]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [45]:
# Columns removed before modelling:
#   - 'Unnamed: 0' looks like a saved row index (0, 1, 2, ... in the preview);
#     left in place it would be picked up as a numeric feature.
#   - 'EmployeeNumber' is a per-employee identifier.
#   - 'EmployeeCount' and 'StandardHours' show a single value (1 and 80) in the
#     preview; 'Over18' is presumably constant as well -- TODO confirm.
drop_cols = ['Unnamed: 0', 'EmployeeNumber', 'EmployeeCount', 'Over18', 'StandardHours']

# Rebind instead of mutating in place, and ignore columns that are already gone,
# so re-running this cell does not raise KeyError.
data = data.drop(columns=drop_cols, errors='ignore')
data.head(5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,4,Female,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,Male,...,3,4,1,6,3,3,2,2,2,2


In [46]:
# Target conversion: map selected categorical columns to integer codes.
# NOTE(review): the BusinessTravel codes put Travel_Frequently (1) between
# Non-Travel (0) and Travel_Rarely (2); confirm this ordering is intended,
# since the column is later scaled as a numeric feature.
category_codes = {
    'Attrition': {'Yes': 1, 'No': 0},
    'BusinessTravel': {'Non-Travel': 0, 'Travel_Frequently': 1, 'Travel_Rarely': 2},
    'Gender': {'Female': 0, 'Male': 1},
}

for column, codes in category_codes.items():
    # Skip columns that are already numeric: mapping the encoded values a
    # second time (e.g. when the cell is re-run) would turn them all into NaN.
    if not pd.api.types.is_numeric_dtype(data[column]):
        data[column] = data[column].map(codes)

In [47]:
# Numeric predictors: every int64/float64 column except the target.
numeric_frame = data.select_dtypes(include=['int64', 'float64'])
numeric_features = numeric_frame.drop(columns=['Attrition']).columns.tolist()

# Categorical predictors: the columns still stored as Python objects (strings).
object_frame = data.select_dtypes(include=['object'])
categorical_features = object_frame.columns.tolist()

In [48]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Standardise the numeric columns and one-hot encode the categorical ones.
# sparse_threshold=0 makes the combined output always dense: OneHotEncoder
# produces sparse output by default, and the GaussianNB step that consumes
# this preprocessor does not accept sparse input.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
    ],
    sparse_threshold=0
)

In [49]:
import numpy as np
# Hyperparameter grid for the pipeline's 'classifier' step:
# ten var_smoothing values spaced logarithmically from 1e-12 to 1e-6.
var_smoothing_grid = np.logspace(start=-12, stop=-6, num=10)
params = dict(classifier__var_smoothing=var_smoothing_grid)

In [50]:
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline

# Preprocessing followed by Gaussian Naive Bayes. The step names matter:
# 'classifier' is the prefix used by the 'classifier__var_smoothing' grid key.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', GaussianNB()),
]
nb_clf = Pipeline(pipeline_steps)

In [51]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the 'Attrition' target.
X = data.drop(columns=['Attrition'])
y = data.loc[:, 'Attrition']

# Hold out 20% of the rows for testing; stratify on the target so both splits
# keep the same class proportions, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=13,
    stratify=y,
)

In [61]:
from sklearn.model_selection import GridSearchCV

# Bookkeeping for the best model found so far (score, fitted estimator,
# display name and winning hyperparameters).
best_acc = 0.0
best_clf = None
best_model_name = ""
best_params = {}

# 5-fold cross-validated search over `params`, selecting on accuracy.
# NOTE(review): selection is by accuracy only; if the classes are imbalanced,
# consider a metric such as ROC AUC or F1 -- confirm against the class balance.
grid_search = GridSearchCV(estimator=nb_clf, param_grid=params, cv=5, scoring='accuracy', verbose=1)
grid_search.fit(X_train, y_train)

# Save the best-performing model and its parameters
if grid_search.best_score_ > best_acc:
    best_acc = grid_search.best_score_
    best_clf = grid_search.best_estimator_
    # Record the name as well, so all four bookkeeping variables stay in sync.
    best_model_name = "nb_clf"
    best_params = grid_search.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [62]:
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score


# 3. Predict and evaluate with the best model
y_pred = best_clf.predict(X_test)
# Second column of predict_proba: probability of "Yes"
y_proba = best_clf.predict_proba(X_test)[:, 1]

# Print the evaluation
print("Model: nb_clf")
print(f"Best Hyperparameters: {best_params}")

report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)
print("==============================")

test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {test_accuracy}")
print("==============================")

test_roc_auc = roc_auc_score(y_true=y_test, y_score=y_proba)
print(f"ROC AUC: {test_roc_auc:.4f}")

Model: nb_clf
Best Hyperparameters: {'classifier__var_smoothing': np.float64(1e-12)}
              precision    recall  f1-score   support

           0       0.92      0.65      0.76       247
           1       0.28      0.72      0.40        47

    accuracy                           0.66       294
   macro avg       0.60      0.69      0.58       294
weighted avg       0.82      0.66      0.70       294

Accuracy: 0.6598639455782312
ROC AUC: 0.7424


In [None]:
from sklearn.metrics import classification_report, roc_auc_score

# Predicted probability (probability that Attrition=1) on the test split
y_proba = best_clf.predict_proba(X_test)[:, 1]


def evaluate_threshold(y_true, y_proba, threshold=0.5):
    """Print the accuracy obtained at a given decision threshold.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels the predictions are scored against.
    y_proba : array-like
        Predicted probabilities; entries >= `threshold` are labelled 1,
        all others 0.
    threshold : float, default 0.5
        Probability cut-off for predicting class 1.

    Returns
    -------
    None
        The threshold and the accuracy are printed, not returned.
    """
    y_pred = (y_proba >= threshold).astype(int)
    print(f"🔻 Threshold = {threshold:.3f}")
    # Score against the labels passed in, not the global y_test.
    print(f"accuracy:{accuracy_score(y_true, y_pred)}")


# NOTE(review): the thresholds are compared on the test split, so the best
# accuracy printed here is an optimistic estimate; consider choosing the
# threshold on a validation split or via cross-validation instead.
for t in [0.97, 0.975, 0.98, 0.985, 0.99]:
    evaluate_threshold(y_test, y_proba, threshold=t)



🔻 Threshold = 0.970
accuracy:0.8469387755102041
🔻 Threshold = 0.975
accuracy:0.8503401360544217
🔻 Threshold = 0.980
accuracy:0.8537414965986394
🔻 Threshold = 0.985
accuracy:0.8469387755102041
🔻 Threshold = 0.990
accuracy:0.8367346938775511
