In [8]:
import pandas as pd
path = 'data/datasets.csv'
data = pd.read_csv(path)

In [9]:
# 잘 load 되었는지 확인
data.head(5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [10]:
data.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [11]:
drop_cols = ['EmployeeNumber', 'EmployeeCount', 'Over18', 'StandardHours']
data.drop(columns=drop_cols, inplace=True)
data.head(5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,4,Female,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,Male,...,3,4,1,6,3,3,2,2,2,2


In [12]:
# 타겟값 변환:각 범주형 변수 수치 매핑
data['Attrition'] = data['Attrition'].map({'Yes':1,'No':0})

data['BusinessTravel'] = data['BusinessTravel'].map({'Non-Travel':0,'Travel_Frequently':1,'Travel_Rarely':2})
data['Gender'] = data['Gender'].map({'Female': 0,'Male': 1})
data['OverTime'] = data['OverTime'].map({'No': 0,'Yes': 1})

In [13]:
import numpy as np

data['TotalWorkingYears'] = data['TotalWorkingYears'].replace(0, np.nan)
data['HighPerformerQuit'] = ((data['PerformanceRating'] == 4) & 
                             (data['Attrition'] == 1)).astype(int)                # 고성과 이탈자 여부
data['JobHopFrequency'] = data['NumCompaniesWorked'] / data['TotalWorkingYears'] # 연평균 이직 횟수
data['PromotionDelay'] = data['YearsAtCompany'] - data['YearsSinceLastPromotion']# 승진까지 소요 기간
data['OverworkedPoorBalance'] = ((data['OverTime'] == 'Yes') & 
                                 (data['WorkLifeBalance'] <= 2)).astype(int)      # 워라밸 붕괴 여부
data['OverallSatisfaction'] = (
    data['EnvironmentSatisfaction'] + 
    data['JobSatisfaction'] + 
    data['RelationshipSatisfaction']
) / 3                                                                             # 평균 만족도
manager_roles = ['Manager', 'Research Director']
data['Is_Manager'] = ((data['JobRole'].isin(manager_roles)) & 
                      (data['JobLevel'] >= 3)).astype(int)                        # 고위직 여부

In [14]:
numeric_features = data.select_dtypes(include=['int64', 'float64']).drop(columns=['Attrition']).columns.tolist()
categorical_features = data.select_dtypes(include=['object']).columns.tolist()

In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
    ]
)

In [16]:
import numpy as np
params = {
    'classifier__n_estimators': [200,300,400,500],
    'classifier__learning_rate': [1e-2,1e-1,],
    'classifier__max_depth': [1,2,3,4]
}

In [17]:
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline

xgb_clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier())
])

In [18]:
from sklearn.model_selection import train_test_split

X = data.drop(columns='Attrition')
y = data['Attrition']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=13)

In [19]:
from sklearn.model_selection import GridSearchCV

best_acc = 0.0
best_clf = None
best_model_name = ""
best_params = {}
scoring_list = ['accuracy','recall','f1']
scoring = scoring_list[2]
grid_search = GridSearchCV(estimator=xgb_clf, param_grid=params, cv=5, scoring=scoring , verbose=1)
grid_search.fit(X_train, y_train)

# 가장 좋은 성능의 모델과 파라미터 저장
if grid_search.best_score_ > best_acc:
    best_acc = grid_search.best_score_
    best_clf = grid_search.best_estimator_
    best_params = grid_search.best_params_

Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [20]:
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score


# 3. 최적의 모델로 예측 및 평가
y_pred = best_clf.predict(X_test)
y_proba = best_clf.predict_proba(X_test)[:, 1]  # Yes일 확률

# 평가 출력
print(f"Model: xgb_clf with {scoring}")
print(f"Best Hyperparameters: {best_params}")
print(classification_report(y_test, y_pred))
print("==============================")
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print("==============================")
print(f"ROC AUC: {roc_auc_score(y_test, y_proba):.4f}")

Model: xgb_clf with f1
Best Hyperparameters: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 1, 'classifier__n_estimators': 400}
              precision    recall  f1-score   support

           0       0.88      0.97      0.92       247
           1       0.68      0.32      0.43        47

    accuracy                           0.87       294
   macro avg       0.78      0.65      0.68       294
weighted avg       0.85      0.87      0.85       294

Accuracy: 0.8673469387755102
ROC AUC: 0.8370


In [21]:
from sklearn.metrics import classification_report, roc_auc_score

# 확률 예측 (Attrition=1일 확률)
y_proba = best_clf.predict_proba(X_test)[:, 1]

best_t = 0.0
best_t_acc = 0.0 

def evaluate_threshold(y_true, y_proba, threshold=0.5):
    y_pred = (y_proba >= threshold).astype(int)
    return y_pred

for t in range(1000):
    t = float(t)/1000
    y_pred = evaluate_threshold(y_test, y_proba, threshold=t)
    if accuracy_score(y_test,y_pred)>best_t_acc:
        best_t = t
        best_t_acc = accuracy_score(y_test,y_pred)

print(f"scoring:{scoring}")
print(f"🔻 best Threshold = {best_t:.3f}")
print(f"best accuracy:{best_t_acc}")

scoring:f1
🔻 best Threshold = 0.361
best accuracy:0.8877551020408163
