In [3]:
# 라이브러리 불러오기
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE

# Load the employee data set.
# on_bad_lines='skip' (pandas >= 1.3) drops malformed rows without raising;
# engine='python' selects the more tolerant parser.
df = pd.read_csv("employee_new3.csv",
                 encoding='utf-8',
                 on_bad_lines='skip',
                 engine='python')

# Drop rows whose Gender is 'Other', then every row with a missing value.
df = df[df['Gender'] != 'Other']
df = df.dropna()

# Remove columns that are not used as features.
df = df.drop(columns=['Employee_ID', 'Hire_Date'])

# Label-encode every object-dtype column in place.
# Each fitted encoder is kept in `encoders` (keyed by column name) so the
# integer codes can be mapped back with inverse_transform() and the same
# mapping can be applied to new data. Without this the encoder is overwritten
# on every iteration and the category <-> code mapping is lost.
cat_cols = df.select_dtypes(include='object').columns
encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

# Split into features (X) and the binary target (y).
X = df.drop('Resigned', axis=1)
y = df['Resigned'].astype(int)

# Hold out 20% for testing; stratify=y keeps the class ratio in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Oversample the minority class with SMOTE on the training split only, so the
# test split keeps its original class distribution.
# NOTE(review): SMOTE synthesises samples by interpolating between neighbours,
# so the label-encoded categorical columns may receive non-integer codes.
# Consider SMOTENC if categorical features matter here -- TODO confirm.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Train a random forest (n_estimators=50 keeps training fast).
model = RandomForestClassifier(n_estimators=50, random_state=42)
model.fit(X_train_res, y_train_res)

# Predict on the untouched test split and print the evaluation metrics.
y_pred = model.predict(X_test)
print("✅ 정확도:", accuracy_score(y_test, y_pred))
print("✅ 정밀도:", precision_score(y_test, y_pred))
print("✅ 재현율:", recall_score(y_test, y_pred))
print("✅ F1 점수:", f1_score(y_test, y_pred))
print("\n📊 상세 분류 리포트:\n", classification_report(y_test, y_pred, target_names=["재직중", "퇴사"]))


✅ 정확도: 0.8569792263237361
✅ 정밀도: 0.3513024602026049
✅ 재현율: 0.5044155844155844
✅ F1 점수: 0.41416080187673276

📊 상세 분류 리포트:
               precision    recall  f1-score   support

         재직중       0.94      0.90      0.92     17282
          퇴사       0.35      0.50      0.41      1925

    accuracy                           0.86     19207
   macro avg       0.65      0.70      0.67     19207
weighted avg       0.88      0.86      0.87     19207

