In [None]:
import numpy as np
import pandas as pd
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from joblib import dump
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Train a random-forest classifier that predicts employee resignation and
# persist both the model and the scaler used for `length_of_service`.

# Load data and keep one row per employee (the first occurrence wins).
df = pd.read_csv("employee_attrition.csv")
df = df.drop_duplicates(subset=["EmployeeID"])

# Binary target: 1 when the termination reason is a resignation, else 0.
# NOTE(review): "Resignaton" is compared verbatim against the CSV values;
# confirm this spelling matches the data before "correcting" it.
df["status_resign"] = np.where(df["termreason_desc"] == "Resignaton", 1, 0)

# Drop columns that are not needed. `termreason_desc` must go because the
# target is derived directly from it.
df = df.set_index("EmployeeID").drop(columns=[
    "recorddate_key", "birthdate_key", "orighiredate_key", "STATUS", "STATUS_YEAR",
    "gender_short", "termtype_desc", "store_name", "termreason_desc", "terminationdate_key"
])

# Features & target
X = df.drop(columns=["status_resign"])
y = df["status_resign"]

# Minimal feature engineering: hand-crafted risk flags derived from the raw
# categorical columns and from age.
X["dept_risk"] = X["department_name"].isin(["Customer Service", "Produce", "Bakery"]).astype(int)
X["city_risk"] = np.where(X["city_name"] == "Valemont", 2, 1)
X["job_risk"] = np.where(X["job_title"] == "VP Stores", 2, 1)
# NOTE(review): the lower bound of 8 looks like it may be a typo for 18 — confirm.
X["usia_risk"] = ((X["age"] >= 8) & (X["age"] <= 26)).astype(int)

# Drop the raw categorical columns now that the risk flags replace them.
X = X.drop(columns=["department_name", "city_name", "job_title"])

# One-hot encode the remaining categorical columns.
X = pd.get_dummies(X, columns=["BUSINESS_UNIT", "gender_full"])

# Train-test split. Stratify on the target so the minority class keeps the
# same share in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Work on explicit copies so the column assignments below never write into
# a view of `X`.
X_train = X_train.copy()
X_test = X_test.copy()

# Scaling: fit on the training split only, then apply the same transform to
# the test split, so no test-set statistics leak into preprocessing.
scaler = StandardScaler()
X_train["length_of_service"] = scaler.fit_transform(X_train[["length_of_service"]]).ravel()
X_test["length_of_service"] = scaler.transform(X_test[["length_of_service"]]).ravel()

# SMOTE + Tomek links: rebalance the training data only; the test split
# keeps its natural class distribution.
smt = SMOTETomek(random_state=42)
X_train, y_train = smt.fit_resample(X_train, y_train)

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Save the model together with the fitted scaler so that inference can apply
# the same `length_of_service` transform.
dump(model, "model_rf.joblib")
dump(scaler, "scaler.joblib")

# Optional: evaluation on the untouched test split.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1257
           1       0.92      0.90      0.91        40

    accuracy                           0.99      1297
   macro avg       0.96      0.95      0.95      1297
weighted avg       0.99      0.99      0.99      1297

