In [None]:
# import module
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
%matplotlib inline


print('numpy version : ',np.__version__)
print('pandas version : ',pd.__version__)

numpy version :  2.2.6
pandas version :  2.3.3
seaborn version :  0.13.2


# Feature Engineering

In [2]:
df=pd.read_csv("../dataset/employee_churn_prediction_updated.csv")

In [4]:
anomaly_experience = df['experience_years'] > (df['age'] - 18)
anomalies = df[anomaly_experience & df['experience_years'].notna() & df['age'].notna()]
anomalies[['employee_id', 'age', 'experience_years']]

Unnamed: 0,employee_id,age,experience_years
17,18,23,8
20,21,23,7
35,36,24,7
70,71,23,7
90,91,22,9
...,...,...,...
959,960,22,7
968,969,22,8
981,982,22,6
982,983,26,9


In [5]:
df = df[~anomaly_experience].copy()

In [6]:
df_original_sensitive = df[['gender', 'education', 'marital_status', 'work_location', 'age']].copy()

In [7]:
# Transform gender
df['gender_Male'] = (df['gender'] == 'Male').astype(int)
df['gender_Male'].head()

# labeling aducation
edu_ordinal= {'High School': 0, 'Diploma': 1, 'Bachelor': 2}
df['education_ord'] = df['education'].map(edu_ordinal).astype(int)
df['education_ord'].unique()

# transform martial_status
df['is_married'] = (df['marital_status'] == 'Married').astype(int)
df['is_married'].head()

# one-hot work_location
work_location_dummies = pd.get_dummies(df['work_location'], prefix='work_location', dtype=int)
df = pd.concat([df, work_location_dummies], axis=1)
display(df.head())

# Memberi gambaran tentang tekanan kerja per minggu.
df['total_workload'] = (df['working_hours_per_week']) + (df['overtime_hours_per_week'])

# Karyawan yang melebihi target (>1.1) + lembur tinggi (>10 jam).
df['overachiever_high_overtime'] = (df['target_achievement'] > 1.1) & (df['overtime_hours_per_week'] > 10)

# Mengukur efisiensi — bukan hanya hasil, tapi seberapa hemat waktu mencapainya.
df['performance_efficiency'] = df['target_achievement'] / (df['total_workload'] + 1)

# Mengukur “nilai” gaji terhadap ekspektasi kerja — bisa jadi indikator ketidakpuasan.
df['salary_to_target_ratio'] = df['salary'] / (df['monthly_target'] + 1)

# Mengukur apakah gaji sebanding dengan pengalaman — relevan untuk churn karena ketidakadilan.
df['salary_per_exp'] = df['salary'] / (df['experience_years'] + 1)

# Mengukur beban logistik vs beban kerja — semakin tinggi, semakin tidak wajar.
df['long_distance_overwork'] = df['distance_to_office_km'] / (df['working_hours_per_week'] + 1)

# melihat segmen paling rentan churn — prioritas utama untuk intervensi bisnis.
df['high_overtime'] = (df['overtime_hours_per_week'] > df['overtime_hours_per_week'].median()).astype(int)
df['low_satisfaction'] = (df['job_satisfaction'] <= 2).astype(int)
df['high_ot_low_sat'] = df['high_overtime'] & df['low_satisfaction']

q1 = df['age'].quantile(0.33)
q2 = df['age'].quantile(0.66)

# Membagi kelompok usia berdasarkan kuartil
df['age_group'] = pd.cut(df['age'], bins=[22, q1, q2, 44], labels=['Young', 'Middle', 'Senior'])

Unnamed: 0,employee_id,age,gender,education,experience_years,monthly_target,target_achievement,working_hours_per_week,overtime_hours_per_week,salary,...,churn,marital_status,distance_to_office_km,churn_period,gender_Male,education_ord,is_married,work_location_Rural,work_location_Suburban,work_location_Urban
0,1,28,Male,High School,0,153,0.94,62,9,4667108,...,0,Married,22,Stayed,1,0,1,0,1,0
1,2,41,Male,Diploma,6,188,0.54,55,8,5853507,...,1,Single,36,Onboarding,1,1,0,0,0,1
2,3,36,Female,High School,8,159,0.44,59,10,4781336,...,1,Single,17,Onboarding,0,0,0,0,0,1
3,4,32,Male,Diploma,7,185,0.66,68,15,3624588,...,1,Married,32,Onboarding,1,1,1,0,0,1
4,5,29,Female,High School,7,142,1.01,45,9,5154327,...,0,Single,14,Stayed,0,0,0,1,0,0


In [8]:
numerical_cols = ['age', 'experience_years', 'monthly_target', 'target_achievement', 'working_hours_per_week', 'overtime_hours_per_week',
                  'salary', 'commission_rate', 'job_satisfaction', 'manager_support_score', 'company_tenure_years', 'distance_to_office_km',
                  'total_workload','overachiever_high_overtime','performance_efficiency','salary_to_target_ratio','salary_per_exp',
                  'long_distance_overwork','high_overtime','low_satisfaction','high_ot_low_sat']
categorical_cols = ['is_married','work_location_Rural','education_ord','work_location_Urban','work_location_Suburban','gender_Male']
target= 'churn'

X_features = numerical_cols + categorical_cols
y_features = target

from sklearn.model_selection import train_test_split

X = df[X_features]
y = df[y_features]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y) # <- bisa disesuaikan masing-masing

In [9]:
selected_features=[
 'performance_efficiency',
 'low_satisfaction',
 'target_achievement',
 'high_ot_low_sat',
 'working_hours_per_week',
 'distance_to_office_km',
 'total_workload',
 'job_satisfaction',
 'long_distance_overwork',
 'company_tenure_years',
 'manager_support_score',
 'is_married'
]
target_col='churn'

In [None]:
import sys
print(sys.executable)

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer, fbeta_score

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# F2: recall lebih berat
f2_scorer = make_scorer(
    fbeta_score,
    beta=2,
    pos_label=1
)

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import make_scorer

from sklearn.ensemble import GradientBoostingClassifier

from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

c:\Data Ridho\Final Project\folder_github\my_env\Scripts\python.exe
CatBoost & XGBoost OK


# Training Model

In [27]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.over_sampling import SMOTE
import joblib

# Persiapan data
X = df[selected_features].copy()
y = df[target_col].copy()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Terapkan SMOTE hanya pada data latih
smote = SMOTE(random_state=42, sampling_strategy=0.6)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

# Hyperparameter tuning
param_grid = {
    "n_estimators": [100, 200, 300],
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [2, 3],
    "subsample": [0.7, 0.9, 1.0]
}

gb = GradientBoostingClassifier(random_state=42)
gs = GridSearchCV(gb, param_grid, cv=5, scoring='recall', n_jobs=-1)
gs.fit(X_train_bal, y_train_bal)

0,1,2
,estimator,GradientBoost...ndom_state=42)
,param_grid,"{'learning_rate': [0.01, 0.05, ...], 'max_depth': [2, 3], 'n_estimators': [100, 200, ...], 'subsample': [0.7, 0.9, ...]}"
,scoring,'recall'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,loss,'log_loss'
,learning_rate,0.01
,n_estimators,100
,subsample,0.7
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,2
,min_impurity_decrease,0.0


# Import Model

In [None]:
import joblib

# Simpan model
joblib.dump(best_model, "best_model.pkl")

# Simpan juga median overtime untuk feature engineering
feature_engineering = {
    "overtime_median": df["overtime_hours_per_week"].median()
}

joblib.dump(feature_engineering, "feature_engineering.pkl")