<a href="https://colab.research.google.com/github/Nurochman79/DataScience/blob/main/Pro1_HR_TurnOver_Predictive_RandomForest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#EDISI NGULIK DATA ini bertujuan membuat model prediksi karyawan akan resign atau tidak berdasarkan data-data histori di HR dengan algoritman RANDOM FOREST

#1. Persiapan awal eksplorasi data-instalasi dan loading data

In [1]:
#Istall Library yang dibutuhkan
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
data_hr = pd.read_csv('/content/sample_data/Pro1_HR_TurnOver_Predictive.csv')

#2. Memahai data untuk memastikan data valid dan layak diproses

In [3]:
#Property dataset (Jumlah bari dan kolom)
data_hr.shape
#Hasilnya ada 1499 barsi data dan 10 kolom data

(14999, 10)

In [4]:
#Meta data
data_hr.info()
#Ada data type float, interger, dan object

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   sales                  14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [5]:
#Sample data. Hasil tampilan data yang sangat beragam nilainya antar feature.Data perlu discallling agar nanti hasil modeling proper
data_hr.head(10)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low
5,0.41,0.5,2,153,3,0,1,0,sales,low
6,0.1,0.77,6,247,4,0,1,0,sales,low
7,0.92,0.85,5,259,5,0,1,0,sales,low
8,0.89,1.0,5,224,5,0,1,0,sales,low
9,0.42,0.53,2,142,3,0,1,0,sales,low


In [6]:
#Identifikasi data NA
data_hr.isna().sum()
#Hasilnya tidak ada nilai NA

Unnamed: 0,0
satisfaction_level,0
last_evaluation,0
number_project,0
average_montly_hours,0
time_spend_company,0
Work_accident,0
left,0
promotion_last_5years,0
sales,0
salary,0


In [7]:
#Identifikasi data Null
data_hr.isnull().sum()
#Hasilnya tidak ada nilai Null

Unnamed: 0,0
satisfaction_level,0
last_evaluation,0
number_project,0
average_montly_hours,0
time_spend_company,0
Work_accident,0
left,0
promotion_last_5years,0
sales,0
salary,0


#3. Pimilihan feature terbaik (Tanpa melakukan feature engineering)

In [8]:
X = data_hr.drop('left', axis=1)
y = data_hr['left']


#4. Proses transformasi data kategoryi ke numerical

In [9]:
cat_features = ['sales', 'salary']
num_features = [
    'satisfaction_level',
    'last_evaluation',
    'number_project',
    'average_montly_hours',
    'time_spend_company',
    'Work_accident',
    'promotion_last_5years'
]


In [10]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), cat_features),
        ('num', 'passthrough', num_features)
    ]
)


#5. Split dan train data

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


#6. Modeling prediksi

In [12]:
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    min_samples_leaf=5,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)


In [13]:
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', rf_model)
])


In [14]:
rf_pipeline.fit(X_train, y_train)


#7. Evaluasi model.
Jika dibandingkan dengan menggunakan algoritman logistic regrassion maka “Random Forest" mendeteksi karyawan berisiko resign dengan akurasi tinggi, sedangkan Logistic Regression digunakan untuk menjelaskan faktor utama penyebab turnover. Randon forest cocok karena feature dataset tidak linear dengan target seperti satisfaction rendah + jam kerja tinggi, lama kerja tertentu + tidak pernah promosi

Lama kerja tertentu + tidak pernah promosi

Department tertentu + salary rendah

In [15]:
y_pred = rf_pipeline.predict(X_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Confusion Matrix:
[[2273   13]
 [  49  665]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      2286
           1       0.98      0.93      0.96       714

    accuracy                           0.98      3000
   macro avg       0.98      0.96      0.97      3000
weighted avg       0.98      0.98      0.98      3000

