The above code uses the Random Forest Classifier to predict employee turnover, which is a binary classification problem. You can also try different classifiers such as Logistic Regression or Decision Tree. The code uses the train_test_split function to split the data into training and testing sets. The model is trained on the training data, and its predictions are then compared with the actual values in the test data to calculate the accuracy. You can also use other evaluation metrics such as f1_score, precision, recall, and ROC AUC. It also prints the confusion matrix, which is a useful evaluation tool for binary classification problems.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score

# Load the data
data = pd.read_csv('HRDataset_v14.csv')

# Count missing values per column BEFORE cleaning: once dropna() has run the
# count is zero for every column by construction, so it carries no information.
# The result is printed explicitly because a bare expression in the middle of
# a cell (or anywhere in a script) is evaluated and then discarded.
print(data.isna().sum())

# Data Cleaning: drop every row with at least one missing value, then drop
# exact duplicate rows.
# NOTE(review): dropna() with default arguments removes a row if ANY column
# is missing -- check the counts printed above to confirm this does not
# discard most of the dataset.
data = data.dropna()
data = data.drop_duplicates()

# EDA on the cleaned data. describe() returns a DataFrame, so it has to be
# printed; info() writes its report to stdout itself.
print(data.describe())
data.info()

# Define the target variable (employee turnover)
target = data['status']

# Define the features (all columns except the target variable)
# NOTE(review): scikit-learn estimators expect numeric input; if any of these
# columns hold strings or dates they must be encoded or dropped before
# fitting -- confirm against the dataset.
features = data.drop(columns=['status'])

# Split the data into training (80%) and testing (20%) sets.
# random_state is fixed so the split -- and therefore every metric computed
# from it -- is reproducible from run to run.
# TODO(review): consider stratify=target if the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Initialize the models.
# The random forest and the decision tree both use randomness while fitting,
# so their seeds are fixed to make the reported metrics reproducible.
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
# The default iteration cap (max_iter=100) is often too low for the solver to
# converge on unscaled features; a higher cap avoids stopping early.
model_lr = LogisticRegression(max_iter=1000)
model_dt = DecisionTreeClassifier(random_state=42)

# Train each model on the same training data
model_rf.fit(X_train, y_train)
model_lr.fit(X_train, y_train)
model_dt.fit(X_train, y_train)

# Predict the held-out test set with each fitted model, in the order
# random forest, logistic regression, decision tree.
y_pred_rf, y_pred_lr, y_pred_dt = (
    fitted.predict(X_test) for fitted in (model_rf, model_lr, model_dt)
)

# Accuracy: fraction of test samples whose predicted label matches y_test.
accuracy_rf, accuracy_lr, accuracy_dt = (
    accuracy_score(y_test, predicted)
    for predicted in (y_pred_rf, y_pred_lr, y_pred_dt)
)

# Report one line per model.
for _label, _score in (
    ("Accuracy of Random Forest: ", accuracy_rf),
    ("Accuracy of Logistic Regression: ", accuracy_lr),
    ("Accuracy of Decision Tree: ", accuracy_dt),
):
    print(_label, _score)

# Additional evaluation metric: F1 score of each model on the test set.
# NOTE(review): f1_score is called with its default arguments, which assume a
# binary target whose positive class is labelled 1 -- confirm that matches
# the encoding of the target column.
f1_rf, f1_lr, f1_dt = (
    f1_score(y_test, predicted)
    for predicted in (y_pred_rf, y_pred_lr, y_pred_dt)
)

# Report one line per model.
for _label, _score in (
    ("F1 score of Random Forest: ", f1_rf),
    ("F1 score of Logistic Regression: ", f1_lr),
    ("F1 score of Decision Tree: ", f1_dt),
):
    print(_label, _score)

# Precision of the random forest and logistic regression predictions on the
# test set, computed with precision_score's default arguments.
# NOTE(review): as with f1_score, the defaults presumably require a binary
# target with positive label 1 -- confirm against the target encoding.
precision_rf = precision_score(y_test, y_pred_rf)
precision_lr = precision_score(y_test, y_pred_lr)
precision
