In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.metrics import f1_score, roc_auc_score, r2_score, mean_squared_error

In [17]:
df = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")

In [18]:
le = LabelEncoder()
for col in df.select_dtypes(include="object"):
    df[col] = le.fit_transform(df[col])

In [19]:
df = df.drop(columns=["EmployeeNumber", "StandardHours", "Over18", "EmployeeCount"])

In [20]:
X_class = df.drop(columns=["Attrition"])
y_class = df["Attrition"]

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X_class, y_class, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [22]:
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_scaled, y_train)
y_pred = logreg.predict(X_test_scaled)
y_proba = logreg.predict_proba(X_test_scaled)[:, 1]

print("LogReg F1 Score:", f1_score(y_test, y_pred))
print("LogReg AUC-ROC:", roc_auc_score(y_test, y_proba))

LogReg F1 Score: 0.4745762711864407
LogReg AUC-ROC: 0.7712418300653595


In [23]:
df["Increment"] = df["PerformanceRating"].apply(lambda x: 1.10 if x == 4 else 1.05)
df["FutureSalary"] = df["MonthlyIncome"] * df["Increment"]

In [35]:
stay_proba = 1 - logreg.predict_proba(X_class)[:, 1]
df["StayProb"] = stay_proba
df["LikelyToStay"] = df["StayProb"] > 0.6

In [25]:
df_reg = df[df["LikelyToStay"] == True]
X_reg = df_reg.drop(columns=["Attrition", "Increment", "FutureSalary", "StayProb", "LikelyToStay"])
y_reg = df_reg["FutureSalary"]

X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)


In [26]:
regressor = RandomForestRegressor(random_state=42)
regressor.fit(X_train_r, y_train_r)
y_pred_r = regressor.predict(X_test_r)

print("R² Score:", r2_score(y_test_r, y_pred_r))
print("RMSE:", np.sqrt(mean_squared_error(y_test_r, y_pred_r)))

R² Score: 0.9991852902781683
RMSE: 154.48533501181294


In [34]:
df["ExpectedLoss"] = logreg.predict_proba(X_class)[:, 1] * df["FutureSalary"]
total_loss = df["ExpectedLoss"].sum()

print("Estimated Total Financial Loss due to Attrition: ₹", round(total_loss, 2))

Estimated Total Financial Loss due to Attrition: ₹ 1242813.01
