In [6]:
# Import Libraries 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import pickle




In [7]:
# Load processed data
df = pd.read_csv(r'C:\Users\USER\My notebook\DataSciencePro\employee-attrition-prediction\data\processed\processed_attrition_data.csv')

In [8]:
# Define features and target
X = df.drop('Attrition', axis=1)
y = df['Attrition']

In [9]:
# Verify the data
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nTarget distribution:\n{y.value_counts()}")

Features shape: (1470, 19)
Target shape: (1470,)

Target distribution:
Attrition
0    1233
1     237
Name: count, dtype: int64


### All informations are intact as processed in the preprocessing.py script


In [10]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.8, random_state=42)

In [11]:
# Verify split sizes
print(f"Training set size: {X_train.shape[0]} samples ({X_train.shape[0]/len(X)*100:.1f}%)")
print(f"Test set size: {X_test.shape[0]} samples ({X_test.shape[0]/len(X)*100:.1f}%)")
print(f"\nTraining set class distribution:\n{y_train.value_counts()}")
print(f"\nTest set class distribution:\n{y_test.value_counts()}")

Training set size: 1176 samples (80.0%)
Test set size: 294 samples (20.0%)

Training set class distribution:
Attrition
0    980
1    196
Name: count, dtype: int64

Test set class distribution:
Attrition
0    253
1     41
Name: count, dtype: int64


### Training and Test size as expected 

In [12]:
# Train Logistic Regression Model
log_model = LogisticRegression(max_iter=200)
log_model.fit(X_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
# Train Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

In [14]:
# Make predictions on both Algorithms
y_pred_log = log_model.predict(X_test)
y_pred_rf = rf_model.predict(X_test)

In [15]:
# Evaluate Performance for Logistic Regression 
print("Accuracy:", accuracy_score(y_test, y_pred_log))
print("precision:", precision_score(y_test, y_pred_log))
print("Recall:", recall_score(y_test, y_pred_log))
print("F1 score:", f1_score(y_test, y_pred_log))
print("\nClassification Report:", classification_report(y_test, y_pred_log))

Accuracy: 0.8707482993197279
precision: 0.6666666666666666
Recall: 0.14634146341463414
F1 score: 0.24

Classification Report:               precision    recall  f1-score   support

           0       0.88      0.99      0.93       253
           1       0.67      0.15      0.24        41

    accuracy                           0.87       294
   macro avg       0.77      0.57      0.58       294
weighted avg       0.85      0.87      0.83       294



In [16]:
# Evaluate Performance for Random Forest
accuracy = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy: ", accuracy)
print("\n Classification Report: \n", classification_report(y_test, y_pred_rf))

Random Forest Accuracy:  0.8741496598639455

 Classification Report: 
               precision    recall  f1-score   support

           0       0.88      0.99      0.93       253
           1       0.75      0.15      0.24        41

    accuracy                           0.87       294
   macro avg       0.81      0.57      0.59       294
weighted avg       0.86      0.87      0.84       294



### Observations from Tree-Based Models

Both Logistic Regression and Random Forest achieved similar performance (~87% accuracy), but suffer from poor recall (0.15). The models are predicting "Stays" for most employees due to class imbalance in the dataset (83% non-attrition vs 17% attrition).

This low recall means we're missing 85% of employees who actually leave - a critical business problem where false negatives are costly.

Next, we'll evaluate KNN (a distance-based algorithm) on scaled features to see if a different approach improves attrition detection.

In [31]:
# Scale features 
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [32]:
# Experiment with diff k values
for k in range(1, 11):
    # initialize k-NN model
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)

    # predict on my test data
    y_pred = knn.predict(X_test)

    # Evaluate performance
    accuracy = accuracy_score(y_test, y_pred)
    print(f"k={k}, Accuracy: {accuracy:.2f}")

k=1, Accuracy: 0.82
k=2, Accuracy: 0.85
k=3, Accuracy: 0.85
k=4, Accuracy: 0.86
k=5, Accuracy: 0.86
k=6, Accuracy: 0.86
k=7, Accuracy: 0.87
k=8, Accuracy: 0.87
k=9, Accuracy: 0.87
k=10, Accuracy: 0.87


In [33]:
# Train final KNN with k=7
knn_final = KNeighborsClassifier(n_neighbors=7)
knn_final.fit(X_train, y_train)
y_pred_knn = knn_final.predict(X_test)

In [34]:
# Check full metrics
print(classification_report(y_test, y_pred_knn))

              precision    recall  f1-score   support

           0       0.88      0.99      0.93       253
           1       0.75      0.15      0.24        41

    accuracy                           0.87       294
   macro avg       0.81      0.57      0.59       294
weighted avg       0.86      0.87      0.84       294



### Critical Issue: Class Imbalance Impact

All three models achieved similar accuracy (~87%) but suffered from the same critical flaw: **extremely poor recall (0.15)**. This means the models are only identifying 15% of employees who actually leave, missing 85% of attrition cases.

**Root Cause:** The dataset has significant class imbalance (83% non-attrition vs 17% attrition). The models learned to predict "Stays" for almost everyone because that strategy maximizes overall accuracy while ignoring the minority class.

**Business Impact:** In a real-world scenario, this is unacceptable. The cost of missing an employee who's about to leave (false negative) is much higher than incorrectly flagging someone as at-risk (false positive). We need models that can actually detect attrition, not just achieve high accuracy by predicting the majority class.

**Solution:** We'll address this imbalance through class weighting and resampling techniques to force the models to pay more attention to the minority class (employees who leave).

In [35]:
# Initialize SMOTE
smote = SMOTE(random_state=42)


In [37]:
# Resample training data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [38]:
# Fit Smote For Logistic Regression
log_reg = LogisticRegression(max_iter=200, class_weight='balanced', random_state=42)
log_reg.fit(X_train, y_train)
y_pred_log_balanced = log_reg.predict(X_test)
log_reg_smote = LogisticRegression(max_iter=200, random_state=42)
log_reg_smote.fit(X_train_resampled, y_train_resampled)


In [39]:
# Fit Smote For Random Forest
rf_model = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf_balanced = rf_model.predict(X_test)
rf_smote = RandomForestClassifier(random_state=42)
rf_smote.fit(X_train_resampled, y_train_resampled)


In [40]:
# Scale the resampled data for KNN
X_train_resampled_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# Train KNN on SMOTE data
knn_smote = KNeighborsClassifier(n_neighbors=7)
knn_smote.fit(X_train_resampled_scaled, y_train_resampled)
y_pred_knn_smote = knn_smote.predict(X_test_scaled)

print("KNN (SMOTE):")
print(classification_report(y_test, y_pred_knn_smote))

KNN (SMOTE):
              precision    recall  f1-score   support

           0       0.92      0.70      0.80       253
           1       0.26      0.63      0.37        41

    accuracy                           0.69       294
   macro avg       0.59      0.67      0.58       294
weighted avg       0.83      0.69      0.74       294



In [41]:
# Predict with SMOTE models
y_pred_log_smote = log_reg_smote.predict(X_test)
y_pred_rf_smote = rf_smote.predict(X_test)

In [30]:
# Print all reports
print("Logistic Regression (Balanced):")
print(classification_report(y_test, y_pred_log_balanced))

print("\nRandom Forest (Balanced):")
print(classification_report(y_test, y_pred_rf_balanced))

print("\nLogistic Regression (SMOTE):")
print(classification_report(y_test, y_pred_log_smote))

print("\nRandom Forest (SMOTE):")
print(classification_report(y_test, y_pred_rf_smote))

Logistic Regression (Balanced):
              precision    recall  f1-score   support

           0       0.94      0.74      0.83       253
           1       0.32      0.73      0.44        41

    accuracy                           0.74       294
   macro avg       0.63      0.74      0.64       294
weighted avg       0.86      0.74      0.78       294


Random Forest (Balanced):
              precision    recall  f1-score   support

           0       0.88      0.99      0.93       253
           1       0.75      0.15      0.24        41

    accuracy                           0.87       294
   macro avg       0.81      0.57      0.59       294
weighted avg       0.86      0.87      0.84       294


Logistic Regression (SMOTE):
              precision    recall  f1-score   support

           0       0.94      0.74      0.83       253
           1       0.31      0.71      0.43        41

    accuracy                           0.73       294
   macro avg       0.62      0.72      

In [None]:
# Create Model Comparison Table 
results = {
    'Model': ['Logistic Regression', 'Random Forest', 'KNN', 
              'Log Reg (Balanced)', 'RF (Balanced)', 
              'Log Reg (SMOTE)', 'RF (SMOTE)', 'KNN (SMOTE)'],
    'Accuracy': [0.87, 0.87, 0.87, 0.74, 0.87, 0.73, 0.88, 0.69],
    'Precision': [0.67, 0.75, 0.75, 0.32, 0.75, 0.31, 0.65, 0.26],
    'Recall': [0.15, 0.15, 0.15, 0.73, 0.15, 0.71, 0.27, 0.63],
    'F1-Score': [0.24, 0.24, 0.24, 0.44, 0.24, 0.43, 0.38, 0.37]
}

comparison_df = pd.DataFrame(results)
print(comparison_df)

comparison_df.to_csv(r'deployment\employee-attrition-prediction\reports\model_comparison.csv', index=False)

                 Model  Accuracy  Precision  Recall  F1-Score
0  Logistic Regression      0.87       0.67    0.15      0.24
1        Random Forest      0.87       0.75    0.15      0.24
2                  KNN      0.87       0.75    0.15      0.24
3   Log Reg (Balanced)      0.74       0.32    0.73      0.44
4        RF (Balanced)      0.87       0.75    0.15      0.24
5      Log Reg (SMOTE)      0.73       0.31    0.71      0.43
6           RF (SMOTE)      0.88       0.65    0.27      0.38
7          KNN (SMOTE)      0.69       0.26    0.63      0.37


In [None]:
# Save best model (Logistic Regression Balanced based on recall)
with open(r'deployment\employee-attrition-prediction\deployment\model.pkl', 'wb') as f:
    pickle.dump(log_reg, f)

## Conclusion and Model Selection

After extensive experimentation with multiple algorithms and class imbalance techniques, we identified **Logistic Regression with Balanced Class Weights** as the optimal model for deployment.

### Final Model Performance
- **Recall: 0.73** - Successfully identifies 73% of employees who will leave (up from 15% in baseline models)
- **Precision: 0.32** - 32% of flagged employees actually leave
- **Accuracy: 0.74** - Overall correctness
- **F1-Score: 0.44** - Balanced harmonic mean

### Why This Model?

For employee attrition prediction, **recall is the most critical metric**. Missing an employee who's about to leave (false negative) is far more costly to the business than incorrectly flagging someone as at-risk (false positive). Early intervention with at-risk employees can prevent costly turnover.

The 73% recall means HR can proactively engage with nearly three-quarters of employees considering departure, enabling:
- Targeted retention conversations
- Career development interventions
- Compensation adjustments
- Work environment improvements

While precision is lower (32%), this trade-off is acceptable given that follow-up conversations with flagged employees are relatively low-cost compared to unexpected resignations.

### Model Saved
The trained Logistic Regression model has been serialized and saved to `deployment/model.pkl` for production use.