In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
# CRITICAL IMPORT FOR SMOTE
from imblearn.over_sampling import SMOTE
import numpy as np

In [2]:
# --- 1. DATA LOADING, CLEANING, AND PREPARATION ---
# Load the uploaded dataset
data = pd.read_csv("/content/Employee_Attrition_Cleaned.csv")


In [3]:
# Data cleaning and feature engineering
data.fillna(method='ffill', inplace=True)
data['PromotionGap'] = data['YearsAtCompany'] - data['YearsSinceLastPromotion']

  data.fillna(method='ffill', inplace=True)


In [4]:
# Define Features ('X') and Target ('y')
X = data.drop('Attrition', axis=1)
# Convert 'Attrition' (Yes/No) to binary (1/0)
y = data['Attrition'].apply(lambda x: 1 if x == 'Yes' else 0)

In [5]:
# Identify column types for preprocessing
# Define column types
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['float64']).columns

In [6]:
# Preprocessing Pipeline (Scaler for numeric, OneHot for categorical)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'
)

In [7]:
# Split Data (Stratified split maintains class ratio in train/test)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [8]:
# --- 2. PREPROCESSING AND SMOTE APPLICATION ---
# Transform data (Fit on train, transform on both)
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Apply SMOTE to the transformed training data ONLY to balance classes
sm = SMOTE(random_state=42)
X_train_smote, y_train_smote = sm.fit_resample(X_train_transformed, y_train)

print(f"Original Training Class Distribution: \n{y_train.value_counts()}")
print(f"SMOTEd Training Class Distribution: \n{y_train_smote.value_counts()}")

Original Training Class Distribution: 
Attrition
0    986
1    190
Name: count, dtype: int64
SMOTEd Training Class Distribution: 
Attrition
0    986
1    986
Name: count, dtype: int64


In [9]:
# --- 3. MODEL TRAINING AND EVALUATION ---
# A. LOGISTIC REGRESSION WITH SMOTE
logreg_smote = LogisticRegression(max_iter=1000, random_state=42)
logreg_smote.fit(X_train_smote, y_train_smote)
y_pred_logreg = logreg_smote.predict(X_test_transformed)

print("\n\n--- LOGISTIC REGRESSION (WITH SMOTE) RESULTS ---")
print("Classification Report:")
print(classification_report(y_test, y_pred_logreg))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_logreg))




--- LOGISTIC REGRESSION (WITH SMOTE) RESULTS ---
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.81      0.86       247
           1       0.39      0.64      0.48        47

    accuracy                           0.78       294
   macro avg       0.66      0.72      0.67       294
weighted avg       0.84      0.78      0.80       294

Confusion Matrix:
[[200  47]
 [ 17  30]]


In [10]:
# B. DECISION TREE WITH SMOTE (Hyperparameter Tuned)
dt_smote = DecisionTreeClassifier(random_state=42)
param_grid = {'max_depth': [3, 5, 10, None],
              'min_samples_split': [2, 5, 10]}

grid_search_smote = GridSearchCV(dt_smote, param_grid, cv=5, scoring='f1')
# Training GridSearchCV on the balanced, SMOTEd data
grid_search_smote.fit(X_train_smote, y_train_smote)

y_pred_dt = grid_search_smote.predict(X_test_transformed)

print("\n\n--- DECISION TREE (WITH SMOTE) RESULTS ---")
print(f"Best parameters: {grid_search_smote.best_params_}")
print("Classification Report:")
print(classification_report(y_test, y_pred_dt))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_dt))



--- DECISION TREE (WITH SMOTE) RESULTS ---
Best parameters: {'max_depth': 10, 'min_samples_split': 5}
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.86      0.87       247
           1       0.33      0.36      0.34        47

    accuracy                           0.78       294
   macro avg       0.60      0.61      0.61       294
weighted avg       0.79      0.78      0.78       294

Confusion Matrix:
[[212  35]
 [ 30  17]]
