In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, roc_auc_score,
    confusion_matrix, classification_report
)
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Define paths
MODEL_SELECTION_OUTPUT_PATH = 'C:/Users/SHRI/Documents/DS/DS_Projects/Employment_Analysis/outputs/model_selection'

# File paths for datasets
TRAIN_FEATURES_PATH = 'C:/Users/SHRI/Documents/DS/DS_Projects/Employment_Analysis/data/train_features.csv'
TEST_FEATURES_PATH = 'C:/Users/SHRI/Documents/DS/DS_Projects/Employment_Analysis/data/test_features.csv'
TRAIN_TARGET_ATTRITION_PATH = 'C:/Users/SHRI/Documents/DS/DS_Projects/Employment_Analysis/data/train_target_attrition.csv'
TEST_TARGET_ATTRITION_PATH = 'C:/Users/SHRI/Documents/DS/DS_Projects/Employment_Analysis/data/test_target_attrition.csv'
TRAIN_TARGET_STATUS_PATH = 'C:/Users/SHRI/Documents/DS/DS_Projects/Employment_Analysis/data/train_target_status.csv'
TEST_TARGET_STATUS_PATH = 'C:/Users/SHRI/Documents/DS/DS_Projects/Employment_Analysis/data/test_target_status.csv'

# Pickle files for preprocessing
SCALER_PATH = 'C:/Users/SHRI/Documents/DS/DS_Projects/Employment_Analysis/models/scaler.pkl'
LABEL_ENCODER_PATH = 'C:/Users/SHRI/Documents/DS/DS_Projects/Employment_Analysis/models/label_encoder.pkl'

# Load scaler and label encoder
scaler = joblib.load(SCALER_PATH)
label_encoder = joblib.load(LABEL_ENCODER_PATH)


In [4]:
# Load datasets
X_train = pd.read_csv(TRAIN_FEATURES_PATH)
X_test = pd.read_csv(TEST_FEATURES_PATH)
y_train_attrition = pd.read_csv(TRAIN_TARGET_ATTRITION_PATH).squeeze()  # Squeeze to 1D array
y_test_attrition = pd.read_csv(TEST_TARGET_ATTRITION_PATH).squeeze()
y_train_status = pd.read_csv(TRAIN_TARGET_STATUS_PATH).squeeze()
y_test_status = pd.read_csv(TEST_TARGET_STATUS_PATH).squeeze()

# Confirm data shapes
print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
print(f"y_train_attrition shape: {y_train_attrition.shape}, y_test_attrition shape: {y_test_attrition.shape}")
print(f"y_train_status shape: {y_train_status.shape}, y_test_status shape: {y_test_status.shape}")


X_train shape: (39722, 42), X_test shape: (9931, 42)
y_train_attrition shape: (39722,), y_test_attrition shape: (9931,)
y_train_status shape: (39722,), y_test_status shape: (9931,)


In [5]:
# Initialize models
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'XGBoost': GradientBoostingClassifier(random_state=42),  # XGBoost equivalent for scikit-learn
    'LightGBM': GradientBoostingClassifier(random_state=42)  # LightGBM equivalent for scikit-learn
}

# Parameters for hyperparameter tuning
param_grids = {
    'RandomForest': {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10]
    },
    'DecisionTree': {
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10]
    },
    'XGBoost': {
        'n_estimators': [100, 200],
        'max_depth': [3, 6, 9],
        'learning_rate': [0.01, 0.1, 0.2]
    },
    'LightGBM': {
        'n_estimators': [100, 200],
        'max_depth': [3, 6, 9],
        'learning_rate': [0.01, 0.1, 0.2]
    }
}


In [6]:
def evaluate_model(model_name, model, X_train, y_train, X_test, y_test, param_grid=None):
    """Train, tune, and evaluate the model."""
    # Hyperparameter tuning
    if param_grid:
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        print(f"Best parameters for {model_name}: {grid_search.best_params_}")
        print(f"Best score for {model_name}: {grid_search.best_score_}")
    else:
        best_model = model.fit(X_train, y_train)

    # Predictions
    y_pred = best_model.predict(X_test)

    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    roc_auc = roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1], multi_class='ovr') if hasattr(best_model, "predict_proba") else "N/A"
    conf_matrix = confusion_matrix(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)

    # Save results
    result = {
        'Model': model_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'ROC AUC Score': roc_auc,
        'Confusion Matrix': conf_matrix,
        'Classification Report': classification_rep
    }

    # Save confusion matrix as image
    os.makedirs(MODEL_SELECTION_OUTPUT_PATH, exist_ok=True)
    conf_matrix_df = pd.DataFrame(conf_matrix, index=np.unique(y_test), columns=np.unique(y_test))
    plt.figure(figsize=(10, 7))
    sns.heatmap(conf_matrix_df, annot=True, fmt='g', cmap='Blues')
    plt.title(f'{model_name} - Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.savefig(MODEL_SELECTION_OUTPUT_PATH + f'/{model_name}_confusion_matrix.png')
    plt.close()

    return result


In [7]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV

# Paths to the data files
TRAIN_FEATURES_PATH = 'C:/Users/SHRI/Documents/DS/DS_Projects/Employment_Analysis/data/train_features.csv'
TEST_FEATURES_PATH = 'C:/Users/SHRI/Documents/DS/DS_Projects/Employment_Analysis/data/test_features.csv'
TRAIN_TARGET_ATTRITION_PATH = 'C:/Users/SHRI/Documents/DS/DS_Projects/Employment_Analysis/data/train_target_attrition.csv'
TEST_TARGET_ATTRITION_PATH = 'C:/Users/SHRI/Documents/DS/DS_Projects/Employment_Analysis/data/test_target_attrition.csv'
TRAIN_TARGET_STATUS_PATH = 'C:/Users/SHRI/Documents/DS/DS_Projects/Employment_Analysis/data/train_target_status.csv'
TEST_TARGET_STATUS_PATH = 'C:/Users/SHRI/Documents/DS/DS_Projects/Employment_Analysis/data/test_target_status.csv'

# Load datasets
X_train = pd.read_csv(TRAIN_FEATURES_PATH)
X_test = pd.read_csv(TEST_FEATURES_PATH)
y_train_attrition = pd.read_csv(TRAIN_TARGET_ATTRITION_PATH).squeeze()  # Squeeze to 1D array
y_test_attrition = pd.read_csv(TEST_TARGET_ATTRITION_PATH).squeeze()
y_train_status = pd.read_csv(TRAIN_TARGET_STATUS_PATH).squeeze()
y_test_status = pd.read_csv(TEST_TARGET_STATUS_PATH).squeeze()

# Identify numerical and categorical columns
numerical_columns = X_train.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = X_train.select_dtypes(include=['object']).columns

# Instantiate the scaler and fit it on the train data for numerical columns
scaler = MinMaxScaler()
scaler.fit(X_train[numerical_columns])

print(f"Numerical features after scaling: {numerical_columns}")

# Apply scaling to both train and test data for numerical columns
X_train[numerical_columns] = scaler.transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])


# Initialize LabelEncoders for each categorical column
label_encoders = {col: LabelEncoder() for col in categorical_columns}

# Fit encoders on training data and transform both train and test data
for col in categorical_columns:
    # Fit the encoder on training data
    label_encoders[col].fit(X_train[col].astype(str))
    
    # Transform training data
    X_train[col] = label_encoders[col].transform(X_train[col].astype(str))
    
    # Transform test data with a fallback for unseen labels
    X_test[col] = X_test[col].apply(
        lambda x: label_encoders[col].transform([x])[0] 
        if x in label_encoders[col].classes_ else -1
    )

print(f"Categorical features after encoding: {categorical_columns}")

# Initialize results lists
results_attrition = []
results_status = []

# Evaluate models for ATTRITION target
print("Evaluating models for ATTRITION...")
for model_name, model in models.items():
    results_attrition.append(evaluate_model(
        model_name, model, X_train, y_train_attrition, X_test, y_test_attrition, param_grids.get(model_name)
    ))

# Evaluate models for STATUS target
print("Evaluating models for STATUS...")
for model_name, model in models.items():
    results_status.append(evaluate_model(
        model_name, model, X_train, y_train_status, X_test, y_test_status, param_grids.get(model_name)
    ))


Numerical features after scaling: Index(['EmployeeID', 'age', 'length_of_service', 'department_name',
       'job_title', 'store_name', 'gender_full', 'STATUS_YEAR',
       'GENERAL APPEARANCE', 'MANNER OF SPEAKING', 'PHYSICAL CONDITION',
       'MENTAL ALERTNESS', 'SELF-CONFIDENCE', 'ABILITY TO PRESENT IDEAS',
       'COMMUNICATION SKILLS', 'Student Performance Rating', 'salary',
       'performance_score', 'manager_rating', 'self_rating',
       'work_life_balance_score', 'overtime_hours', 'working_hours',
       'employee_satisfaction_score', 'salary_hike_percent',
       'post_promotion_performance', 'peer_feedback_score', 'absenteeism_rate',
       'tenure_bucket', 'months_since_last_promotion', 'turnover_risk_index',
       'engagement_index', 'absenteeism_category', 'performance_improvement'],
      dtype='object')
Categorical features after encoding: Index(['orighiredate_key', 'terminationdate_key', 'city_name',
       'termreason_desc', 'termtype_desc', 'BUSINESS_UNIT', 'CLASS

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best parameters for DecisionTree: {'max_depth': 10, 'min_samples_split': 5}
Best score for DecisionTree: 0.788882831171251
Best parameters for XGBoost: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
Best score for XGBoost: 0.7980967718016043


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best parameters for LightGBM: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
Best score for LightGBM: 0.7980967718016043
Evaluating models for STATUS...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best parameters for RandomForest: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 100}
Best score for RandomForest: 0.7275817126430008
Best parameters for DecisionTree: {'max_depth': 10, 'min_samples_split': 2}
Best score for DecisionTree: 0.7175367883554928
Best parameters for XGBoost: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
Best score for XGBoost: 0.7275817126430008
Best parameters for LightGBM: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
Best score for LightGBM: 0.7275817126430008


In [8]:
def save_classification_report(output_path, report, model_name, target_variable):
    """Save classification report to a text file."""
    os.makedirs(output_path, exist_ok=True)
    report_file = os.path.join(output_path, f"{model_name}_{target_variable}_classification_report.txt")
    with open(report_file, "w") as file:
        file.write(report)
    print(f"Classification report saved for {model_name} ({target_variable}).")

def save_confusion_matrix(matrix, model_name, target_variable, output_path):
    """Save confusion matrix as CSV."""
    os.makedirs(output_path, exist_ok=True)
    conf_matrix_file = os.path.join(output_path, f"{model_name}_{target_variable}_confusion_matrix.csv")
    matrix.to_csv(conf_matrix_file, index=True)
    print(f"Confusion matrix saved for {model_name} ({target_variable}).")


In [9]:
# Save results for ATTRITION
print("Saving results for ATTRITION...")
for result in results_attrition:
    model_name = result['Model']
    # Save classification report
    save_classification_report(MODEL_SELECTION_OUTPUT_PATH, result['Classification Report'], model_name, "Attrition")
    
    # Save confusion matrix as CSV
    conf_matrix_df = pd.DataFrame(
        result['Confusion Matrix'], 
        index=label_encoder.classes_, 
        columns=label_encoder.classes_
    )
    save_confusion_matrix(conf_matrix_df, model_name, "Attrition", MODEL_SELECTION_OUTPUT_PATH)

# Save results for STATUS
print("Saving results for STATUS...")
for result in results_status:
    model_name = result['Model']
    # Save classification report
    save_classification_report(MODEL_SELECTION_OUTPUT_PATH, result['Classification Report'], model_name, "Status")
    
    # Save confusion matrix as CSV
    conf_matrix_df = pd.DataFrame(
        result['Confusion Matrix'], 
        index=label_encoder.classes_, 
        columns=label_encoder.classes_
    )
    save_confusion_matrix(conf_matrix_df, model_name, "Status", MODEL_SELECTION_OUTPUT_PATH)


Saving results for ATTRITION...
Classification report saved for RandomForest (Attrition).
Confusion matrix saved for RandomForest (Attrition).
Classification report saved for DecisionTree (Attrition).
Confusion matrix saved for DecisionTree (Attrition).
Classification report saved for XGBoost (Attrition).
Confusion matrix saved for XGBoost (Attrition).
Classification report saved for LightGBM (Attrition).
Confusion matrix saved for LightGBM (Attrition).
Saving results for STATUS...
Classification report saved for RandomForest (Status).
Confusion matrix saved for RandomForest (Status).
Classification report saved for DecisionTree (Status).
Confusion matrix saved for DecisionTree (Status).
Classification report saved for XGBoost (Status).
Confusion matrix saved for XGBoost (Status).
Classification report saved for LightGBM (Status).
Confusion matrix saved for LightGBM (Status).


In [10]:
# Convert results to DataFrames for export
attrition_performance_df = pd.DataFrame(results_attrition).drop(columns=['Confusion Matrix', 'Classification Report'])
status_performance_df = pd.DataFrame(results_status).drop(columns=['Confusion Matrix', 'Classification Report'])

# Save model performances to CSV
attrition_performance_file = os.path.join(MODEL_SELECTION_OUTPUT_PATH, "attrition_model_performance_comparison.csv")
status_performance_file = os.path.join(MODEL_SELECTION_OUTPUT_PATH, "status_model_performance_comparison.csv")

attrition_performance_df.to_csv(attrition_performance_file, index=False)
status_performance_df.to_csv(status_performance_file, index=False)

print("Model performance comparison saved for both targets.")


Model performance comparison saved for both targets.


In [11]:
def save_roc_auc_plot(model, X_test, y_test, model_name, target_variable, output_path):
    """Save ROC AUC curve as an image."""
    if not hasattr(model, "predict_proba"):
        print(f"Model {model_name} does not support predict_proba. Skipping ROC AUC plot.")
        return
    
    from sklearn.metrics import roc_curve, auc
    
    # Compute ROC curve and AUC
    probas = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, probas)
    roc_auc = auc(fpr, tpr)
    
    # Plot ROC curve
    plt.figure(figsize=(10, 7))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC AUC - {model_name} ({target_variable})')
    plt.legend(loc="lower right")
    
    # Save plot
    os.makedirs(output_path, exist_ok=True)
    plot_file = os.path.join(output_path, f"{model_name}_{target_variable}_roc_auc.png")
    plt.savefig(plot_file)
    plt.close()
    print(f"ROC AUC plot saved for {model_name} ({target_variable}).")


In [12]:
def save_roc_auc_plot(model, X_test, y_test, model_name, target_variable, output_path):
    """
    Save ROC AUC curve as an image.
    
    Parameters:
        model: Trained model.
        X_test: Test feature data.
        y_test: True labels for test data.
        model_name: Name of the model.
        target_variable: Target variable being evaluated (e.g., "Attrition" or "Status").
        output_path: Directory where the plot will be saved.
    """
    from sklearn.metrics import roc_curve, auc
    import matplotlib.pyplot as plt
    import os

    if not hasattr(model, "predict_proba"):
        print(f"Model {model_name} does not support predict_proba. Skipping ROC AUC plot.")
        return

    try:
        # Compute probabilities and ROC curve
        probas = model.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, probas)
        roc_auc = auc(fpr, tpr)

        # Plot ROC curve
        plt.figure(figsize=(10, 7))
        plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'ROC AUC - {model_name} ({target_variable})')
        plt.legend(loc="lower right")

        # Save plot
        os.makedirs(output_path, exist_ok=True)
        plot_file = os.path.join(output_path, f"{model_name}_{target_variable}_roc_auc.png")
        plt.savefig(plot_file)
        plt.close()
        print(f"ROC AUC plot saved for {model_name} ({target_variable}) at {plot_file}.")
    except Exception as e:
        print(f"Error generating ROC AUC plot for {model_name} ({target_variable}): {e}")


# Save ROC AUC plots for ATTRITION
print("Generating ROC AUC plots for ATTRITION...")
for result in results_attrition:
    model_name = result.get('Model')
    model = result.get('Trained Model')  # Retrieve the trained model safely
    if model and model_name:
        save_roc_auc_plot(model, X_test, y_test_attrition, model_name, "Attrition", MODEL_SELECTION_OUTPUT_PATH)
    else:
        print(f"Skipping model due to missing information in results_attrition: {result}")

# Save ROC AUC plots for STATUS
print("Generating ROC AUC plots for STATUS...")
for result in results_status:
    model_name = result.get('Model')
    model = result.get('Trained Model')  # Retrieve the trained model safely
    if model and model_name:
        save_roc_auc_plot(model, X_test, y_test_status, model_name, "Status", MODEL_SELECTION_OUTPUT_PATH)
    else:
        print(f"Skipping model due to missing information in results_status: {result}")


Generating ROC AUC plots for ATTRITION...
Skipping model due to missing information in results_attrition: {'Model': 'RandomForest', 'Accuracy': 0.798106937871312, 'Precision': np.float64(0.6369746842783223), 'ROC AUC Score': np.float64(0.48909661878611566), 'Confusion Matrix': array([[7926,    0],
       [2005,    0]]), 'Classification Report': '              precision    recall  f1-score   support\n\n         0.0       0.80      1.00      0.89      7926\n         1.0       0.00      0.00      0.00      2005\n\n    accuracy                           0.80      9931\n   macro avg       0.40      0.50      0.44      9931\nweighted avg       0.64      0.80      0.71      9931\n'}
Skipping model due to missing information in results_attrition: {'Model': 'DecisionTree', 'Accuracy': 0.7869298157285268, 'Precision': np.float64(0.6720246009661082), 'ROC AUC Score': np.float64(0.501584670672549), 'Confusion Matrix': array([[7785,  141],
       [1975,   30]]), 'Classification Report': '          

In [15]:
# Fit and save y_true and y_pred for Attrition
print("Saving y_true and y_pred for Attrition...")
for result in results_attrition:  # Use results from earlier evaluation
    model_name = result['Model']
    model = models[model_name]  # Retrieve the model instance
    y_true = y_test_attrition  # Actual target values for Attrition
    # Ensure the model is fitted by evaluating it on Attrition data
    model.fit(X_train, y_train_attrition)
    y_pred = model.predict(X_test)  # Predictions
    save_y_true_y_pred_combined_and_individual(y_true, y_pred, model_name, "Attrition", MODEL_SELECTION_OUTPUT_PATH)

# Fit and save y_true and y_pred for Status
print("Saving y_true and y_pred for Status...")
for result in results_status:  # Use results from earlier evaluation
    model_name = result['Model']
    model = models[model_name]  # Retrieve the model instance
    y_true = y_test_status  # Actual target values for Status
    # Ensure the model is fitted by evaluating it on Status data
    model.fit(X_train, y_train_status)
    y_pred = model.predict(X_test)  # Predictions
    save_y_true_y_pred_combined_and_individual(y_true, y_pred, model_name, "Status", MODEL_SELECTION_OUTPUT_PATH)


Saving y_true and y_pred for Attrition...
Combined (y_true, y_pred) saved for RandomForest (Attrition).
y_true saved for RandomForest (Attrition).
y_pred saved for RandomForest (Attrition).
Combined (y_true, y_pred) saved for DecisionTree (Attrition).
y_true saved for DecisionTree (Attrition).
y_pred saved for DecisionTree (Attrition).
Combined (y_true, y_pred) saved for XGBoost (Attrition).
y_true saved for XGBoost (Attrition).
y_pred saved for XGBoost (Attrition).
Combined (y_true, y_pred) saved for LightGBM (Attrition).
y_true saved for LightGBM (Attrition).
y_pred saved for LightGBM (Attrition).
Saving y_true and y_pred for Status...
Combined (y_true, y_pred) saved for RandomForest (Status).
y_true saved for RandomForest (Status).
y_pred saved for RandomForest (Status).
Combined (y_true, y_pred) saved for DecisionTree (Status).
y_true saved for DecisionTree (Status).
y_pred saved for DecisionTree (Status).
Combined (y_true, y_pred) saved for XGBoost (Status).
y_true saved for XGBoo