In [3]:
# Import required libraries
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score
import matplotlib.pyplot as plt
import dalex as dx
from lime.lime_tabular import LimeTabularExplainer

# Set output directory
OUTPUT_DIR = r"C:\Users\SHRI\Documents\DS\DS_Projects\Employment_Analysis\outputs\predictive_analytics"
os.makedirs(OUTPUT_DIR, exist_ok=True)


In [4]:
# Load data
data_path = r"C:\Users\SHRI\Documents\DS\DS_Projects\Employment_Analysis\data\transformed_data.csv"
df = pd.read_csv(data_path)

# Define target columns
promotion_target = "promotion"  # Create this target column if not present
performance_target = "post_promotion_performance"

# Ensure numeric encoding for categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns
for col in categorical_columns:
    df[col] = LabelEncoder().fit_transform(df[col])

# Check for missing values
df.fillna(df.median(), inplace=True)  # Replace missing values with median for simplicity


In [5]:
# Create binary target for promotion prediction (example logic)
df[promotion_target] = (df['months_since_last_promotion'] > 12).astype(int)


In [6]:
# Define features and target for promotion prediction
X_promotion = df.drop(columns=[promotion_target, performance_target])
y_promotion = df[promotion_target]

# Train-Test split
X_train_promotion, X_test_promotion, y_train_promotion, y_test_promotion = train_test_split(
    X_promotion, y_promotion, test_size=0.2, random_state=42
)


In [7]:
# Define features and target for performance prediction
X_performance = df.drop(columns=[promotion_target, performance_target])
y_performance = df[performance_target]

# Train-Test split
X_train_performance, X_test_performance, y_train_performance, y_test_performance = train_test_split(
    X_performance, y_performance, test_size=0.2, random_state=42
)


In [8]:
# Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "LightGBM": LGBMClassifier()
}

# Define pipeline
def create_pipeline(model):
    return Pipeline([
        ('scaler', StandardScaler()), 
        ('classifier', model)
    ])


In [9]:
promotion_results = []

print("Evaluating models for promotion prediction...")
for model_name, model in models.items():
    pipeline = create_pipeline(model)
    pipeline.fit(X_train_promotion, y_train_promotion)
    
    # Predictions and evaluations
    y_pred = pipeline.predict(X_test_promotion)
    y_proba = pipeline.predict_proba(X_test_promotion)[:, 1] if hasattr(pipeline, "predict_proba") else None
    roc_auc = roc_auc_score(y_test_promotion, y_proba) if y_proba is not None else None
    report = classification_report(y_test_promotion, y_pred)
    conf_matrix = confusion_matrix(y_test_promotion, y_pred)
    
    # Save results
    promotion_results.append({
        "Model": model_name,
        "Classification Report": report,
        "Confusion Matrix": conf_matrix,
        "ROC AUC": roc_auc
    })
    
    print(f"Model: {model_name}\n{report}\n")


Evaluating models for promotion prediction...
Model: Logistic Regression
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6132
           1       1.00      1.00      1.00      3799

    accuracy                           1.00      9931
   macro avg       1.00      1.00      1.00      9931
weighted avg       1.00      1.00      1.00      9931


Model: Decision Tree
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6132
           1       1.00      1.00      1.00      3799

    accuracy                           1.00      9931
   macro avg       1.00      1.00      1.00      9931
weighted avg       1.00      1.00      1.00      9931


Model: Random Forest
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6132
           1       1.00      1.00      1.00      3799

    accuracy                           1.00      9931
   macro av

Parameters: { "use_label_encoder" } are not used.



Model: XGBoost
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6132
           1       1.00      1.00      1.00      3799

    accuracy                           1.00      9931
   macro avg       1.00      1.00      1.00      9931
weighted avg       1.00      1.00      1.00      9931


[LightGBM] [Info] Number of positive: 14913, number of negative: 24809
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005161 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2765
[LightGBM] [Info] Number of data points in the train set: 39722, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.375434 -> initscore=-0.508973
[LightGBM] [Info] Start training from score -0.508973
Model: LightGBM
              precision    recall  f1-score   support

           0       1.00     

In [13]:
from sklearn.preprocessing import LabelEncoder

# Convert y_train_performance and y_test_performance to integer labels
label_encoder = LabelEncoder()
y_train_performance = label_encoder.fit_transform(y_train_performance)
y_test_performance = label_encoder.transform(y_test_performance)

# Convert class names to strings for classification_report
class_names = [str(class_) for class_ in label_encoder.classes_]

# Print the mapping for verification
class_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Class Mapping:", class_mapping)

# Proceed with model training
performance_results = []

print("Evaluating models for performance prediction...")
for model_name, model in models.items():
    pipeline = create_pipeline(model)
    pipeline.fit(X_train_performance, y_train_performance)
    
    # Predictions and evaluations
    y_pred = pipeline.predict(X_test_performance)
    y_proba = pipeline.predict_proba(X_test_performance) if hasattr(pipeline, "predict_proba") else None
    
    # Handle ROC AUC for binary and multiclass cases
    if y_proba is not None:
        if len(set(y_test_performance)) > 2:  # Multiclass case
            roc_auc = roc_auc_score(y_test_performance, y_proba, multi_class="ovr")
        else:  # Binary case
            roc_auc = roc_auc_score(y_test_performance, y_proba[:, 1])
    else:
        roc_auc = None
    
    report = classification_report(y_test_performance, y_pred, target_names=class_names)
    conf_matrix = confusion_matrix(y_test_performance, y_pred)
    
    # Save results
    performance_results.append({
        "Model": model_name,
        "Classification Report": report,
        "Confusion Matrix": conf_matrix,
        "ROC AUC": roc_auc
    })
    
    print(f"Model: {model_name}\n{report}\n")
    if roc_auc is not None:
        print(f"ROC AUC: {roc_auc}\n")


Class Mapping: {np.int64(0): np.int64(0), np.int64(1): np.int64(1), np.int64(2): np.int64(2), np.int64(3): np.int64(3), np.int64(4): np.int64(4), np.int64(5): np.int64(5), np.int64(6): np.int64(6), np.int64(7): np.int64(7), np.int64(8): np.int64(8), np.int64(9): np.int64(9), np.int64(10): np.int64(10), np.int64(11): np.int64(11), np.int64(12): np.int64(12), np.int64(13): np.int64(13), np.int64(14): np.int64(14), np.int64(15): np.int64(15), np.int64(16): np.int64(16), np.int64(17): np.int64(17), np.int64(18): np.int64(18), np.int64(19): np.int64(19), np.int64(20): np.int64(20)}
Evaluating models for performance prediction...
Model: Logistic Regression
              precision    recall  f1-score   support

           0       0.75      0.01      0.02       266
           1       0.48      0.95      0.63       525
           2       0.40      0.42      0.41       485
           3       0.40      0.36      0.38       489
           4       0.51      0.35      0.41       488
           5    

Parameters: { "use_label_encoder" } are not used.



Model: XGBoost
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       266
           1       0.98      0.99      0.99       525
           2       0.94      0.98      0.96       485
           3       0.91      0.90      0.91       489
           4       0.86      0.90      0.88       488
           5       0.82      0.76      0.79       463
           6       0.80      0.86      0.83       486
           7       0.84      0.83      0.84       493
           8       0.75      0.75      0.75       483
           9       0.71      0.69      0.70       513
          10       0.64      0.67      0.65       466
          11       0.70      0.59      0.64       524
          12       0.68      0.60      0.64       495
          13       0.72      0.77      0.74       457
          14       0.80      0.82      0.81       517
          15       0.87      0.87      0.87       527
          16       0.89      0.92      0.90       499
          17

In [19]:
# Import DALEX and LIME libraries
import dalex as dx
import lime
from lime.lime_tabular import LimeTabularExplainer
import os
import numpy as np
import matplotlib.pyplot as plt

# Define OUTPUT_DIR for saving explanation outputs
OUTPUT_DIR = 'C:/Users/SHRI/Documents/DS/DS_Projects/Employment_Analysis/outputs/predictive_analytics/explanations'

# Ensure the output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Function to explain models using DALEX and LIME
def explain_model(model, X_train, X_test, y_train, y_test, model_name, target="promotion"):
    # DALEX explainability
    explainer = dx.Explainer(model, X_train, y_train)  # Correct DALEX instantiation
    
    # Try generating model explainability in DALEX, with error handling
    try:
        daex_model_path = os.path.join(OUTPUT_DIR, f"{model_name}_{target}_dalex_explanation.html")
        
        # Plot and save model performance, info, and profile (DALEX)
        explainer.model_performance().plot()
        explainer.model_info().plot()
        explainer.model_profile().plot()

        # Save DALEX explanation as an HTML file
        explainer.model_performance().save(daex_model_path)
    except ZeroDivisionError:
        print(f"Warning: Zero division error occurred for model {model_name} when calculating performance metrics.")
        daex_model_path = None  # Handle the error gracefully and return None

    # LIME explainability
    lime_explainer = LimeTabularExplainer(
        X_train.values,  # The training data
        mode='classification',
        training_labels=y_train.values,
        feature_names=X_train.columns,
        class_names=[f'Not {target}', target],  # Customize class names
        discretize_continuous=True
    )
    
    # Choose a random instance for explanation
    idx = np.random.randint(0, len(X_test))
    instance = X_test.iloc[idx]
    
    # Explain the instance using LIME
    lime_explanation = lime_explainer.explain_instance(
        instance.values, model.predict_proba, num_features=10
    )
    
    # Save the LIME explanation plot
    lime_explanation_image_path = os.path.join(OUTPUT_DIR, f"{model_name}_{target}_lime_explanation_{idx}.png")
    
    # Generate and save the explanation figure using pyplot
    lime_explanation.as_pyplot_figure()
    plt.savefig(lime_explanation_image_path)
    plt.close()  # Close the plot after saving
    
    return daex_model_path, lime_explanation_image_path

# Assuming models are already defined and trained, and your data is prepared
# Loop through all models and apply DALEX and LIME explainability
explanation_results = {}

for model_name, model in models.items():
    # Skip Logistic Regression or any model with compatibility issues with DALEX
    if model_name == "Logistic Regression":
        continue

    print(f"Explaining model: {model_name}")
    
    # Explain the model using DALEX and LIME for the promotion prediction task
    daex_path, lime_path = explain_model(
        model, X_train_promotion, X_test_promotion, y_train_promotion, y_test_promotion, model_name, target="promotion"
    )
    
    # Save results
    explanation_results[model_name] = {
        "DALEX": daex_path,
        "LIME": lime_path
    }

    print(f"Explanation for {model_name} saved at:\n- DALEX: {daex_path}\n- LIME: {lime_path}")


Explaining model: Decision Tree
Preparation of a new explainer is initiated

  -> data              : 39722 rows 43 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 39722 values
  -> model_class       : sklearn.tree._classes.DecisionTreeClassifier (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_proba_default at 0x000001AEF30479C0> will be used (default)
  -> predict function  : Accepts pandas.DataFrame and numpy.ndarray.
  -> predicted values  : min = 0.0, mean = 0.0, max = 0.0
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = 0.0, mean = 0.375, max = 1.0
  -> model_info        : package sklearn

A new explainer has been created!




Explanation for Decision Tree saved at:
- DALEX: None
- LIME: C:/Users/SHRI/Documents/DS/DS_Projects/Employment_Analysis/outputs/predictive_analytics/explanations\Decision Tree_promotion_lime_explanation_7111.png
Explaining model: Random Forest
Preparation of a new explainer is initiated

  -> data              : 39722 rows 43 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 39722 values
  -> model_class       : sklearn.ensemble._forest.RandomForestClassifier (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_proba_default at 0x000001AEF30479C0> will be used (default)
  -> predict function  : Accepts pandas.DataFrame and numpy.ndarray.




  -> predicted values  : min = 0.05, mean = 0.118, max = 0.17
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)




  -> residuals         : min = -0.16, mean = 0.258, max = 0.94
  -> model_info        : package sklearn

A new explainer has been created!
Explanation for Random Forest saved at:
- DALEX: None
- LIME: C:/Users/SHRI/Documents/DS/DS_Projects/Employment_Analysis/outputs/predictive_analytics/explanations\Random Forest_promotion_lime_explanation_3711.png
Explaining model: XGBoost
Preparation of a new explainer is initiated

  -> data              : 39722 rows 43 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 39722 values
  -> model_class       : xgboost.sklearn.XGBClassifier (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_proba_default at 0x000001AEF30479C0> will be used (default)
  -> predict function  : Accepts pandas.DataFrame and numpy.ndarray.
  -> predicted values  : min = 0.0914, mean = 0.124, max = 0.155
  -> model type  