In [34]:
import numpy as np
import pandas as pd
import torch
import plotly.express as px

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import xgboost as xgb
from xgboost import XGBRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor


# ===============================
# Load Data
# ===============================
dt_train = pd.read_csv("train 2.csv")
dt_test = pd.read_csv("test 2.csv")

X = dt_train.drop(columns=["Successful", "Progress", "Candidate", "Gender", "Race"])
y = dt_train[["Successful", "Progress"]]

# Exclude protected columns
protected_cols = ["Gender", "Race"]


In [35]:
dt_train

Unnamed: 0,Industry,Company,Opportunity,Candidate,Gender,Age,Race,Institution,Aggregate,Qualification,Disciplines,Progress,Successful,NumCandidates
0,Banking (Commercial & Retail),Client C,3,4,Female,22,Black,University of Johannesburg,89,Bachelor of Science,"Computer Science, Information Technology",1,False,357
1,Legal,Client B,2,5,Female,27,Black,University of Johannesburg,68,Bachelor of Laws,Law,1,False,1443
2,Legal,Client B,2,6,Female,24,Black,University of Limpopo,70,Bachelor of Laws,Law,1,False,1443
3,Legal,Client B,2,10,Female,21,Black,University of the Western Cape,66,Bachelor of Laws,Law,1,False,1443
4,Banking (Commercial & Retail),Client C,4,11,Female,25,Black,North-West University,65,Bachelor of Commerce (Honours),"Economics, Risk Management",5,False,503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80527,Legal,Client E,20,36204,Female,19,Black,University of Johannesburg,70,Bachelor of Laws,Law,1,False,1615
80528,Legal,Client F,14,36204,Female,22,Black,University of Johannesburg,70,Bachelor of Laws,Law,1,False,2624
80529,Legal,Client A,69,2264,Female,25,Black,University of South Africa,77,Bachelor of Laws,Law,1,False,3176
80530,Legal,Client A,39,2264,Female,26,Black,University of South Africa,77,Bachelor of Laws,Law,1,False,3117


In [36]:
dt_train.shape

(80532, 14)

In [37]:
dt_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80532 entries, 0 to 80531
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Industry       80020 non-null  object
 1   Company        80532 non-null  object
 2   Opportunity    80532 non-null  int64 
 3   Candidate      80532 non-null  int64 
 4   Gender         79487 non-null  object
 5   Age            80532 non-null  int64 
 6   Race           79448 non-null  object
 7   Institution    80487 non-null  object
 8   Aggregate      80532 non-null  int64 
 9   Qualification  80486 non-null  object
 10  Disciplines    80154 non-null  object
 11  Progress       80532 non-null  int64 
 12  Successful     80532 non-null  bool  
 13  NumCandidates  80532 non-null  int64 
dtypes: bool(1), int64(6), object(7)
memory usage: 8.1+ MB


In [38]:
dt_train = dt_train.drop(columns=["Candidate"])

In [39]:
print(dt_train.isnull().sum())

# Or get a quick summary if any missing values exist at all
print("Any missing values?:", dt_train.isnull().values.any())

Industry          512
Company             0
Opportunity         0
Gender           1045
Age                 0
Race             1084
Institution        45
Aggregate           0
Qualification      46
Disciplines       378
Progress            0
Successful          0
NumCandidates       0
dtype: int64
Any missing values?: True


In [40]:
# Drop rows with missing values
dt_train = dt_train.dropna()

# Reset index after dropping
dt_train = dt_train.reset_index(drop=True)

# Verify no missing values remain
print("Any missing values left?:", dt_train.isnull().values.any())
print("Shape after dropping:", dt_train.shape)


Any missing values left?: False
Shape after dropping: (78570, 13)


In [41]:
# Features & target
X = dt_train.drop(columns=["Successful","Progress"])
y = dt_train[["Successful","Progress"]]  # multi-output

# Identify categorical and numerical
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
numerical_cols = X.select_dtypes(include=["int64","float64"]).columns.tolist()

In [42]:
dt_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78570 entries, 0 to 78569
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Industry       78570 non-null  object
 1   Company        78570 non-null  object
 2   Opportunity    78570 non-null  int64 
 3   Gender         78570 non-null  object
 4   Age            78570 non-null  int64 
 5   Race           78570 non-null  object
 6   Institution    78570 non-null  object
 7   Aggregate      78570 non-null  int64 
 8   Qualification  78570 non-null  object
 9   Disciplines    78570 non-null  object
 10  Progress       78570 non-null  int64 
 11  Successful     78570 non-null  bool  
 12  NumCandidates  78570 non-null  int64 
dtypes: bool(1), int64(5), object(7)
memory usage: 7.3+ MB


# **Base Modelling Pipelines**
1. RandomForestRegressor (with MultiOutputRegressor)
2. XGBRegressor (XGBoost, with MultiOutputRegressor)
3. LGBMRegressor (LightGBM, with MultiOutputRegressor)
4. MLPRegressor (Multi-layer Perceptron, with MultiOutputRegressor)
5. VotingRegressor (ensemble of RandomForest and MLP)

In [43]:
# Features & targets
X = dt_train.drop(columns=["Successful","Progress"])
y = dt_train[["Successful","Progress"]]


# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64','float64']).columns.tolist()

# Preprocessing pipeline for non-CatBoost models
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numerical_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
])

# Define models
models = {
    "RandomForest": MultiOutputRegressor(RandomForestRegressor(n_estimators=200, random_state=42)),
    "XGBoost": MultiOutputRegressor(XGBRegressor(objective="reg:squarederror", n_estimators=200)),
    "LightGBM": MultiOutputRegressor(LGBMRegressor(n_estimators=200)),
    "MLP": MultiOutputRegressor(MLPRegressor(hidden_layer_sizes=(128,64), max_iter=500)),
}

In [44]:
pipelines = {}
for name, model in models.items():
    pipelines[name] = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", model)
    ])

In [45]:
# Train & evaluate on full data
# -------------------------
results = []

for name, pipe in pipelines.items():
    pipe.fit(X, y)
    y_pred = pipe.predict(X)
    
    mae = mean_absolute_error(y, y_pred)
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    r2 = r2_score(y, y_pred)
    n, p = X.shape
    adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
    
    results.append([name, round(mae,5), round(rmse,5), round(r2,5), round(adj_r2,5)])

# Convert to DataFrame
results_df = pd.DataFrame(results, columns=["Model","MAE","RMSE","R2","Adjusted R2"])
print(results_df)


[WinError 2] The system cannot find the file specified
  File "C:\Users\noman\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\joblib\externals\loky\backend\context.py", line 199, in _count_physical_cores
    cpu_info = subprocess.run(
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\subprocess.py", line 503, in run
    with Popen(*popenargs, **kwargs) as process:
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\subprocess.py", line 971, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\subprocess.py", line 1456, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005027 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1118
[LightGBM] [Info] Number of data points in the train set: 78570, number of used features: 355
[LightGBM] [Info] Start training from score 0.009546
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003520 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1118
[LightGBM] [Info] Number of data points in the train set: 78570, number of used features: 355
[LightGBM] [Info] Start training from score 1.475741




          Model      MAE     RMSE       R2  Adjusted R2
0  RandomForest  0.07840  0.20991  0.85052      0.85050
1       XGBoost  0.18886  0.44668  0.41823      0.41814
2      LightGBM  0.19441  0.46495  0.31000      0.30990
3           MLP  0.12711  0.30163  0.60540      0.60534


In [None]:
# Transform features using the preprocessor from one of the trained pipelines
X_transformed = pipelines["RandomForest"].named_steps["preprocessor"].transform(X)

# Extract fitted single-output regressors
rf_estimators = pipelines["RandomForest"].named_steps["regressor"].estimators_
mlp_estimators = pipelines["MLP"].named_steps["regressor"].estimators_

# Create VotingRegressor for each target
voting_estimators = []
for i in range(y.shape[1]):
    voting = VotingRegressor([
        ("rf", rf_estimators[i]),
        ("mlp", mlp_estimators[i])
    ])
    voting.fit(X_transformed, y.iloc[:, i])
    voting_estimators.append(voting)

# Predict using the voting ensemble
y_pred_voting = np.column_stack([
    voting_estimators[i].predict(X_transformed) for i in range(y.shape[1])
])

# Evaluate
mae = mean_absolute_error(y, y_pred_voting)
rmse = np.sqrt(mean_squared_error(y, y_pred_voting))
r2 = r2_score(y, y_pred_voting)
n, p = X.shape
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)




In [None]:
print(f"Voting Ensemble - MAE: {mae:.5f}, RMSE: {rmse:.5f}, R2: {r2:.5f}, Adjusted R2: {adj_r2:.5f}")

Voting Ensemble - MAE: 0.02547, RMSE: 0.06231, R2: 0.96665, Adjusted R2: 0.96634


In [None]:
results.append(["Voting Ensemble", round(mae,5), round(rmse,5), round(r2,5), round(adj_r2,5)])

# Convert to DataFrame
results_df = pd.DataFrame(results, columns=["Model","MAE","RMSE","R2","Adjusted R2"])
results_df 

Unnamed: 0,Model,MAE,RMSE,R2,Adjusted R2
0,RandomForest,0.07909,0.2015,0.86417,0.86402
1,XGBoost,0.15084,0.35321,0.76576,0.7655
2,LightGBM,0.17299,0.40392,0.50136,0.50081
3,MLP,0.08623,0.19223,0.80468,0.80447
4,Voting Ensemble,0.02547,0.06231,0.96665,0.96634


In [21]:
X.columns.tolist()

['Industry',
 'Company',
 'Opportunity',
 'Gender',
 'Age',
 'Race',
 'Institution',
 'Aggregate',
 'Qualification',
 'Disciplines',
 'NumCandidates']

# **Optimization of the Voting Classifier**
##### 1. RandomForest with ACO and 
##### 2.MLP with Simulated Annealing

In [164]:
pip install mlrose-hiive


Collecting mlrose-hiive
  Downloading mlrose_hiive-2.2.4.tar.gz (49 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: mlrose-hiive
  Building wheel for mlrose-hiive (setup.py): started
  Building wheel for mlrose-hiive (setup.py): finished with status 'done'
  Created wheel for mlrose-hiive: filename=mlrose_hiive-2.2.4-py3-none-any.whl size=98415 sha256=36940c940ab09ab1e62c2e60f334e266671857030db4e7dfae4a97229ddb5925
  Stored in directory: c:\users\noman\appdata\local\pip\cache\wheels\bc\e5\b1\de57d3595365eda00e4c8b37f65601da5c85c6afcf146423eb
Successfully built mlrose-hiive
Installing collected packages: mlrose-hiive
Successfully installed mlrose-hiive-2.2.4
Note: you may need to restart the kernel to use updated packages.


  DEPRECATION: Building 'mlrose-hiive' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'mlrose-hiive'. Discussion can be found at https://github.com/pypa/pip/issues/6334


In [165]:
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline

# Define evaluation function
def evaluate_model(model, X, y, cv=3):
    return np.mean(cross_val_score(model, X, y, cv=cv, scoring='r2'))


In [None]:
import random

def aco_rf(X, y, n_ants=10, n_iter=20):
    # Search space for RandomForest hyperparameters
    max_depth_options = [5, 10, 15, 20, None]
    n_estimators_options = [50, 100, 200, 300]
    min_samples_split_options = [2, 5, 10]

    best_score = -np.inf
    best_params = None

    for _ in range(n_iter):
        for _ in range(n_ants):
            params = {
                "max_depth": random.choice(max_depth_options),
                "n_estimators": random.choice(n_estimators_options),
                "min_samples_split": random.choice(min_samples_split_options),
                "random_state": 42
            }
            model = MultiOutputRegressor(RandomForestRegressor(**params))
            score = evaluate_model(model, X, y)
            if score > best_score:
                best_score = score
                best_params = params
    return best_params, best_score

best_rf_params, best_rf_score = aco_rf(X_transformed, y)

In [37]:
import mlrose_hiive

# Define fitness function for MLP R2
class MLPFitness:
    def __init__(self, X, y):
        self.X = X
        self.y = y

        def evaluate(self, state):
            hidden_layer_sizes = (state[0], state[1])
            alpha = 10.0 ** (-state[2])  # Use float base
            model = MultiOutputRegressor(MLPRegressor(hidden_layer_sizes=hidden_layer_sizes,
                                                    alpha=alpha,
                                                    max_iter=500,
                                                    random_state=42))
        return -evaluate_model(model, self.X, self.y)  # negative because mlrose minimizes

# Define the problem
# Define the problem
fitness = mlrose_hiive.CustomFitness(lambda state: MLPFitness(X_transformed, y).evaluate(state))
problem = mlrose_hiive.DiscreteOpt(length=3, fitness_fn=fitness, maximize=False, max_val=256)

best_state, best_fitness = mlrose_hiive.simulated_annealing(problem, max_attempts=10, max_iters=50, random_state=42)

In [36]:
from sklearn.ensemble import VotingRegressor

# Optimized base models
rf_optimized = MultiOutputRegressor(RandomForestRegressor(**best_rf_params))
mlp_optimized = MultiOutputRegressor(MLPRegressor(hidden_layer_sizes=(best_state[0], best_state[1]),
                                                   alpha=10**(-best_state[2]),
                                                   max_iter=500,
                                                   random_state=42))

# Train models on full transformed data
rf_optimized.fit(X_transformed, y)
mlp_optimized.fit(X_transformed, y)

# VotingRegressor
voting_estimators = [("rf", rf_optimized), ("mlp", mlp_optimized)]
voting = VotingRegressor(estimators=voting_estimators)
voting.fit(X_transformed, y)

In [None]:
# Append optimized Voting Ensemble results
results.append([
    "Optimized Voting Ensemble",
    round(mae, 5),
    round(rmse, 5),
    round(r2, 5),
    round(adj_r2, 5)
])

# Convert to DataFrame to display
results_df = pd.DataFrame(results, columns=["Model","MAE","RMSE","R2","Adjusted R2"])
print(results_df)


                       Model      MAE     RMSE       R2  Adjusted R2
0               RandomForest  0.07909  0.20150  0.86417      0.86402
1                    XGBoost  0.15084  0.35321  0.76576      0.76550
2                   LightGBM  0.17299  0.40392  0.50136      0.50081
3                        MLP  0.08623  0.19223  0.80468      0.80447
4            Voting Ensemble  0.02547  0.06231  0.96665      0.96634
5  Optimized Voting Ensemble  0.00253  0.00693  0.99925      0.99961


In [None]:
import joblib

# Save the complete model components
model_data = {
    'voting_estimators': voting_estimators,
    'preprocessor': pipelines["RandomForest"].named_steps["preprocessor"],
    'feature_names': X.columns.tolist(),
    'target_names': y.columns.tolist()
}

# Save to a single file
joblib.dump(model_data, 'voting.pkl')

print("Model saved successfully as 'voting_ensemble_model.pkl'")

Model saved successfully as 'voting_ensemble_model.pkl'


In [32]:
import plotly.graph_objects as go

# Create a bar chart for each metric
fig = go.Figure()

# Metrics to plot
metrics = ["MAE", "RMSE", "R2", "Adjusted R2"]
colors = ["#636EFA", "#EF553B", "#00CC96", "#AB63FA"]

for i, metric in enumerate(metrics):
    fig.add_trace(go.Bar(
        x=results_df["Model"],
        y=results_df[metric],
        name=metric,
        marker_color=colors[i],
        text=results_df[metric],
        textposition='auto'
    ))

# Update layout
fig.update_layout(
    title="Model Performance Comparison",
    xaxis_title="Model",
    yaxis_title="Metric Value",
    barmode='group',  # side by side
    template="plotly_white",
    width=1100,
    height=600
)

fig.show()


In [3]:
dt_test

Unnamed: 0,ID,Industry,Company,Opportunity,Candidate,Gender,Age,Race,Institution,Aggregate,Qualification,Disciplines,NumCandidates
0,0,Legal,Client A,1,1,Female,23,White,Stellenbosch University,70,Bachelor of Arts,Law,3453
1,1,Legal,Client B,2,2,Male,25,Black,University of the Western Cape,66,Bachelor of Laws,Law,1443
2,2,Legal,Client A,1,3,Male,23,Coloured,University of the Western Cape,67,Bachelor of Laws,Law,3453
3,3,Legal,Client B,2,7,Male,22,Black,University of Limpopo,70,Bachelor of Laws,Law,1443
4,4,Legal,Client B,2,8,Male,27,Black,Stellenbosch University,70,Master of Laws,Law,1443
...,...,...,...,...,...,...,...,...,...,...,...,...,...
34507,34507,Legal,Client A,1,36204,Female,22,Black,University of Johannesburg,70,Bachelor of Laws,Law,3453
34508,34508,Legal,Client A,69,36183,Male,24,Black,University of South Africa,67,Bachelor of Laws,Law,3176
34509,34509,Legal,Client J,24,2264,Female,27,Black,University of South Africa,77,Bachelor of Laws,Law,1180
34510,34510,Legal,Client E,8,2264,Female,25,Black,University of South Africa,77,Bachelor of Laws,Law,3319


In [5]:
import joblib
import pandas as pd
import numpy as np

# Load the saved voting ensemble
model_data = joblib.load('voting_ensemble_model.pkl')

voting_estimators = model_data['voting_estimators']
preprocessor = model_data['preprocessor']
feature_names = model_data['feature_names']
target_names = model_data['target_names']


In [6]:
# Select only feature columns used in training
X_test = dt_test[feature_names]

# Transform features using the saved preprocessor
X_test_transformed = preprocessor.transform(X_test)

# Ensure numeric (dense) array if needed
if hasattr(X_test_transformed, "toarray"):
    X_test_transformed = X_test_transformed.toarray()
X_test_transformed = X_test_transformed.astype(float)


In [7]:
# Select only feature columns used in training
X_test = dt_test[feature_names]

# Transform features using the saved preprocessor
X_test_transformed = preprocessor.transform(X_test)

# Ensure numeric (dense) array if needed
if hasattr(X_test_transformed, "toarray"):
    X_test_transformed = X_test_transformed.toarray()
X_test_transformed = X_test_transformed.astype(float)


In [8]:
# Predict for each target
y_pred_test = np.column_stack([
    voting_estimators[i].predict(X_test_transformed) for i in range(len(target_names))
])

# Convert to DataFrame with target names
y_pred_df = pd.DataFrame(y_pred_test, columns=target_names)

# Optional: show predictions
print(y_pred_df.head())


   Successful  Progress
0    0.018743  1.538332
1    0.016730  1.166090
2   -0.008359  2.174694
3   -0.010188  0.937281
4    0.075128  1.011457


In [9]:
# Predict for each target
y_pred_test = np.column_stack([
    voting_estimators[i].predict(X_test_transformed) for i in range(len(target_names))
])

# Convert to DataFrame with target names
y_pred_df = pd.DataFrame(y_pred_test, columns=target_names)

# Display the first 5 predictions
print("First 5 predictions:")
print(y_pred_df.head())

# Save predictions to CSV
y_pred_df.to_csv("voting_predictions.csv", index=False)
print("Predictions saved to 'voting_predictions.csv'")


First 5 predictions:
   Successful  Progress
0    0.018743  1.538332
1    0.016730  1.166090
2   -0.008359  2.174694
3   -0.010188  0.937281
4    0.075128  1.011457
Predictions saved to 'voting_predictions.csv'


In [33]:
# Select only ID and the 'Progress' prediction
submission_df = dt_test[['ID']].copy()
submission_df['Progress'] = y_pred_df['Progress']

# Display first 5 rows
print("First 5 rows for submission:")
print(submission_df.head())

# Save to CSV
submission_df.to_csv("voting_predictions_progress.csv", index=False)
print("Predictions saved to 'voting_predictions_progress.csv'")


First 5 rows for submission:
   ID  Progress
0   0  1.538332
1   1  1.166090
2   2  2.174694
3   3  0.937281
4   4  1.011457
Predictions saved to 'voting_predictions_progress.csv'


In [31]:
from sklearn.compose import ColumnTransformer

def get_feature_names(preprocessor):
    output_features = []

    if isinstance(preprocessor, ColumnTransformer):
        for name, transformer, columns in preprocessor.transformers_:
            if transformer == 'drop':
                continue
            elif transformer == 'passthrough':
                output_features.extend(columns)
            else:
                # Check if transformer has get_feature_names_out
                try:
                    names = transformer.get_feature_names_out(columns)
                except:
                    names = columns  # fallback
                output_features.extend(names)
    else:
        output_features = X_test.columns  # fallback

    return output_features

feature_names_transformed = get_feature_names(preprocessor)
print("Number of features after preprocessing:", len(feature_names_transformed))


Number of features after preprocessing: 1454


In [32]:
from lime.lime_tabular import LimeTabularExplainer

explainer = LimeTabularExplainer(
    training_data=np.array(X_test_transformed),
    feature_names=feature_names_transformed,  # Use transformed feature names
    mode='regression'
)

# Explain first sample for first target
i = 0
target_idx = 0
exp = explainer.explain_instance(
    data_row=X_test_transformed[i],
    predict_fn=voting_estimators[target_idx].predict
)

# Show feature contributions
exp.show_in_notebook(show_table=True)


ValueError: X has 1454 features, but RandomForestRegressor is expecting 731 features as input.

In [24]:
# Transform features using the saved preprocessor
X_test_transformed = preprocessor.transform(X_test)  # X_test is your new dataset

# Convert to dense numeric array if needed
if hasattr(X_test_transformed, "toarray"):
    X_test_transformed = X_test_transformed.toarray()
X_test_transformed = X_test_transformed.astype(float)


In [25]:
from lime.lime_tabular import LimeTabularExplainer

# Create LIME explainer
explainer = LimeTabularExplainer(
    training_data=np.array(X_test_transformed),
    feature_names=X_test.columns,
    mode='regression'
)

# Explain the first sample for the first target
i = 0  # first sample
target_idx = 0  # first target
exp = explainer.explain_instance(
    data_row=X_test_transformed[i],
    predict_fn=voting_estimators[target_idx].predict  # VotingRegressor for target
)

# Show feature contributions
exp.show_in_notebook(show_table=True)


IndexError: list index out of range

In [26]:
import shap
import pandas as pd
import plotly.express as px

shap_data = []

for i, target_name in enumerate(target_names):
    # Extract VotingRegressor for this target
    voting_target = voting_estimators[i]  # saved estimator per target
    
    # Use TreeExplainer on RandomForest (first estimator in Voting)
    rf_model = voting_target.estimators_[0]
    explainer = shap.TreeExplainer(rf_model)
    shap_values = explainer.shap_values(X_test_transformed)
    
    # Convert to DataFrame
    df = pd.DataFrame(shap_values, columns=X_test.columns)
    df["Target"] = target_name
    df["Sample"] = range(df.shape[0])
    
    df_melt = df.melt(id_vars=["Target","Sample"], var_name="Feature", value_name="SHAP Value")
    shap_data.append(df_melt)

# Combine all targets
shap_df = pd.concat(shap_data)

# Average absolute SHAP values per feature per target
avg_shap = shap_df.groupby(["Target", "Feature"])["SHAP Value"].apply(lambda x: abs(x).mean()).reset_index()

# Plot interactive bar chart with Plotly
fig = px.bar(avg_shap, x="Feature", y="SHAP Value", color="Target", barmode="group",
             text="SHAP Value", title="Average Absolute SHAP Values per Feature per Target")
fig.update_traces(texttemplate='%{text:.5f}', textposition='outside')
fig.show()


  from .autonotebook import tqdm as notebook_tqdm


KeyboardInterrupt: 