## Benchmark: trained weight-generator NN ensemble vs. uniform classifier ensembles

In [1]:
import torch
import matplotlib.pyplot as plt
import os
from sklearn.metrics import accuracy_score
import joblib
import sys
import numpy as np
import json
import time
import itertools
from sklearn.metrics import (
    accuracy_score, recall_score, f1_score, precision_score, roc_auc_score,
    top_k_accuracy_score
)
# Assuming these are available in your environment:
from ensembler import EnsemblerClassifier, FinalWeightGeneratorNN 
from classifiers import (
    SVMClassifier, RBFClassifier, RandomForestClassifier, NaiveBayesClassifier, 
    LogisticRegressionClassifier, LDAClassifier, KNNClassifier, DecisionTreeClassifier,
    AdaBoostClassifier, GBMClassifier, XGBoostClassifier
)

# --- Reproducibility ---
# Seed both RNGs up front; the benchmark loop draws its metric weights from
# numpy's global generator.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# --- Compute device ---
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# --- Input artefacts ---
CLIP_FEATURES_DIR = "clip_features"
VAL = f"{CLIP_FEATURES_DIR}/val_features.pt"
SCALER = "scaler_model.joblib"
PCA = "pca_model.joblib"
LIME = "top_k_lime_indices.joblib"

# Results of an earlier benchmark run; its first loop supplies the metrics of
# the uniform ensembles, which are re-scored here rather than recomputed.
PREVIOUS_RESULTS_FILE = "full_benchmark_results.json"

# --- Benchmark sizing ---
BENCHMARK_LOOPS = 5        # number of random weightings to evaluate
N_CLASSIFIERS = 11         # base classifiers available to an ensemble
N_WEIGHTED_METRICS = 7     # metrics combined into the weighted score

# Maximum inference time observed in previous runs; used to normalise the
# inference-time term of the weighted score.
MAX_TIME = 21.121998

In [2]:
def load_features(file_path, map_location="cpu"):
    """Load a saved feature bundle and return its four components.

    Args:
        file_path: Path to a file written with ``torch.save`` that holds a
            mapping with the keys ``image_features``, ``text_features``,
            ``filenames`` and ``labels``.
        map_location: Device onto which stored tensors are mapped. Defaults to
            ``"cpu"`` because the notebook converts the results with
            ``.numpy()``, which raises for tensors living on a CUDA device
            (and the file could not be opened at all on a CPU-only machine if
            it was saved from a GPU).

    Returns:
        Tuple ``(image_features, text_features, filenames, labels)``.

    Raises:
        KeyError: If one of the four expected keys is missing.
    """
    # NOTE: torch.load unpickles the file; only use it on trusted artefacts.
    data = torch.load(file_path, map_location=map_location)
    return data["image_features"], data["text_features"], data["filenames"], data["labels"]


# Load the validation features (the only split this notebook reads); the
# filenames entry of the bundle is discarded into `_`.
val_img_features, val_txt_features, _, val_labels = load_features(VAL)

In [3]:
# Join image and text embeddings along the feature axis, then collapse every
# axis after the sample axis so each sample becomes one row of a NumPy matrix.
X_val = torch.cat([val_img_features, val_txt_features], dim=1)
X_val = X_val.view(X_val.size(0), -1).numpy()
print(f"X_val shape: {X_val.shape}")

# Ground-truth labels as a NumPy array
y_val = val_labels.numpy()

# Fitted preprocessing artefacts (scaler, PCA projection, LIME column indices).
# NOTE: joblib.load unpickles its input; only use it on trusted files.
scaler, pca, lime = (joblib.load(artefact) for artefact in (SCALER, PCA, LIME))

# Apply the scaler, then derive the two reduced views of the scaled matrix:
# the PCA projection and the LIME-selected column subset.
X_val_scaled = scaler.transform(X_val)
X_val_lime = X_val_scaled[:, lime]
X_val_pca = pca.transform(X_val_scaled)

# The benchmark evaluates on the PCA view.
Y_TRUE = y_val
X_TEST = X_val_pca

X_val shape: (1985, 1024)


In [4]:
from classifiers import (
    SVMClassifier, RBFClassifier, RandomForestClassifier, NaiveBayesClassifier, 
    LogisticRegressionClassifier, LDAClassifier, KNNClassifier, DecisionTreeClassifier,
    AdaBoostClassifier, GBMClassifier, XGBoostClassifier
)

from ensembler import EnsemblerClassifier

# Labels for the base classifiers. Positions here line up with `classifiers`
# below and are used to build ensemble names from index combinations.
classifier_names = [
    "SVMClassifier",
    "RBFClassifier",
    "RandomForestClassifier",
    "NaiveBayesClassifier",
    "LogisticRegressionClassifier",
    "LDAClassifier",
    "KNNClassifier",
    "DecisionTreeClassifier",
    "AdaBoostClassifier",
    "GBMClassifier",
    "XGBoostClassifier",
]

# One instance per classifier wrapper, in the same order as the labels above.
classifiers = [
    build()
    for build in (
        SVMClassifier,
        RBFClassifier,
        RandomForestClassifier,
        NaiveBayesClassifier,
        LogisticRegressionClassifier,
        LDAClassifier,
        KNNClassifier,
        DecisionTreeClassifier,
        AdaBoostClassifier,
        GBMClassifier,
        XGBoostClassifier,
    )
]

# Load every wrapper's persisted model once all instances exist.
for wrapper in classifiers:
    wrapper.load()

Loaded model from: models_pca/SVM.joblib
Loaded label encoder from: models_pca/label_encoder.joblib
Loaded model from: models_pca/RBF.joblib
Loaded label encoder from: models_pca/label_encoder.joblib
Loaded model from: models_pca/RandomForest.joblib
Loaded label encoder from: models_pca/label_encoder.joblib
Loaded model from: models_pca/NaiveBayes.joblib
Loaded label encoder from: models_pca/label_encoder.joblib
Loaded model from: models_pca/LogisticRegression.joblib
Loaded label encoder from: models_pca/label_encoder.joblib
Loaded model from: models_pca/LDA.joblib
Loaded label encoder from: models_pca/label_encoder.joblib
Loaded model from: models_pca/KNN.joblib
Loaded label encoder from: models_pca/label_encoder.joblib
Loaded model from: models_pca/DecisionTree.joblib
Loaded label encoder from: models_pca/label_encoder.joblib
Loaded model from: models_pca/AdaBoost.joblib
Loaded label encoder from: models_pca/label_encoder.joblib
Loaded model from: models_pca/GBM.joblib
Loaded label e

In [5]:
from ensembler import FinalWeightGeneratorNN

# Load the trained weight-generator network used by the benchmark loop.
model_path = "final_weight_generator.pth"
if not os.path.exists(model_path):
    # Fail here with a clear message: silently skipping the load would leave
    # `model` undefined and surface later as a NameError in the benchmark loop.
    raise FileNotFoundError(f"Weight-generator checkpoint '{model_path}' not found.")

model = FinalWeightGeneratorNN()
# NOTE: torch.load unpickles the checkpoint; only use it on trusted files.
model.load_state_dict(torch.load(model_path, map_location=DEVICE))
model.to(DEVICE)
# The network is only used for inference below, so switch it to evaluation
# mode (disables training-time behaviour such as dropout, if the model has any).
model.eval()
print("Model loaded successfully.")

Model loaded successfully.


In [6]:
# Parse the stored ensemble results into `data` (one entry per top-level key).
with open('ensemble_results.json', 'r') as results_file:
    data = json.load(results_file)

def process_entry(entry):
    """Split one stored result entry into its metrics and a full-length weight vector.

    The entry stores weights only for activated classifiers. This spreads them
    back over every activation slot: a slot whose activation bit exceeds 0.5
    receives the next stored weight (while any remain); every other slot gets 0.0.

    Args:
        entry: Mapping with the seven metric keys plus ``selected_weights``
            and ``activation_bits``.

    Returns:
        Dict with ``"metrics"`` (the seven metric values) and
        ``"updated_weights"`` (one weight per activation bit).
    """
    metric_names = (
        "accuracy", "recall", "f1_score", "precision",
        "roc_auc", "top5_accuracy", "inference_time",
    )
    metrics = {metric_name: entry[metric_name] for metric_name in metric_names}

    stored_weights = entry["selected_weights"]
    activation_bits = entry["activation_bits"]

    exhausted = object()  # sentinel: no stored weight left to hand out
    pending = iter(stored_weights)
    expanded = []
    for bit in activation_bits:
        candidate = next(pending, exhausted) if bit > 0.5 else exhausted
        expanded.append(0.0 if candidate is exhausted else candidate)

    return {"metrics": metrics, "updated_weights": expanded}

# Processed view of every entry in the results file, keyed like `data`.
results = {result_key: process_entry(record) for result_key, record in data.items()}

# Human-readable dump: metrics followed by the expanded weight vector.
for result_key, summary in results.items():
    print(f"\n--- {result_key.upper()} ---")
    print("Metrics:")
    for metric_name, metric_value in summary["metrics"].items():
        print(f"  {metric_name}: {metric_value}")
    print("Updated Weights:")
    print(summary["updated_weights"])

metric_keys = [
    "accuracy", "recall", "f1_score",
    "precision", "roc_auc", "top5_accuracy",
    "inference_time"
]

# Flat list of metric values: entries in file order, metrics in `metric_keys`
# order within each entry. The benchmark loop later turns this list into the
# metric part of the NN input tensor.
metrics = [record[metric_name] for record in data.values() for metric_name in metric_keys]

print(metrics)


--- ACC ---
Metrics:
  accuracy: 0.6493702770780856
  recall: 0.6493702770780856
  f1_score: 0.6189334693848085
  precision: 0.6173809469340443
  roc_auc: 0.9416404505495262
  top5_accuracy: 0.8997481108312343
  inference_time: 9.356147527694702
Updated Weights:
[0.07719512283802032, 0.16135337948799133, 0.06362076848745346, 0.16351158916950226, 0.17402200400829315, 0.0, 0.0, 0.0, 0.08711770921945572, 0.13290242850780487, 0.14027704298496246]

--- RECALL ---
Metrics:
  accuracy: 0.6337531486146095
  recall: 0.6337531486146095
  f1_score: 0.6142130942594919
  precision: 0.6203917490051634
  roc_auc: 0.9405445894466619
  top5_accuracy: 0.9032745591939546
  inference_time: 8.8707594871521
Updated Weights:
[0.303228497505188, 0.13721773028373718, 0.09513887763023376, 0.23675239086151123, 0.0, 0.08771230280399323, 0.0, 0.03424537926912308, 0.10570481419563293, 0.0, 0.0]

--- F1 ---
Metrics:
  accuracy: 0.6463476070528967
  recall: 0.6463476070528967
  f1_score: 0.6223835033139069
  precisi

In [7]:
import itertools

# Enumerate every non-empty subset of the base classifiers as an equally
# weighted ensemble, ordered by subset size and then by index combination.
all_ensemble_combinations = []
for subset_size in range(1, N_CLASSIFIERS + 1):
    for picked in itertools.combinations(range(N_CLASSIFIERS), subset_size):
        # Fresh list per ensemble so entries never share a weights object.
        equal_weights = [1.0 / subset_size] * subset_size

        all_ensemble_combinations.append({
            # Member labels joined by "_" identify the ensemble.
            "name": "_".join(classifier_names[idx] for idx in picked),
            # (classifier, weight) pairs.
            "members": [(classifiers[idx], share) for idx, share in zip(picked, equal_weights)],
            "weights": equal_weights,
            # Flag for easy identification of single-classifier "ensembles".
            "is_single_clf": subset_size == 1,
        })


In [8]:
def compute_metrics(y_true, y_pred, y_proba, inference_time, labels=None):
    """Compute classification metrics for one set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        y_proba: Per-class scores as a 2-D array (samples x classes), or
            ``None``. The probability-based metrics are only computed when
            this is a 2-D array.
        inference_time: Elapsed prediction time; stored in the result as-is.
        labels: Class labels indexing the columns of ``y_proba``. Defaults to
            the integer ids 1..90 (``np.arange(1, 91)``), the range this
            notebook previously hard-coded.

    Returns:
        Dict with ``accuracy``, macro-averaged ``recall`` / ``f1_score`` /
        ``precision`` and ``inference_time``. When ``y_proba`` is usable it
        also contains ``roc_auc`` (one-vs-rest, weighted average) and
        ``topk_accuracy`` (k = min(5, number of columns)).
    """
    import warnings

    if labels is None:
        labels = np.arange(1, 91)

    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred, average='macro', zero_division=0),
        "f1_score": f1_score(y_true, y_pred, average='macro', zero_division=0),
        "precision": precision_score(y_true, y_pred, average='macro', zero_division=0),
        "inference_time": inference_time
    }

    # AUC and Top-K require probability estimates
    if y_proba is not None and y_proba.ndim == 2:
        try:
            metrics["roc_auc"] = roc_auc_score(
                y_true, y_proba, multi_class='ovr', average='weighted', labels=labels
            )
        except ValueError as exc:
            # Still best-effort (0.0 is recorded), but no longer silent: a
            # zero AUC in the results should be traceable to its cause.
            warnings.warn(
                f"roc_auc could not be computed ({exc}); recording 0.0.",
                RuntimeWarning,
            )
            metrics["roc_auc"] = 0.0

        # Use k=5 unless there are fewer score columns than that
        k_val = min(5, y_proba.shape[1])
        if k_val > 1:
            metrics["topk_accuracy"] = top_k_accuracy_score(y_true, y_proba, k=k_val, labels=labels)
        else:
            # With a single column, top-k is recorded as plain accuracy.
            metrics["topk_accuracy"] = metrics["accuracy"]

    return metrics

def calculate_weighted_score(metrics_dict, weights, max_time=None):
    """Collapse an ensemble's metrics into one weighted score.

    The i-th weight applies to the i-th metric of the fixed order
    ``accuracy, f1_score, precision, recall, roc_auc, topk_accuracy,
    inference_time``. If fewer than seven weights are given, only the leading
    metrics are used. The inference time is converted to a "higher is better"
    value as ``1 - time / max_time`` before weighting.

    Args:
        metrics_dict: Mapping of metric name to value; missing metrics count
            as 0.0.
        weights: 1-D sequence or array of at most seven weights.
        max_time: Normaliser for the inference-time term. Defaults to the
            module-level ``MAX_TIME``.

    Returns:
        The weighted sum as a Python ``float``.

    Raises:
        ValueError: If more weights than known metrics are supplied.
    """
    if max_time is None:
        max_time = MAX_TIME

    weighted_metric_keys = ["accuracy", "f1_score", "precision", "recall", "roc_auc", "topk_accuracy", "inference_time"]
    n_weights = len(weights)
    if n_weights > len(weighted_metric_keys):
        raise ValueError(
            f"Got {n_weights} weights but only {len(weighted_metric_keys)} metrics are defined."
        )
    weighted_metric_keys = weighted_metric_keys[:n_weights]

    metric_values = []
    for key in weighted_metric_keys:
        val = metrics_dict.get(key)
        if val is None:
            # NOTE(review): a missing inference time falls back to 1.0 *time
            # unit*, which scores as 1 - 1/max_time (close to the best
            # possible value). Confirm this is intended rather than a penalty.
            val = 1.0 if key == "inference_time" else 0.0
        if key == "inference_time":
            # NOTE(review): times above max_time yield a negative term; the
            # value is not clamped.
            val = 1 - (val / max_time)
        metric_values.append(val)

    # Simple dot product for the weighted sum of metrics
    weighted_sum = np.dot(np.array(metric_values), weights)

    return float(weighted_sum)

In [9]:
# --- 1. Load Previous Results and Extract Fixed Baselines ---
# The earlier run's file is mandatory: its first loop provides the metrics of
# the uniform ensembles that later loops re-score.
try:
    with open(PREVIOUS_RESULTS_FILE, 'r') as previous_file:
        FINAL_RESULTS = json.load(previous_file)
except FileNotFoundError:
    raise FileNotFoundError(f"Previous results file '{PREVIOUS_RESULTS_FILE}' not found. Cannot proceed.")

# The lexicographically smallest key is taken to be Loop_1.
loop_1_key = sorted(FINAL_RESULTS)[0]
loop_1_results = FINAL_RESULTS[loop_1_key]
print(f"Successfully loaded previous results from {PREVIOUS_RESULTS_FILE} (Using {loop_1_key} for uniform ensemble data).")

# Every loop is executed, so each one gets its own NN result.
START_LOOP_INDEX = 1

# Names of the Loop-1 entries tagged as uniform ensembles (to be re-scored).
uniform_ensemble_names = []
for ensemble_name, ensemble_entry in loop_1_results.items():
    if ensemble_entry.get("type") == "Uniform_Ensemble":
        uniform_ensemble_names.append(ensemble_name)

Successfully loaded previous results from full_benchmark_results.json (Using Loop_1 for uniform ensemble data).


In [10]:
# Main benchmark: for every loop draw one random metric weighting, score the
# uniform ensembles under it (from their stored Loop-1 metrics), then let the
# NN propose an ensemble for that weighting and measure it on X_TEST.
for loop_i in range(START_LOOP_INDEX, BENCHMARK_LOOPS + 1):
    print(f"\n--- Starting Benchmark Loop {loop_i}/{BENCHMARK_LOOPS} ---")
    loop_key = f"Loop_{loop_i}"

    # 1. Generate random input parameters (weights)
    # One row of N_WEIGHTED_METRICS values drawn from numpy's global RNG and
    # normalised to sum to 1; the same row is also the first part of the NN input.
    input_params_cpu = np.random.rand(1, N_WEIGHTED_METRICS).astype(np.float32)
    input_weights = input_params_cpu / input_params_cpu.sum()
    input_params = torch.tensor(input_weights, device=DEVICE)

    # 2. Score Static Uniform Combinations (NO METRICS RECALCULATION)
    if loop_i > 1:
        # Clone and re-score for every loop after the first
        FINAL_RESULTS[loop_key] = {}

        print("Re-scoring uniform ensembles with new random weights...")
        for name in uniform_ensemble_names:
            entry = loop_1_results[name] # Use the fixed metrics from Loop 1

            # Calculate the NEW weighted score using the NEW random weights
            new_weighted_score = calculate_weighted_score(entry["metrics"], input_weights.flatten())

            # Clone the entry but update the weighted score and the weights_applied.
            # The copy is shallow: the nested "metrics" dict stays shared with
            # the Loop-1 entry, only the two keys below are replaced.
            new_entry = entry.copy()
            new_entry["weighted_score"] = new_weighted_score
            new_entry["weights_applied"] = input_weights.flatten().tolist()

            FINAL_RESULTS[loop_key][name] = new_entry

    else: # Loop 1
         # Ensure the loaded Loop 1 entry is updated/available for NN results.
         # NOTE(review): in this branch the uniform ensembles keep the
         # weighted_score stored in the file, while the NN entry below is
         # scored with the weights drawn above - confirm the two weightings match.
         FINAL_RESULTS.setdefault(loop_key, loop_1_results)


    # 3. Benchmarking the FinalWeightGeneratorNN Ensemble

    # `metrics` must still be the flat list of metric values built from
    # ensemble_results.json; it does not change between iterations.
    metrics_tensor = torch.tensor(metrics, dtype=torch.float32, device=DEVICE).unsqueeze(0)

    input_tensor = torch.cat((input_params, metrics_tensor), dim=1)  # Concatenate along the feature dimension


    print(f"\nGenerating and benchmarking NN ensemble for loop {loop_i}...")

    # Wall-clock time of the single forward pass that proposes the ensemble.
    with torch.no_grad():
        start_time_nn = time.time()
        activation, weights_raw = model(input_tensor)
        nn_gen_time = time.time() - start_time_nn

    # Process NN Output to create the ensemble
    activation_bits = activation.detach().cpu().numpy().flatten()
    weight_values = weights_raw.detach().cpu().numpy().flatten()

    # A classifier is selected when its activation exceeds 0.5; if none does,
    # fall back to the single classifier with the highest activation.
    activated_indices = np.where(activation_bits > 0.5)[0]
    if len(activated_indices) == 0:
        activated_indices = [np.argmax(activation_bits)]

    selected_classifiers = [classifiers[i] for i in activated_indices]
    selected_weights = [weight_values[i] for i in activated_indices]

    # Normalize weights so they sum to 1; a non-positive sum falls back to
    # equal weights.
    selected_weights = np.array(selected_weights)
    if selected_weights.sum() > 0:
        selected_weights /= selected_weights.sum()
    else:
        selected_weights = np.ones_like(selected_weights) / len(selected_weights)

    # Create NN-based ensemble
    nn_ensemble_members = list(zip(selected_classifiers, selected_weights))
    nn_ensemble = EnsemblerClassifier(nn_ensemble_members)

    # Benchmark NN-based ensemble.
    # The measured time covers both the classify and the classify_proba call.
    start_time = time.time()
    y_pred_nn = nn_ensemble.classify(X_TEST)
    y_proba_nn = nn_ensemble.classify_proba(X_TEST)
    inference_time = time.time() - start_time

    metrics_nn = compute_metrics(Y_TRUE, y_pred_nn, y_proba_nn, inference_time)
    weighted_score_nn = calculate_weighted_score(metrics_nn, input_weights.flatten())

    # Store the NN result next to this loop's uniform-ensemble entries.
    nn_name = "NN_Generated_Ensemble"
    FINAL_RESULTS[loop_key][nn_name] = {
        "type": "NN_Generated_Ensemble",
        "weights_applied": input_weights.flatten().tolist(),
        "input_metric_features": metrics_tensor.tolist(),
        "selected_classifiers": [type(clf).__name__ for clf in selected_classifiers],
        "selected_weights": selected_weights.tolist(),
        "nn_generation_time": nn_gen_time,
        "metrics": metrics_nn,
        "weighted_score": weighted_score_nn
    }

    print(f"NN Ensemble Weighted Score: {weighted_score_nn:.4f}")


--- Starting Benchmark Loop 1/5 ---

Generating and benchmarking NN ensemble for loop 1...




NN Ensemble Weighted Score: 0.5014

--- Starting Benchmark Loop 2/5 ---
Re-scoring uniform ensembles with new random weights...

Generating and benchmarking NN ensemble for loop 2...




NN Ensemble Weighted Score: 0.7009

--- Starting Benchmark Loop 3/5 ---
Re-scoring uniform ensembles with new random weights...

Generating and benchmarking NN ensemble for loop 3...




NN Ensemble Weighted Score: 0.6491

--- Starting Benchmark Loop 4/5 ---
Re-scoring uniform ensembles with new random weights...

Generating and benchmarking NN ensemble for loop 4...




NN Ensemble Weighted Score: 0.6683

--- Starting Benchmark Loop 5/5 ---
Re-scoring uniform ensembles with new random weights...

Generating and benchmarking NN ensemble for loop 5...
NN Ensemble Weighted Score: 0.6826




In [11]:
# --- 5. Save Results ---
# Written to "full_benchmark_results.json", which is also the value of
# PREVIOUS_RESULTS_FILE: the file read at the start is overwritten.
output_file = "full_benchmark_results.json"
with open(output_file, 'w') as results_out:
    json.dump(FINAL_RESULTS, results_out, indent=4)

print(f"\n✅ Benchmarking complete. Results saved to {output_file}")


✅ Benchmarking complete. Results saved to full_benchmark_results.json


In [12]:
# Per-ensemble averages over all benchmark loops, keyed by ensemble name.
AVERAGE_RESULTS = {}

# The lexicographically first loop key is the structural reference.
loop_1_key = sorted(FINAL_RESULTS)[0]

# Union of the ensemble names found in any loop.
all_ensemble_names = set().union(*FINAL_RESULTS.values())

# The metric names come from the first uniform ensemble of the reference loop;
# if there is none, the NN entry serves as the template instead.
ref_ensemble_key = "NN_Generated_Ensemble"
for candidate_name, candidate_entry in FINAL_RESULTS[loop_1_key].items():
    if candidate_entry.get("type") == "Uniform_Ensemble":
        ref_ensemble_key = candidate_name
        break

# All reference metrics, with the weighted score appended as the last key.
metric_keys_to_average = [*FINAL_RESULTS[loop_1_key][ref_ensemble_key]["metrics"], "weighted_score"]

expected_loop_keys = [f"Loop_{i}" for i in range(1, BENCHMARK_LOOPS + 1)]

for name in sorted(all_ensemble_names):
    scores = {key: [] for key in metric_keys_to_average}

    # Gather this ensemble's values from every loop in which it appears.
    for loop_key in expected_loop_keys:
        if loop_key not in FINAL_RESULTS or name not in FINAL_RESULTS[loop_key]:
            continue
        entry = FINAL_RESULTS[loop_key][name]

        # Plain metrics live under "metrics"; absent ones count as 0.0.
        for key in metric_keys_to_average[:-1]:
            scores[key].append(entry["metrics"].get(key, 0.0))

        # The weighted score sits at the top level of the entry.
        scores["weighted_score"].append(entry["weighted_score"])

    # Mean per key; a key with no collected values averages to 0.0.
    AVERAGE_RESULTS[name] = {
        f"avg_{key}": (np.mean(values) if values else 0.0)
        for key, values in scores.items()
    }

# Ranking: best average weighted score first.
sorted_avg_results = sorted(
    AVERAGE_RESULTS.items(),
    key=lambda pair: pair[1]["avg_weighted_score"],
    reverse=True,
)

In [13]:
# Persist the per-ensemble averages as indented JSON.
average_output_file = "average_benchmark_summary_self_contained.json"
with open(average_output_file, 'w') as summary_out:
    json.dump(AVERAGE_RESULTS, summary_out, indent=4)

print(f"\n✅ Average scores summary saved to {average_output_file}")


✅ Average scores summary saved to average_benchmark_summary_self_contained.json


In [14]:
# --- 6. Print Summary ---
# Ranking table of average weighted scores, best first; the NN row is bold.
print(f"\n--- FINAL AVERAGE SCORES ACROSS ALL {BENCHMARK_LOOPS} LOOPS (Sorted by Weighted Score) ---")
print("-" * 75)
print(f"{'Ensemble':<40} {'Avg. Weighted Score'}")
print("-" * 75)

# The loop variable is deliberately NOT called `metrics`: that module-level
# name holds the flat metric list fed to the NN by the benchmark loop, and
# rebinding it here would break a re-run of that cell.
for name, avg_scores in sorted_avg_results:
    # Explicit separator: names longer than the 40-character column would
    # otherwise run straight into the score.
    row = f"{name:<40} {avg_scores['avg_weighted_score']:.4f}"
    if name == "NN_Generated_Ensemble":
        print(f"\033[1m{row} <--- NN\033[0m")
    else:
        print(row)


--- FINAL AVERAGE SCORES ACROSS ALL 5 LOOPS (Sorted by Weighted Score) ---
---------------------------------------------------------------------------
Ensemble                                Avg. Weighted Score
---------------------------------------------------------------------------
[1mNN_Generated_Ensemble                   0.6405 <--- NN[0m
LogisticRegressionClassifier_LDAClassifier0.5581
LogisticRegressionClassifier_LDAClassifier_AdaBoostClassifier0.5579
RandomForestClassifier_LogisticRegressionClassifier_LDAClassifier_XGBoostClassifier0.5568
RandomForestClassifier_LogisticRegressionClassifier_LDAClassifier_AdaBoostClassifier_XGBoostClassifier0.5568
RandomForestClassifier_LogisticRegressionClassifier_LDAClassifier0.5553
RandomForestClassifier_LogisticRegressionClassifier_LDAClassifier_AdaBoostClassifier0.5548
RandomForestClassifier_LogisticRegressionClassifier_LDAClassifier_KNNClassifier0.5544
LogisticRegressionClassifier_LDAClassifier_XGBoostClassifier0.5543
LogisticRegressio