In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import glob
import scienceplots
plt.style.use(['science', 'ieee'])
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Configuration
# Directory scanned for the per-dataset result CSV files.
RESULTS_DIR = Path('results')

# Methods under comparison; this order is reused for the report columns.
METHODS = [
    'LASSO_CV',
    'LassoNet',
    'LASSO_QUT',
    'HarderLASSO_QUT',
    'RandomForest',
    'XGBoost',
]

In [2]:
def parse_features(feature_string):
    """Parse a comma-separated feature string into a list of integers.

    Parameters
    ----------
    feature_string : str, scalar, sequence, or missing value
        Typically text such as ``"3, 17.0, 42"``. A missing value
        (``None``/NaN) or an empty/whitespace-only string yields ``[]``.
        A list, tuple, NumPy array or Series is converted element-wise, so
        the function can safely be re-applied to an already-parsed column.

    Returns
    -------
    list of int
        Each token converted via ``int(float(token))``.

    Raises
    ------
    ValueError
        If a non-empty token cannot be interpreted as a number.
    """
    # Sequences must be handled before the scalar check: pd.isna() on a
    # sequence returns an array, whose truth value is ambiguous in an `if`.
    if isinstance(feature_string, (list, tuple, np.ndarray, pd.Series)):
        return [int(float(item)) for item in feature_string]
    if pd.isna(feature_string):
        return []
    # Drop empty tokens so "", "   " and trailing commas ("1,2,") never
    # reach float(''), which raises ValueError.
    tokens = (token.strip() for token in str(feature_string).split(','))
    return [int(float(token)) for token in tokens if token]

def load_simulation_data(results_dir=RESULTS_DIR):
    """Load every ``*.csv`` file in ``results_dir`` into one DataFrame.

    Each file's rows receive a ``dataset`` column derived from the file name
    with ``".csv"`` and ``"_results"`` removed.

    Parameters
    ----------
    results_dir : str or path-like
        Directory scanned (non-recursively) for result CSV files.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all files with a fresh integer index.

    Raises
    ------
    ValueError
        If ``results_dir`` contains no CSV files.
    """
    # glob returns matches in arbitrary, filesystem-dependent order; sorting
    # makes the row order of the combined frame reproducible.
    file_paths = sorted(glob.glob(os.path.join(results_dir, "*.csv")))
    if not file_paths:
        # pd.concat([]) would raise ValueError as well, but its message does
        # not say which directory turned out to be empty.
        raise ValueError(f"No result CSV files found in '{results_dir}'")

    frames = []
    for file_path in file_paths:
        dataset = os.path.basename(file_path).replace(".csv", "").replace("_results", "")
        # read_csv already returns a fresh frame, so no defensive copy is needed.
        frames.append(pd.read_csv(file_path).assign(dataset=dataset))

    return pd.concat(frames, ignore_index=True)

# Read every result file from the default results directory.
print("Loading simulation data...")
results_df = load_simulation_data()
print(
    f"Loaded {results_df.shape[0]} result rows "
    f"across {results_df['dataset'].nunique()} datasets"
)

Loading simulation data...
Loaded 700 result rows across 14 datasets


In [3]:
# Parse selected features for each method
print("Parsing selected features...")
for method in METHODS:
    feature_col = f'{method}_features'
    if feature_col not in results_df.columns:
        # Report the gap instead of skipping silently: the metrics step
        # indexes this column unconditionally.
        print(f"✗ {method} (no '{feature_col}' column)")
        continue
    # The column is overwritten in place, so on a re-run it already holds
    # lists. Pass those through untouched to keep this cell idempotent.
    results_df[feature_col] = results_df[feature_col].apply(
        lambda value: value if isinstance(value, list) else parse_features(value)
    )
    print(f"✓ {method}")

Parsing selected features...
✓ LASSO_CV
✓ LassoNet
✓ LASSO_QUT
✓ HarderLASSO_QUT
✓ RandomForest
✓ XGBoost


In [4]:
def calculate_metrics_for_method(results_df, method, datasets_dir="datasets"):
    """Build one metrics record per simulation row for ``method``.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Must contain the columns ``dataset``, ``simulation``,
        ``{method}_features`` (a sized collection per row) and
        ``{method}_acc``.
    method : str
        Method name used as the column prefix.
    datasets_dir : str or path-like, default "datasets"
        Directory holding ``<dataset>.csv``. The feature count of a dataset
        is its number of CSV columns minus one (presumably the target
        column; confirm against the dataset files).

    Returns
    -------
    pandas.DataFrame
        Columns ``simulation``, ``method``, ``acc``, ``n_selected``,
        ``frac_selected`` and ``dataset``, one row per row of ``results_df``.
    """
    feature_col = f'{method}_features'
    acc_col = f'{method}_acc'
    records = []

    for dataset in results_df['dataset'].unique():
        dataset_results = results_df[results_df['dataset'] == dataset]

        # Only the column count is needed, so parse just the header row
        # (nrows=0) instead of loading the whole dataset.
        header = pd.read_csv(Path(datasets_dir) / f"{dataset}.csv", nrows=0)
        dataset_feature_count = header.shape[1] - 1

        # Walk the three needed columns in lockstep, one record per simulation.
        for simulation, selected, acc in zip(
            dataset_results['simulation'],
            dataset_results[feature_col],
            dataset_results[acc_col],
        ):
            n_selected = len(selected)
            records.append({
                'simulation': simulation,
                'method': method,
                'acc': acc,
                'n_selected': n_selected,
                'frac_selected': n_selected / dataset_feature_count,
                'dataset': dataset,
            })

    return pd.DataFrame(records)

# Build the per-method metric frames, reporting progress on one line each.
print("Calculating metrics for all methods...")
all_metrics = []
for method in METHODS:
    print(f"Processing {method}...", end=' ')
    method_metrics = calculate_metrics_for_method(results_df, method)
    print(f"✓ ({method_metrics.shape[0]} records)")
    all_metrics += [method_metrics]

# Stack the per-method frames into a single long table.
metrics_df = pd.concat(all_metrics, ignore_index=True)
print(f"\nTotal metrics calculated: {metrics_df.shape[0]} records")

Calculating metrics for all methods...
Processing LASSO_CV... ✓ (700 records)
Processing LassoNet... ✓ (700 records)
Processing LASSO_QUT... ✓ (700 records)
Processing HarderLASSO_QUT... ✓ (700 records)
Processing RandomForest... ✓ (700 records)
Processing XGBoost... ✓ (700 records)

Total metrics calculated: 4200 records


In [5]:
# Mean and standard deviation of each metric per (dataset, method) pair.
print("Calculating summary statistics...")
summary_spec = {
    f'{prefix}_{stat}': (column, stat)
    for prefix, column in (
        ('acc', 'acc'),
        ('nsel', 'n_selected'),
        ('fracsel', 'frac_selected'),
    )
    for stat in ('mean', 'std')
}
summary_df = (
    metrics_df
    .groupby(['dataset', 'method'])
    .agg(**summary_spec)
    .reset_index()
)
print(summary_df.head())

Calculating summary statistics...
  dataset           method  acc_mean   acc_std  nsel_mean  nsel_std  \
0    aids  HarderLASSO_QUT  0.880926  0.017047       3.16  0.618095   
1    aids         LASSO_CV  0.864095  0.009739      21.24  1.505907   
2    aids        LASSO_QUT  0.864208  0.023182       2.80  0.925820   
3    aids         LassoNet  0.876522  0.011751      22.12  2.959868   
4    aids     RandomForest  0.848163  0.014867       4.20  0.782461   

   fracsel_mean  fracsel_std  
0      0.137391     0.026874  
1      0.923478     0.065474  
2      0.121739     0.040253  
3      0.961739     0.128690  
4      0.182609     0.034020  


In [6]:
# Show one "mean ± std" table per dataset, methods as columns.
for dataset in summary_df['dataset'].unique():
    df_subset = summary_df[summary_df['dataset'] == dataset].set_index('method')

    # Accuracy as percentage with mean ± std.
    # Format with a fixed two decimals: round(2).astype(str) drops trailing
    # zeros ("1.7", "2.8"), which is inconsistent across cells of the table.
    acc_report = (
        (df_subset['acc_mean'] * 100).map('{:.2f}'.format)
        + " ± " +
        (df_subset['acc_std'] * 100).map('{:.2f}'.format)
    )

    # n_selected stays absolute count
    nsel_report = (
        df_subset['nsel_mean'].map('{:.2f}'.format)
        + " ± " +
        df_subset['nsel_std'].map('{:.2f}'.format)
    )

    table = pd.DataFrame({
        'Accuracy (%)': acc_report,
        'n_selected': nsel_report,
    }).T

    # Enforce method order
    table = table.reindex(columns=[m for m in METHODS if m in table.columns])

    print(f"\nDataset: {dataset}")
    display(table)


Dataset: aids


method,LASSO_CV,LassoNet,LASSO_QUT,HarderLASSO_QUT,RandomForest,XGBoost
Accuracy (%),86.41 ± 0.97,87.65 ± 1.18,86.42 ± 2.32,88.09 ± 1.7,84.82 ± 1.49,88.79 ± 1.15
n_selected,21.24 ± 1.51,22.12 ± 2.96,2.8 ± 0.93,3.16 ± 0.62,4.2 ± 0.78,6.62 ± 1.09



Dataset: bankruptcy


method,LASSO_CV,LassoNet,LASSO_QUT,HarderLASSO_QUT,RandomForest,XGBoost
Accuracy (%),96.82 ± 0.14,96.78 ± 0.12,96.72 ± 0.19,96.76 ± 0.14,97.01 ± 0.19,96.68 ± 0.28
n_selected,20.48 ± 15.44,50.26 ± 38.04,6.74 ± 1.27,2.74 ± 0.94,21.66 ± 2.0,10.08 ± 2.15



Dataset: basehock


method,LASSO_CV,LassoNet,LASSO_QUT,HarderLASSO_QUT,RandomForest,XGBoost
Accuracy (%),97.71 ± 0.55,95.88 ± 0.8,83.25 ± 13.82,84.71 ± 7.91,94.58 ± 0.85,94.82 ± 0.95
n_selected,3839.9 ± 882.08,4247.6 ± 1538.37,26.68 ± 4.97,13.72 ± 2.59,50.84 ± 4.97,54.42 ± 4.57



Dataset: breast_cancer


method,LASSO_CV,LassoNet,LASSO_QUT,HarderLASSO_QUT,RandomForest,XGBoost
Accuracy (%),96.98 ± 1.03,96.92 ± 1.01,95.53 ± 8.46,94.33 ± 1.31,95.65 ± 1.29,95.67 ± 1.31
n_selected,15.94 ± 6.81,25.06 ± 6.91,4.24 ± 0.62,2.06 ± 0.24,21.52 ± 1.15,9.24 ± 1.44



Dataset: coil20


method,LASSO_CV,LassoNet,LASSO_QUT,HarderLASSO_QUT,RandomForest,XGBoost
Accuracy (%),98.08 ± 0.72,97.93 ± 1.0,91.21 ± 2.41,89.15 ± 2.62,99.82 ± 0.23,98.32 ± 0.69
n_selected,1021.7 ± 8.14,1012.04 ± 38.99,29.16 ± 4.63,9.38 ± 1.03,946.86 ± 3.61,114.74 ± 23.38



Dataset: dna


method,LASSO_CV,LassoNet,LASSO_QUT,HarderLASSO_QUT,RandomForest,XGBoost
Accuracy (%),95.54 ± 0.49,95.05 ± 0.67,93.68 ± 0.89,91.51 ± 1.72,95.54 ± 0.59,94.24 ± 0.66
n_selected,106.1 ± 44.62,60.88 ± 20.56,11.44 ± 0.86,8.54 ± 0.95,84.52 ± 3.65,15.7 ± 2.23



Dataset: dry_bean


method,LASSO_CV,LassoNet,LASSO_QUT,HarderLASSO_QUT,RandomForest,XGBoost
Accuracy (%),92.44 ± 0.37,92.29 ± 0.35,92.28 ± 0.41,92.25 ± 0.48,92.24 ± 0.33,92.63 ± 0.39
n_selected,15.96 ± 0.2,16.0 ± 0.0,8.68 ± 0.84,5.86 ± 0.35,16.0 ± 0.0,13.0 ± 0.0



Dataset: har


method,LASSO_CV,LassoNet,LASSO_QUT,HarderLASSO_QUT,RandomForest,XGBoost
Accuracy (%),98.24 ± 0.23,97.67 ± 0.25,96.09 ± 0.46,92.97 ± 0.78,97.75 ± 0.28,98.4 ± 0.28
n_selected,555.06 ± 18.57,560.6 ± 2.56,32.2 ± 4.22,8.6 ± 1.11,468.32 ± 3.43,61.88 ± 6.51



Dataset: isolet


method,LASSO_CV,LassoNet,LASSO_QUT,HarderLASSO_QUT,RandomForest,XGBoost
Accuracy (%),95.91 ± 0.36,94.35 ± 0.52,83.13 ± 1.4,81.32 ± 1.72,94.39 ± 0.35,92.66 ± 0.62
n_selected,615.0 ± 10.77,616.96 ± 0.28,35.44 ± 2.63,15.24 ± 1.61,603.22 ± 3.24,98.1 ± 9.08



Dataset: pcmac


method,LASSO_CV,LassoNet,LASSO_QUT,HarderLASSO_QUT,RandomForest,XGBoost
Accuracy (%),90.02 ± 0.92,89.85 ± 1.23,80.33 ± 9.58,81.68 ± 7.51,84.77 ± 1.4,89.19 ± 1.28
n_selected,1907.02 ± 698.16,264.22 ± 49.55,18.08 ± 2.64,11.82 ± 2.85,14.38 ± 1.75,31.04 ± 5.11



Dataset: spambase


method,LASSO_CV,LassoNet,LASSO_QUT,HarderLASSO_QUT,RandomForest,XGBoost
Accuracy (%),92.33 ± 0.61,93.49 ± 0.61,90.53 ± 7.66,91.39 ± 4.56,94.52 ± 0.54,94.76 ± 0.47
n_selected,55.4 ± 2.29,56.58 ± 1.92,24.9 ± 0.81,16.46 ± 2.78,19.12 ± 0.96,22.9 ± 1.49



Dataset: statlog


method,LASSO_CV,LassoNet,LASSO_QUT,HarderLASSO_QUT,RandomForest,XGBoost
Accuracy (%),85.83 ± 0.62,87.96 ± 0.66,85.66 ± 0.66,84.96 ± 0.86,91.21 ± 0.58,91.36 ± 0.56
n_selected,36.0 ± 0.0,36.0 ± 0.0,22.38 ± 1.63,4.14 ± 0.64,36.0 ± 0.0,35.72 ± 0.54



Dataset: usps


method,LASSO_CV,LassoNet,LASSO_QUT,HarderLASSO_QUT,RandomForest,XGBoost
Accuracy (%),94.05 ± 0.37,95.13 ± 0.39,90.64 ± 0.58,88.05 ± 1.19,96.31 ± 0.3,95.59 ± 0.37
n_selected,251.48 ± 2.47,256.0 ± 0.0,53.88 ± 2.54,17.52 ± 2.54,212.4 ± 1.28,108.98 ± 4.61



Dataset: wine


method,LASSO_CV,LassoNet,LASSO_QUT,HarderLASSO_QUT,RandomForest,XGBoost
Accuracy (%),97.27 ± 1.68,97.13 ± 2.1,95.97 ± 2.43,89.57 ± 3.73,97.7 ± 1.65,96.07 ± 2.69
n_selected,10.2 ± 1.44,11.5 ± 1.76,4.9 ± 0.36,2.16 ± 0.37,13.0 ± 0.0,7.0 ± 1.6


In [8]:
# Method groups; ORDER is their concatenation and fixes the left-to-right
# order of the cells in each printed row.
LINEAR = ['LASSO_CV']
ANN = ['LassoNet', 'LASSO_QUT', 'HarderLASSO_QUT']
ENSEMBLE = ['RandomForest', 'XGBoost']
ORDER = [*LINEAR, *ANN, *ENSEMBLE]

def cell(mean_sel, std_sel, mean_acc):
    """Format one table entry as ``"<mean_sel> (<mean_acc in percent>)"``.

    Both numbers are rendered with two decimals; ``mean_acc`` is multiplied
    by 100 first. ``std_sel`` is accepted but does not appear in the output.
    """
    return f"{mean_sel:.2f} ({mean_acc * 100:.2f})"

def print_compact_table(summary_df):
    """Print one ``&``-separated, ``\\\\``-terminated row per dataset.

    For each dataset in ``summary_df`` the row starts with the dataset name
    (underscores replaced by spaces), followed by one entry per method in
    ``ORDER``. Each entry is produced by ``cell`` from that method's
    ``nsel_mean``, ``nsel_std`` and ``acc_mean``; a method with no summary
    row gets a ``\\text{--}`` placeholder instead.
    """
    placeholder = r'\text{--}'
    for dataset_name in summary_df['dataset'].unique():
        per_method = summary_df[summary_df['dataset'] == dataset_name].set_index('method')

        entries = [
            cell(per_method.loc[name, 'nsel_mean'],
                 per_method.loc[name, 'nsel_std'],
                 per_method.loc[name, 'acc_mean'])
            if name in per_method.index else placeholder
            for name in ORDER
        ]
        label = dataset_name.replace("_", " ")
        print(label + " & " + " & ".join(entries) + r" \\")


# Print the count-and-accuracy rows for every dataset in summary_df.
print_compact_table(summary_df)

aids & 21.24 (86.41) & 22.12 (87.65) & 2.80 (86.42) & 3.16 (88.09) & 4.20 (84.82) & 6.62 (88.79) \\
bankruptcy & 20.48 (96.82) & 50.26 (96.78) & 6.74 (96.72) & 2.74 (96.76) & 21.66 (97.01) & 10.08 (96.68) \\
basehock & 3839.90 (97.71) & 4247.60 (95.88) & 26.68 (83.25) & 13.72 (84.71) & 50.84 (94.58) & 54.42 (94.82) \\
breast cancer & 15.94 (96.98) & 25.06 (96.92) & 4.24 (95.53) & 2.06 (94.33) & 21.52 (95.65) & 9.24 (95.67) \\
coil20 & 1021.70 (98.08) & 1012.04 (97.93) & 29.16 (91.21) & 9.38 (89.15) & 946.86 (99.82) & 114.74 (98.32) \\
dna & 106.10 (95.54) & 60.88 (95.05) & 11.44 (93.68) & 8.54 (91.51) & 84.52 (95.54) & 15.70 (94.24) \\
dry bean & 15.96 (92.44) & 16.00 (92.29) & 8.68 (92.28) & 5.86 (92.25) & 16.00 (92.24) & 13.00 (92.63) \\
har & 555.06 (98.24) & 560.60 (97.67) & 32.20 (96.09) & 8.60 (92.97) & 468.32 (97.75) & 61.88 (98.40) \\
isolet & 615.00 (95.91) & 616.96 (94.35) & 35.44 (83.13) & 15.24 (81.32) & 603.22 (94.39) & 98.10 (92.66) \\
pcmac & 1907.02 (90.02) & 264.22 (89

In [9]:
# Method groups (same values as defined in the previous cell); ORDER is their
# concatenation and fixes the left-to-right order of the cells in each row.
LINEAR = ['LASSO_CV']
ANN = ['LassoNet', 'LASSO_QUT', 'HarderLASSO_QUT']
ENSEMBLE = ['RandomForest', 'XGBoost']
ORDER = [*LINEAR, *ANN, *ENSEMBLE]

def cell(mean_sel, std_sel, mean_acc):
    r"""Format one table entry as ``"<mean_sel> $\pm$ <std_sel>"``.

    Both numbers are rendered with two decimals. ``mean_acc`` is accepted
    but does not appear in the output.

    Note: this definition replaces the ``cell`` defined in the previous
    notebook cell, which renders the mean count followed by the accuracy.
    """
    # Raw f-string: "\p" is not a valid escape sequence, so a non-raw literal
    # only works by accident and triggers an invalid-escape warning.
    return rf"{mean_sel:.2f} $\pm$ {std_sel:.2f}"

def print_compact_table(summary_df):
    """Print one ``&``-separated, ``\\\\``-terminated row per dataset.

    For each dataset in ``summary_df`` the row starts with the dataset name
    (underscores replaced by spaces), followed by one entry per method in
    ``ORDER``. Each entry is produced by ``cell`` from that method's
    ``nsel_mean``, ``nsel_std`` and ``acc_mean``; a method with no summary
    row gets a ``\\text{--}`` placeholder instead.
    """
    placeholder = r'\text{--}'
    for dataset_name in summary_df['dataset'].unique():
        per_method = summary_df[summary_df['dataset'] == dataset_name].set_index('method')

        entries = [
            cell(per_method.loc[name, 'nsel_mean'],
                 per_method.loc[name, 'nsel_std'],
                 per_method.loc[name, 'acc_mean'])
            if name in per_method.index else placeholder
            for name in ORDER
        ]
        label = dataset_name.replace("_", " ")
        print(label + " & " + " & ".join(entries) + r" \\")


# Print the mean ± std selected-feature rows for every dataset in summary_df.
print_compact_table(summary_df)

aids & 21.24 $\pm$ 1.51 & 22.12 $\pm$ 2.96 & 2.80 $\pm$ 0.93 & 3.16 $\pm$ 0.62 & 4.20 $\pm$ 0.78 & 6.62 $\pm$ 1.09 \\
bankruptcy & 20.48 $\pm$ 15.44 & 50.26 $\pm$ 38.04 & 6.74 $\pm$ 1.27 & 2.74 $\pm$ 0.94 & 21.66 $\pm$ 2.00 & 10.08 $\pm$ 2.15 \\
basehock & 3839.90 $\pm$ 882.08 & 4247.60 $\pm$ 1538.37 & 26.68 $\pm$ 4.97 & 13.72 $\pm$ 2.59 & 50.84 $\pm$ 4.97 & 54.42 $\pm$ 4.57 \\
breast cancer & 15.94 $\pm$ 6.81 & 25.06 $\pm$ 6.91 & 4.24 $\pm$ 0.62 & 2.06 $\pm$ 0.24 & 21.52 $\pm$ 1.15 & 9.24 $\pm$ 1.44 \\
coil20 & 1021.70 $\pm$ 8.14 & 1012.04 $\pm$ 38.99 & 29.16 $\pm$ 4.63 & 9.38 $\pm$ 1.03 & 946.86 $\pm$ 3.61 & 114.74 $\pm$ 23.38 \\
dna & 106.10 $\pm$ 44.62 & 60.88 $\pm$ 20.56 & 11.44 $\pm$ 0.86 & 8.54 $\pm$ 0.95 & 84.52 $\pm$ 3.65 & 15.70 $\pm$ 2.23 \\
dry bean & 15.96 $\pm$ 0.20 & 16.00 $\pm$ 0.00 & 8.68 $\pm$ 0.84 & 5.86 $\pm$ 0.35 & 16.00 $\pm$ 0.00 & 13.00 $\pm$ 0.00 \\
har & 555.06 $\pm$ 18.57 & 560.60 $\pm$ 2.56 & 32.20 $\pm$ 4.22 & 8.60 $\pm$ 1.11 & 468.32 $\pm$ 3.43 & 61.88 $\p