# HarderLASSO: Comprehensive Examples

This notebook demonstrates the capabilities of the HarderLASSO library for neural network-based feature selection across different machine learning tasks.

## Table of Contents
1. [Setup and Imports](#setup)
2. [Regression Examples](#regression)
3. [Classification Examples](#classification)
4. [Survival Analysis Examples](#survival)

## 1. Setup and Imports {#setup}

In [1]:
# Core imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.datasets import load_digits, load_breast_cancer, make_regression, make_classification

# HarderLASSO imports
from HarderLASSO import HarderLASSORegressor, HarderLASSOClassifier, HarderLASSOCox

# Survival analysis
from lifelines.datasets import load_rossi, load_kidney_transplant

print("Setup complete!")

Setup complete!


## 2. Regression Examples {#regression}

### 2.1 Synthetic High-Dimensional Data

In [2]:
# Generate high-dimensional synthetic data
n_samples, n_features = 200, 100
n_informative = 10


print(f"Generating synthetic regression data:")
print(f"- Samples: {n_samples}")
print(f"- Features: {n_features}")
print(f"- Informative features: {n_informative}")

X= np.random.normal(size=(n_samples, n_features))
features = np.arange(n_informative)
beta = 3*np.ones(n_informative)
y = X[:, features]@beta + np.random.normal(size=n_samples)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print(f"\nTraining set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

Generating synthetic regression data:
- Samples: 200
- Features: 100
- Informative features: 10

Training set: (140, 100)
Test set: (60, 100)


In [3]:
# Fit HarderLASSO Regressor
print("Training HarderLASSO Regressor...")

model_reg = HarderLASSORegressor(
    hidden_dims=(20, ),  # One hidden layers
    penalty='harder'
)

model_reg.fit(X_train, y_train, verbose=False)

# Make predictions
y_pred_train = model_reg.predict(X_train)
y_pred_test = model_reg.predict(X_test)

# Evaluate performance
train_metrics = model_reg.score(X_train, y_train)
test_metrics = model_reg.score(X_test, y_test)

print(f"\n=== Regression Results ===")
print(f"Selected features: {len(model_reg.selected_features_indices_)} / {n_features}")
print(f"Lambda QUT: {model_reg.lambda_qut_:.4f}")
print(f"Train R²: {train_metrics['R2']:.4f}")
print(f"Test R²: {test_metrics['R2']:.4f}")
print(f"Train MSE: {train_metrics['MSE']:.4f}")
print(f"Test MSE: {test_metrics['MSE']:.4f}")
print(f"Selected feature indices: {model_reg.selected_features_indices_}")

Training HarderLASSO Regressor...

=== Regression Results ===
Selected features: 10 / 100
Lambda QUT: 3.4477
Train R²: 0.9896
Test R²: 0.9851
Train MSE: 0.9701
Test MSE: 1.0853
Selected feature indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


### 2.2 Comparison with Different Penalties

In [4]:
# Compare different penalty functions
penalties = ['harder', 'l1', 'scad']
results = {}

print("Comparing different penalty functions...")

for penalty in penalties:
    print(f"\nTraining with {penalty} penalty...")

    model = HarderLASSORegressor(
        hidden_dims=(20,),
        penalty=penalty
    )

    model.fit(X_train, y_train, verbose=False)

    # Evaluate
    test_score = model.score(X_test, y_test)
    n_selected = len(model.selected_features_indices_)

    results[penalty] = {
        'test_r2': test_score['R2'],
        'n_features': n_selected,
        'lambda': model.lambda_qut_
    }

    print(f"  Test R²: {test_score['R2']:.4f}")
    print(f"  Selected features: {n_selected}")
    print(f"  Lambda: {model.lambda_qut_:.4f}")

# Create comparison DataFrame
comparison_df = pd.DataFrame(results).T
print("\n=== Penalty Comparison ===")
print(comparison_df.round(4))

Comparing different penalty functions...

Training with harder penalty...
  Test R²: 0.9859
  Selected features: 10
  Lambda: 3.4060

Training with l1 penalty...
  Test R²: -12.8899
  Selected features: 10
  Lambda: 3.4367

Training with scad penalty...
  Test R²: 0.9893
  Selected features: 10
  Lambda: 3.4213

=== Penalty Comparison ===
        test_r2  n_features  lambda
harder   0.9859        10.0  3.4060
l1     -12.8899        10.0  3.4367
scad     0.9893        10.0  3.4213


## 3. Classification Examples {#classification}

### 3.1 Digit Recognition

In [5]:
# Load digits dataset
print("Loading digits dataset...")
X_digits, y_digits = load_digits(return_X_y=True)

print(f"Dataset shape: {X_digits.shape}")
print(f"Number of classes: {len(np.unique(y_digits))}")
print(f"Class distribution: {np.bincount(y_digits)}")

# Split the data
X_train_dig, X_test_dig, y_train_dig, y_test_dig = train_test_split(
    X_digits, y_digits, test_size=0.2, stratify=y_digits
)

print(f"\nTraining set: {X_train_dig.shape}")
print(f"Test set: {X_test_dig.shape}")

Loading digits dataset...
Dataset shape: (1797, 64)
Number of classes: 10
Class distribution: [178 182 177 183 181 182 181 179 174 180]

Training set: (1437, 64)
Test set: (360, 64)


In [6]:
# Train HarderLASSO Classifier
print("Training HarderLASSO Classifier...")

model_clf = HarderLASSOClassifier(
    hidden_dims=(20, ),
    penalty='harder'
)

model_clf.fit(X_train_dig, y_train_dig, verbose=False)

train_metrics = model_clf.score(X_train_dig, y_train_dig)
test_metrics = model_clf.score(X_test_dig, y_test_dig)

print(f"\n=== Classification Results ===")
print(f"Selected features: {len(model_clf.selected_features_indices_)} / {X_digits.shape[1]}")
print(f"Lambda QUT: {model_clf.lambda_qut_:.4f}")
print(f"Train accuracy: {train_metrics['accuracy']:.4f}")
print(f"Test accuracy: {test_metrics['accuracy']:.4f}")
print(f"Feature reduction: {(1 - len(model_clf.selected_features_indices_)/X_digits.shape[1]):.1%}")

Training HarderLASSO Classifier...

=== Classification Results ===
Selected features: 9 / 64
Lambda QUT: 168.6994
Train accuracy: 0.9506
Test accuracy: 0.9417
Feature reduction: 85.9%


### 3.2 Breast Cancer Classification

In [7]:
# Load digits dataset
print("Loading digits dataset...")
X_breast, y_breast = load_breast_cancer(return_X_y=True)

print(f"Dataset shape: {X_breast.shape}")
print(f"Number of classes: {len(np.unique(y_breast))}")
print(f"Class distribution: {np.bincount(y_breast)}")

# Split the data
X_train_breast, X_test_breast, y_train_breast, y_test_breast = train_test_split(
    X_breast, y_breast, test_size=0.2, stratify=y_breast
)

print(f"\nTraining set: {X_train_breast.shape}")
print(f"Test set: {X_test_breast.shape}")

Loading digits dataset...
Dataset shape: (569, 30)
Number of classes: 2
Class distribution: [212 357]

Training set: (455, 30)
Test set: (114, 30)


In [8]:
# Train classifier with feature names
print("Training classifier on breast cancer data...")

model_cancer = HarderLASSOClassifier(
    hidden_dims=(20, 10),
    penalty='harder'
)

model_cancer.fit(X_train_breast, y_train_breast, verbose=False)

# Evaluate
test_acc_cancer = model_cancer.score(X_test_breast, y_test_breast)['accuracy']
selected_features_cancer = model_cancer.selected_features_

print(f"\n=== Breast Cancer Classification ===")
print(f"Test accuracy: {test_acc_cancer:.4f}")
print(f"Selected features: {len(selected_features_cancer)} / {X.shape[1]}")
print(f"\nSelected feature names:")
for i, feature in enumerate(selected_features_cancer[:10]):
    print(f"  {i+1}. {feature}")
if len(selected_features_cancer) > 10:
    print(f"  ... and {len(selected_features_cancer) - 10} more")

Training classifier on breast cancer data...

=== Breast Cancer Classification ===
Test accuracy: 0.9825
Selected features: 3 / 100

Selected feature names:
  1. feature_20
  2. feature_21
  3. feature_27


## 4. Survival Analysis Examples {#survival}

### 4.1 Rossi Recidivism Dataset

In [9]:
# Load Rossi dataset
print("Loading Rossi recidivism dataset...")
df_rossi = load_rossi()

print(f"Dataset shape: {df_rossi.shape}")
print(f"Columns: {list(df_rossi.columns)}")
print(f"\nFirst few rows:")
print(df_rossi.head())

# Prepare data
X_rossi = df_rossi.drop(columns=['week', 'arrest'])
time_rossi = df_rossi['week']
event_rossi = df_rossi['arrest']

print(f"\nSurvival data summary:")
print(f"Number of events: {event_rossi.sum()} / {len(event_rossi)} ({event_rossi.mean():.1%})")
print(f"Median follow-up time: {time_rossi.median():.1f} weeks")
print(f"Features: {list(X_rossi.columns)}")

Loading Rossi recidivism dataset...
Dataset shape: (432, 9)
Columns: ['week', 'arrest', 'fin', 'age', 'race', 'wexp', 'mar', 'paro', 'prio']

First few rows:
   week  arrest  fin  age  race  wexp  mar  paro  prio
0    20       1    0   27     1     0    0     1     3
1    17       1    0   18     1     0    0     1     8
2    25       1    0   19     0     1    0     1    13
3    52       0    1   23     1     1    1     1     1
4    52       0    0   19     0     1    0     1     3

Survival data summary:
Number of events: 114 / 432 (26.4%)
Median follow-up time: 52.0 weeks
Features: ['fin', 'age', 'race', 'wexp', 'mar', 'paro', 'prio']


In [11]:
# Train HarderLASSO Cox model
print("Training HarderLASSO Cox model...")

model_cox = HarderLASSOCox(
    hidden_dims=None, # Linear model used
    penalty='harder'
)

model_cox.fit(X_rossi, (time_rossi, event_rossi), verbose=False)

# Evaluate model
concordance_index = model_cox.score(X_rossi, (time_rossi, event_rossi))['C-index']
selected_features_cox = model_cox.selected_features_

print(f"\n=== Cox Regression Results ===")
print(f"Concordance index: {concordance_index:.4f}")
print(f"Selected features: {len(selected_features_cox)} / {X_rossi.shape[1]}")
print(f"Lambda QUT: {model_cox.lambda_qut_:.4f}")

print(f"\nSelected features:")
for feature in selected_features_cox:
    print(f"  - {feature}")

# Show coefficients for selected features
if hasattr(model_cox, 'coef_') and len(selected_features_cox) > 0:
    print(f"\nCoefficients (hazard ratios):")
    selected_indices = model_cox.selected_features_indices_
    coefficients = model_cox.coef_[selected_indices]

    for feature, coef in zip(selected_features_cox, coefficients):
        hr = np.exp(coef)
        print(f"  {feature}: {coef:.3f} (HR: {hr:.3f})")

Training HarderLASSO Cox model...

=== Cox Regression Results ===
Concordance index: 0.6332
Selected features: 2 / 7
Lambda QUT: 0.5805

Selected features:
  - age
  - prio

Coefficients (hazard ratios):
  age: -0.421 (HR: 0.657)
  prio: 0.272 (HR: 1.313)
