In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
from sklearn.pipeline import Pipeline
from SafeTransformer import SafeTransformer
import warnings
try:
    from hmeasure import h_score
    hmeasure_available = True
except ImportError:
    hmeasure_available = False

# Silence every warning for the remainder of the session.
# NOTE(review): this also hides any warnings raised while fitting the models.
warnings.filterwarnings('ignore')

# One metrics row (a dict) is appended here per evaluated dataset.
results = []

# Loading recipe for each benchmark, keyed by the name shown in the report.
#   path    - CSV file passed to pd.read_csv
#   target  - name of the label column
#   header  - forwarded to pd.read_csv(header=...); None for a file without
#             a header row, 0 when the first row holds the column names
#   columns - column names to assign after loading, or None to keep the
#             names read from the file
datasets = dict(
    pimaIndianDiabetes=dict(
        path='datasets/pima indians diabetes.csv',
        target='Outcome',
        header=None,
        columns=[
            'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
            'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome',
        ],
    ),
    Heloc=dict(
        path='datasets/heloc.csv',
        target='RiskPerformance',
        header=0,
        columns=None,
    ),
    # NOTE(review): the target name is the same as Heloc's and the path has no
    # 'datasets/' prefix, unlike the other two entries - confirm both against
    # the actual CSV before reusing this config elsewhere.
    BankMarketingUCI=dict(
        path='bank_marketing.csv',
        target='RiskPerformance',
        header=0,
        columns=None,
    ),
)

# Text labels -> 0/1 classes. Defined once instead of on every loop iteration.
_TARGET_LABELS = {'yes': 1, 'no': 0, 'Good': 0, 'Bad': 1}


def _encode_binary_target(y):
    """Return the target series *y* as integer 0/1 labels.

    A numeric target is returned unchanged. Any other dtype (object, pandas
    string dtypes, categorical) is translated through ``_TARGET_LABELS``.

    Raises:
        ValueError: if *y* holds a value that ``_TARGET_LABELS`` does not
            cover. ``Series.map`` would turn such values into NaN, and the
            integer cast that follows would then fail with a message that
            does not name the offending labels.
    """
    # Checking "is it numeric?" rather than "is it object?" also catches the
    # dedicated pandas string dtypes, which do not compare equal to 'object'.
    if pd.api.types.is_numeric_dtype(y):
        return y

    encoded = y.map(_TARGET_LABELS)
    unmapped = encoded.isna()
    if unmapped.any():
        raise ValueError(
            f"Target column contains labels with no 0/1 mapping: "
            f"{sorted(map(str, y[unmapped].unique()))}"
        )
    return encoded.astype(int)


# Fit and score one SAFE + logistic-regression pipeline per configured dataset,
# appending one metrics row to `results` for each.
for name, config in datasets.items():
    print(f"\n Training on: {name}")

    # Load the CSV; assign explicit column names when the config supplies them.
    df = pd.read_csv(config['path'], header=config['header'])
    if config['columns']:
        df.columns = config['columns']
    y = _encode_binary_target(df[config['target']])
    X = df.drop(columns=config['target'])

    # One-hot encode non-numeric features, dropping the first level of each.
    X = pd.get_dummies(X, drop_first=True)

    # Downcast to float32
    X = X.astype(np.float32)

    # Stratified 70/30 split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=42, test_size=0.3
    )

    # Deliberately tiny surrogate (3 depth-1 trees) so the run stays fast.
    surrogate_model = GradientBoostingClassifier(
        n_estimators=3,  # minimal
        max_depth=1,
        learning_rate=0.3,
        random_state=42
    )

    # Fast logistic regression
    log_model = LogisticRegression(max_iter=50, solver='liblinear')

    # Safe transformer - high penalty = few splits
    safe_transformer = SafeTransformer(
        surrogate_model,
        penalty=200,  # fewer splits
        pelt_model='l2',
        no_changepoint_strategy='median'
    )

    # Fresh estimators are built for every dataset, so nothing fitted on one
    # dataset is carried over to the next.
    pipe = Pipeline([
        ('safe', safe_transformer),
        ('logreg', log_model)
    ])
    pipe.fit(X_train, y_train)

    # Hard predictions for accuracy/F1; column 1 of predict_proba for the
    # probability-based metrics.
    y_pred = pipe.predict(X_test)
    y_proba = pipe.predict_proba(X_test)[:, 1]

    # Metrics; H-measure is only computed when the optional package imported.
    results.append({
        'Dataset': name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'AUC': roc_auc_score(y_test, y_proba),
        'Log Loss': log_loss(y_test, y_proba),
        'H-measure': h_score(y_test.to_numpy(), y_proba) if hmeasure_available else "Not installed"
    })

# Display results
print("\n Final Evaluation Summary:\n")
print(pd.DataFrame(results).to_string(index=False))



 Training on: pimaIndianDiabetes

 Training on: Heloc

 Training on: BankMarketingUCI

 Final Evaluation Summary:

           Dataset  Accuracy  F1 Score      AUC  Log Loss  H-measure
pimaIndianDiabetes  0.692641  0.422764 0.784774  0.521972   0.329809
             Heloc  0.651052  0.673621 0.700655  0.629886   0.147809
  BankMarketingUCI  0.887351  0.000000 0.700507  0.318771   0.154448
