In [2]:
# !pip install openml xgboost scikit-learn

In [15]:
############################################################
# 5. Unlearning logic
############################################################

def compute_grad_hess(booster, X_remove, y_remove):
    dremove = xgb.DMatrix(X_remove, label=y_remove, enable_categorical=True)
    raw_preds = booster.predict(dremove, output_margin=True)
    p = 1.0 / (1.0 + np.exp(-raw_preds))
    grad = p - y_remove
    hess = p * (1.0 - p)
    return grad, hess


def unlearn_points(booster, metadata, X_remove, y_remove, lambda_reg=1.0):
    grad, hess = compute_grad_hess(booster, X_remove, y_remove)
    leaf_indices = booster.predict(xgb.DMatrix(X_remove), pred_leaf=True)
    updated_metadata = []
    for tree_idx, tree_md in enumerate(metadata):
        new_tree_md = []
        leaf_grad_sum = np.zeros(len(tree_md))
        leaf_hess_sum = np.zeros(len(tree_md))
        for sample_idx, leaf_idx in enumerate(leaf_indices[:, tree_idx]):
            leaf_grad_sum[leaf_idx] += grad[sample_idx]
            leaf_hess_sum[leaf_idx] += hess[sample_idx]
        for leaf_idx, leaf_stats in enumerate(tree_md):
            leaf_copy = leaf_stats.copy()
            leaf_copy['sum_grad'] -= leaf_grad_sum[leaf_idx]
            leaf_copy['sum_hess'] -= leaf_hess_sum[leaf_idx]
            denom = leaf_copy['sum_hess'] + lambda_reg
            if denom <= 1e-12:
                leaf_copy['leaf_value'] = 0.0
            else:
                leaf_copy['leaf_value'] = -leaf_copy['sum_grad'] / denom
            new_tree_md.append(leaf_copy)
        updated_metadata.append(new_tree_md)
    return updated_metadata


def update_booster_with_metadata(booster, updated_metadata):
    old_trees = booster.get_dump(with_stats=True)
    new_trees_dump = []
    for tree_str, tree_meta in zip(old_trees, updated_metadata):
        updated_tree_lines = []
        leaf_counter = 0
        for line in tree_str.splitlines():
            if 'leaf=' in line:
                old_val_str = line.split('leaf=')[1].split(',')[0]
                new_val = tree_meta[leaf_counter]['leaf_value']
                line = line.replace(f"leaf={old_val_str}", f"leaf={new_val}")
                leaf_counter += 1
            updated_tree_lines.append(line)
        new_trees_dump.append("\n".join(updated_tree_lines))
    booster.set_dump(new_trees_dump)
    return booster

In [14]:
import openml
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss
from tqdm import tqdm


########################################
# 1. Fetch all classification datasets #
#    with < 1000 samples from OpenML   #
########################################

def fetch_classification_datasets_under_1000():
    all_ds = openml.datasets.list_datasets(
        status='active',
        size=100
    )
    df = pd.DataFrame.from_dict(all_ds, orient='index')
    print("Available columns in the dataset list:", df.columns)
    df = df[df['NumberOfInstances'] < 1000]
    target_col = 'default_target_attribute' if 'default_target_attribute' in df.columns else None
    if target_col:
        df = df.dropna(subset=[target_col])
    else:
        print("Warning: 'default_target_attribute' column not found. Skipping this filter.")
    results = []
    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Filtering datasets"):
        did = row['did']
        name = row['name']
        results.append((did, name))
    return results


###############################################################
# 2. Helper to load a dataset and check if it's 2-class only. #
###############################################################

def load_and_filter_dataset(did):
    try:
        ds = openml.datasets.get_dataset(did)
        name = ds.name
        target_name = ds.default_target_attribute
        if target_name is None:
            return None
        X, y, _, _ = ds.get_data(
            target=target_name,
            dataset_format='dataframe'
        )
        n_unique = y.nunique()
        if n_unique != 2:
            return None
        if not pd.api.types.is_numeric_dtype(y):
            le = LabelEncoder()
            y = le.fit_transform(y)
        else:
            unique_vals = sorted(y.unique())
            if set(unique_vals) != {0, 1}:
                le = LabelEncoder()
                y = le.fit_transform(y)

        # Convert object dtype to category
        for col in X.select_dtypes(include=['object']).columns:
            X[col] = X[col].astype('category')

        return X, y, name
    except Exception as e:
        return None


#################################################
# 3. Train XGBoost & extract leaf-level metadata
#################################################

def train_xgb_with_metadata(X, y, test_size=0.2, random_state=42, num_boost_round=10):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        stratify=y
    )
    dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
    params = {
        'objective': 'binary:logistic',
        'max_depth': 3,
        'learning_rate': 0.1,
        'lambda': 1.0,
        'verbosity': 0
    }
    booster = xgb.train(params, dtrain, num_boost_round=num_boost_round)
    dump = booster.get_dump(with_stats=True)
    metadata = []
    for tree_str in dump:
        tree_md = []
        for line in tree_str.splitlines():
            if 'leaf=' in line:
                segs = line.split(',')
                leaf_val, sum_grad, sum_hess = 0.0, 0.0, 0.0
                for s in segs:
                    s = s.strip()
                    if s.startswith("leaf="):
                        leaf_val = float(s.split('=')[1])
                    elif s.startswith("sum_grad="):
                        sum_grad = float(s.split('=')[1])
                    elif s.startswith("sum_hess="):
                        sum_hess = float(s.split('=')[1])
                tree_md.append({
                    'sum_grad': sum_grad,
                    'sum_hess': sum_hess,
                    'leaf_value': leaf_val
                })
        metadata.append(tree_md)
    return booster, (X_train, X_test, y_train, y_test), metadata


#################################################
# 4. Compute classification metrics
#################################################

def compute_classification_metrics(y_true, y_prob):
    y_pred = (y_prob >= 0.5).astype(int)
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    ll = log_loss(y_true, y_prob)
    return {
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1': f1,
        'log_loss': ll
    }
    

#################################################
# 5. Main loop: run over all suitable datasets
#################################################

all_ds = fetch_classification_datasets_under_1000()
print(f"Found {len(all_ds)} total datasets with < 1000 samples (classification).")

results = []
unlearning_sizes = [10, 100, 500]

for did, name in tqdm(all_ds, desc="Processing datasets"):
    loaded = load_and_filter_dataset(did)
    if loaded is None:
        continue
    X, y, ds_name = loaded
    if len(X) < 2:
        continue
    
    try:
        booster, splits, metadata = train_xgb_with_metadata(X, y)
        X_train, X_test, y_train, y_test = splits
        
        dtest = xgb.DMatrix(X_test, enable_categorical=True)
        y_prob_before = booster.predict(dtest)
        metrics_before = compute_classification_metrics(y_test, y_prob_before)
        
        for unlearn_size in unlearning_sizes:
            num_remove = min(unlearn_size, len(X_train))
            X_remove = X_train.iloc[:num_remove]
            y_remove = y_train.iloc[:num_remove]
            
            # Simulate unlearning (update leaf values)
            # Update the logic here for unlearning if needed
            
            y_prob_after = booster.predict(dtest)
            metrics_after = compute_classification_metrics(y_test, y_prob_after)
            
            row = {
                'dataset_id': did,
                'dataset_name': ds_name,
                'n_samples': len(X),
                'unlearn_size': unlearn_size,
                'accuracy_before': metrics_before['accuracy'],
                'accuracy_after': metrics_after['accuracy'],
                'log_loss_before': metrics_before['log_loss'],
                'log_loss_after': metrics_after['log_loss']
            }
            results.append(row)
    except Exception as e:
        print(f"Error processing dataset {did} ({ds_name}): {e}")


#################################################
# 6. Convert to DataFrame and display or export #
#################################################

df_results = pd.DataFrame(results)
print("\nFinal Results:")
print(df_results)

df_results.to_csv("unlearning_results.csv", index=False)


  all_ds = openml.datasets.list_datasets(


Available columns in the dataset list: Index(['did', 'name', 'version', 'uploader', 'status', 'format',
       'MajorityClassSize', 'MaxNominalAttDistinctValues', 'MinorityClassSize',
       'NumberOfClasses', 'NumberOfFeatures', 'NumberOfInstances',
       'NumberOfInstancesWithMissingValues', 'NumberOfMissingValues',
       'NumberOfNumericFeatures', 'NumberOfSymbolicFeatures'],
      dtype='object')


Filtering datasets: 100%|██████████| 33/33 [00:00<00:00, 14320.96it/s]


Found 33 total datasets with < 1000 samples (classification).


Processing datasets:   6%|▌         | 2/33 [00:00<00:01, 19.30it/s]

Error processing dataset 4 (labor): 'numpy.ndarray' object has no attribute 'iloc'


Processing datasets:  18%|█▊        | 6/33 [00:00<00:00, 30.69it/s]

Error processing dataset 13 (breast-cancer): 'numpy.ndarray' object has no attribute 'iloc'
Error processing dataset 15 (breast-w): 'numpy.ndarray' object has no attribute 'iloc'


Processing datasets:  33%|███▎      | 11/33 [00:00<00:00, 32.33it/s]

Error processing dataset 25 (colic): 'numpy.ndarray' object has no attribute 'iloc'
Error processing dataset 27 (colic): 'numpy.ndarray' object has no attribute 'iloc'
Error processing dataset 29 (credit-approval): 'numpy.ndarray' object has no attribute 'iloc'


Processing datasets:  45%|████▌     | 15/33 [00:00<00:00, 32.33it/s]

Error processing dataset 37 (diabetes): 'numpy.ndarray' object has no attribute 'iloc'


Processing datasets:  58%|█████▊    | 19/33 [00:00<00:00, 33.78it/s]

Error processing dataset 40 (sonar): 'numpy.ndarray' object has no attribute 'iloc'
Error processing dataset 43 (haberman): 'numpy.ndarray' object has no attribute 'iloc'
Error processing dataset 49 (heart-c): 'numpy.ndarray' object has no attribute 'iloc'


Processing datasets:  73%|███████▎  | 24/33 [00:00<00:00, 35.32it/s]

Error processing dataset 50 (tic-tac-toe): 'numpy.ndarray' object has no attribute 'iloc'
Error processing dataset 51 (heart-h): 'numpy.ndarray' object has no attribute 'iloc'


Processing datasets:  85%|████████▍ | 28/33 [00:00<00:00, 31.43it/s]

Error processing dataset 52 (trains): 'numpy.ndarray' object has no attribute 'iloc'
Error processing dataset 53 (heart-statlog): 'numpy.ndarray' object has no attribute 'iloc'
Error processing dataset 55 (hepatitis): 'numpy.ndarray' object has no attribute 'iloc'
Error processing dataset 56 (vote): 'numpy.ndarray' object has no attribute 'iloc'
Error processing dataset 59 (ionosphere): 'numpy.ndarray' object has no attribute 'iloc'


Processing datasets: 100%|██████████| 33/33 [00:01<00:00, 31.66it/s]


Final Results:
Empty DataFrame
Columns: []
Index: []



