In [2]:
import openml
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, mean_squared_error

########################################
# 1. Fetch a small dataset from OpenML #
########################################

def load_dataset_from_openml(dataset_id):
    """
    Load an OpenML dataset by ID and prepare it for the XGBoost demo.

    The target column is the dataset's default target attribute. A numeric
    target with more than two distinct values is treated as regression;
    anything else is treated as classification and, if necessary, reduced to
    a two-class problem by keeping only the first two labels encountered.

    Args:
        dataset_id: OpenML dataset identifier, passed to
            ``openml.datasets.get_dataset``.

    Returns:
        Tuple ``(X, y, is_classification, objective)``:
          - ``X``: feature DataFrame (filtered and re-indexed when the
            dataset was cut down to two classes),
          - ``y``: the target as returned by OpenML for regression, or the
            ``LabelEncoder`` output for classification,
          - ``is_classification``: ``True`` for the classification branch,
          - ``objective``: ``'reg:squarederror'`` or ``'binary:logistic'``.

    Raises:
        ValueError: If the dataset has no default target attribute.
    """
    dataset = openml.datasets.get_dataset(dataset_id)

    target_name = dataset.default_target_attribute
    if target_name is None:
        raise ValueError(f"Dataset {dataset_id} has no default target.")

    # Only the features and the target are needed; the categorical mask and
    # the feature-name list returned by get_data() are ignored.
    X, y, _, _ = dataset.get_data(
        target=target_name,
        dataset_format='dataframe'
    )

    num_unique = y.nunique()

    # Heuristic: numeric target with more than two distinct values => regression.
    if pd.api.types.is_numeric_dtype(y) and num_unique > 2:
        return X, y, False, 'reg:squarederror'

    # Classification. This demo only supports a binary objective, so a
    # multi-class target is cut down to its first two observed labels.
    if num_unique > 2:
        print(f"Dataset {dataset_id} has {num_unique} classes. Demonstrating a 2-class subset.")
        keep_classes = y.unique()[:2]
        mask = y.isin(keep_classes)
        # Build fresh, re-indexed objects rather than resetting the index of
        # a filtered slice in place.
        X = X[mask].reset_index(drop=True)
        y = y[mask].reset_index(drop=True)

    # Encode the remaining labels as consecutive integers starting at 0.
    y = LabelEncoder().fit_transform(y)

    return X, y, True, 'binary:logistic'


############################################
# 2. Train XGBoost & extract leaf metadata #
############################################

def train_xgb_with_metadata(X, y, objective, test_size=0.2, num_boost_round=10, random_state=42):
    """
    Train an XGBoost model on (X, y) and collect per-leaf statistics.

    Args:
        X: Feature matrix accepted by ``train_test_split`` and ``xgb.DMatrix``.
        y: Target values aligned with ``X``.
        objective: XGBoost objective string. The split is stratified on ``y``
            when the objective name contains ``'binary'``.
        test_size: Fraction of rows held out for the test split.
        num_boost_round: Number of boosting rounds (trees).
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        Tuple ``(booster, (X_train, X_test, y_train, y_test), metadata)``.
        ``metadata`` holds one list per tree; each list has one dict per leaf,
        in the order leaves appear in ``booster.get_dump()``, with the keys
        ``'sum_grad'``, ``'sum_hess'`` and ``'leaf_value'``.
    """
    import re

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state,
        stratify=y if 'binary' in objective else None
    )

    params = {
        'objective': objective,
        'max_depth': 3,
        'learning_rate': 0.1,
        'lambda': 1.0,
        'verbosity': 0
    }

    dtrain = xgb.DMatrix(X_train, label=y_train)
    booster = xgb.train(
        params,
        dtrain,
        num_boost_round=num_boost_round
    )

    # A leaf line in the text dump looks like "<node_id>:leaf=<v>,cover=<c>".
    # The node-id prefix means the first comma-separated field never starts
    # with "leaf=", so the key=value pairs are pulled out with a regex.
    stat_pattern = re.compile(r'\b(leaf|cover|sum_grad|sum_hess)=([^,\s]+)')
    eta = params['learning_rate']
    lam = params['lambda']

    metadata = []
    for tree_str in booster.get_dump(with_stats=True):
        tree_md = []
        for line in tree_str.splitlines():
            if 'leaf=' not in line:
                continue
            stats = {key: float(value) for key, value in stat_pattern.findall(line)}
            leaf_val = stats.get('leaf', 0.0)

            # The dump reports the hessian sum of a node as "cover"; an
            # explicit "sum_hess" field is preferred if one is ever present.
            sum_hess = stats.get('sum_hess', stats.get('cover', 0.0))

            if 'sum_grad' in stats:
                sum_grad = stats['sum_grad']
            else:
                # The gradient sum is not part of the dump. Reconstruct it from
                # the Newton-step leaf weight, leaf = -eta * G / (H + lambda).
                # NOTE(review): this assumes no L1 penalty (alpha) and no
                # max_delta_step, which matches the params set above.
                sum_grad = -leaf_val * (sum_hess + lam) / eta

            tree_md.append({
                'sum_grad': sum_grad,
                'sum_hess': sum_hess,
                'leaf_value': leaf_val
            })
        metadata.append(tree_md)

    return booster, (X_train, X_test, y_train, y_test), metadata


##############################################
# 3. Compute real gradient & hessian per row #
##############################################

def compute_grad_hess(booster, X_remove, y_remove, objective):
    """
    Compute per-row first and second derivatives of the loss at the
    booster's current raw predictions.

    Supported objectives:
      - 'binary:logistic':  grad = sigmoid(margin) - y,  hess = p * (1 - p)
      - 'reg:squarederror': grad = margin - y,           hess = 1

    Args:
        booster: Trained booster; ``predict(..., output_margin=True)`` is
            called on it to obtain raw margins.
        X_remove: Feature rows to evaluate.
        y_remove: Labels for those rows (array-like, convertible to float).
        objective: Objective name selecting the derivative formulas.

    Returns:
        Tuple ``(grad, hess)`` of 1-D float64 NumPy arrays, one entry per row.

    Raises:
        NotImplementedError: If ``objective`` is not one of the two supported
            objectives. Raised before any prediction work is done.
    """
    if objective not in ('binary:logistic', 'reg:squarederror'):
        raise NotImplementedError(f"Objective '{objective}' not implemented for exact grad/hess.")

    # Normalise labels to a plain float array so the result type does not
    # depend on whether the caller passed a Series, list or ndarray.
    labels = np.asarray(y_remove, dtype=float).ravel()

    dremove = xgb.DMatrix(X_remove, label=y_remove)
    raw_preds = np.asarray(
        booster.predict(dremove, output_margin=True), dtype=float
    ).ravel()

    if objective == 'binary:logistic':
        # sigmoid(x) == 0.5 * (1 + tanh(x / 2)); this form cannot overflow,
        # unlike 1 / (1 + exp(-x)) for large negative margins.
        p = 0.5 * (1.0 + np.tanh(0.5 * raw_preds))
        grad = p - labels
        hess = p * (1.0 - p)
    else:
        # Squared error: constant unit hessian.
        grad = raw_preds - labels
        hess = np.ones_like(labels)

    return grad, hess


##################################################
# 4. Unlearn points by subtracting grad/hess sum #
##################################################

def unlearn_points(booster, metadata, X_remove, y_remove, objective, lambda_reg=1.0,
                   learning_rate=1.0):
    """
    Remove the contribution of selected rows from the per-leaf statistics.

    Every removed row is routed to the leaf it lands in for each tree
    (``booster.predict(..., pred_leaf=True)``). Its gradient and hessian are
    subtracted from that leaf only, and the value of each affected leaf is
    recomputed as ``-learning_rate * sum_grad / (sum_hess + lambda_reg)``.
    Leaves that receive none of the removed rows are copied unchanged.

    NOTE: gradients/hessians come from ``compute_grad_hess``, i.e. they are
    evaluated at the predictions of the booster passed in, not at the partial
    ensemble each tree was originally fitted against. The result is therefore
    an approximation.

    Args:
        booster: Trained booster the metadata was extracted from.
        metadata: Per-tree lists of leaf dicts (``'sum_grad'``, ``'sum_hess'``,
            ``'leaf_value'``), ordered like the leaves in ``booster.get_dump()``.
        X_remove: Feature rows to unlearn.
        y_remove: Labels of those rows.
        objective: Objective name forwarded to ``compute_grad_hess``.
        lambda_reg: L2 regularisation term added to the hessian sum.
        learning_rate: Shrinkage applied to the recomputed leaf value. The
            default of 1.0 gives the plain Newton step; pass the learning
            rate used for training to keep rewritten leaves on the same
            scale as the trained ones.

    Returns:
        A new metadata structure of the same shape; the input is not mutated.

    Raises:
        ValueError: If ``metadata`` does not match the booster's trees/leaves.
    """
    import re

    # Nothing to remove: return independent copies and skip all model calls.
    if len(X_remove) == 0:
        return [[leaf_stats.copy() for leaf_stats in tree_md] for tree_md in metadata]

    grad, hess = compute_grad_hess(booster, X_remove, y_remove, objective)
    grad = np.asarray(grad, dtype=float).ravel()
    hess = np.asarray(hess, dtype=float).ravel()

    # One row per removed sample, one column per tree; entries are the node
    # ids of the leaves the sample falls into.
    leaf_ids = np.asarray(
        booster.predict(xgb.DMatrix(X_remove), pred_leaf=True)
    ).reshape(len(grad), -1)

    tree_dumps = booster.get_dump()
    if len(tree_dumps) != len(metadata) or leaf_ids.shape[1] != len(metadata):
        raise ValueError(
            f"metadata describes {len(metadata)} trees but the booster has {len(tree_dumps)}."
        )

    # Leaf lines in the dump start with "<node_id>:leaf="; their order defines
    # the position of each leaf inside a tree's metadata list.
    leaf_line = re.compile(r'^\s*(\d+):leaf=', re.MULTILINE)

    updated_metadata = []
    for tree_idx, (tree_md, tree_str) in enumerate(zip(metadata, tree_dumps)):
        node_ids = [int(node_id) for node_id in leaf_line.findall(tree_str)]
        if len(node_ids) != len(tree_md):
            raise ValueError(
                f"Tree {tree_idx}: metadata has {len(tree_md)} leaves "
                f"but the model dump has {len(node_ids)}."
            )

        sample_leaves = leaf_ids[:, tree_idx]
        new_tree_md = []
        for node_id, leaf_stats in zip(node_ids, tree_md):
            leaf_copy = leaf_stats.copy()
            in_leaf = sample_leaves == node_id
            if in_leaf.any():
                leaf_copy['sum_grad'] -= float(grad[in_leaf].sum())
                leaf_copy['sum_hess'] -= float(hess[in_leaf].sum())

                denom = leaf_copy['sum_hess'] + lambda_reg
                if denom <= 1e-12:
                    # No (positive) curvature left in this leaf: neutralise it.
                    leaf_copy['leaf_value'] = 0.0
                else:
                    leaf_copy['leaf_value'] = -learning_rate * leaf_copy['sum_grad'] / denom
            new_tree_md.append(leaf_copy)
        updated_metadata.append(new_tree_md)
    return updated_metadata


###############################################
# 5. Inject updated leaf values back to model #
###############################################

def update_booster_with_metadata(booster, updated_metadata):
    """
    Write the leaf values from ``updated_metadata`` back into the booster.

    The text dump is only used to find out which node id each leaf has and in
    which order leaves appear; the values themselves are patched into the
    booster's JSON model, which is then loaded back into the same booster.

    Args:
        booster: Trained booster to modify in place.
        updated_metadata: Per-tree lists of leaf dicts. The i-th dict of a
            tree corresponds to the i-th leaf line of that tree's text dump,
            and its ``'leaf_value'`` becomes the new leaf output.

    Returns:
        The same ``booster`` object, now carrying the updated leaf values.

    Raises:
        IndexError: If a tree's metadata has fewer entries than the tree has
            leaves.
    """
    import json
    import re

    # Leaf lines in the dump start with "<node_id>:leaf=".
    leaf_line = re.compile(r'^\s*(\d+):leaf=', re.MULTILINE)

    # NOTE(review): relies on the XGBoost JSON model layout
    # (learner -> gradient_booster -> model -> trees, with an extra "gbtree"
    # level for the dart booster) and on leaf outputs being stored in
    # "split_conditions" at the leaf's node index; confirm against the
    # installed XGBoost version.
    model = json.loads(booster.save_raw(raw_format='json'))
    gradient_booster = model['learner']['gradient_booster']
    gradient_booster = gradient_booster.get('gbtree', gradient_booster)
    trees = gradient_booster['model']['trees']

    for tree_json, tree_str, tree_meta in zip(trees, booster.get_dump(), updated_metadata):
        node_ids = [int(node_id) for node_id in leaf_line.findall(tree_str)]
        for leaf_counter, node_id in enumerate(node_ids):
            new_val = float(tree_meta[leaf_counter]['leaf_value'])
            tree_json['split_conditions'][node_id] = new_val

    # Replace the booster's model with the patched one.
    booster.load_model(bytearray(json.dumps(model), 'utf-8'))
    return booster


#####################################
# 6. Run a small demonstration cell #
#####################################

def demo_openml_unlearning(dataset_id, test_size=0.2):
    """
    End-to-end demonstration: download an OpenML dataset, fit an XGBoost
    model, unlearn the first few training rows and print the test metric
    before and after.

    Args:
        dataset_id: OpenML dataset identifier handed to
            ``load_dataset_from_openml``.
        test_size: Fraction of rows held out for evaluation.

    Prints accuracy for the 'binary:logistic' objective and MSE otherwise.
    Returns nothing.
    """
    print(f"Fetching dataset {dataset_id} from OpenML...")
    X, y, _, objective = load_dataset_from_openml(dataset_id)
    print(f"Dataset shape: {X.shape}, objective={objective}")

    booster, splits, metadata = train_xgb_with_metadata(
        X, y, objective, test_size=test_size
    )
    X_train, X_test, y_train, y_test = splits
    is_binary = objective == 'binary:logistic'

    # Baseline metric on the held-out split.
    dtest = xgb.DMatrix(X_test)
    preds = booster.predict(dtest)
    if not is_binary:
        mse_before = mean_squared_error(y_test, preds)
        print(f"Before unlearning => MSE: {mse_before:.4f}")
    else:
        # Threshold predicted probabilities at 0.5 to get hard labels.
        acc_before = accuracy_score(y_test, (preds > 0.5).astype(int))
        print(f"Before unlearning => Accuracy: {acc_before:.4f}")

    # Forget the leading rows of the training split (at most 10).
    num_remove = min(10, len(X_train))
    updated_md = unlearn_points(
        booster,
        metadata,
        X_train[:num_remove],
        y_train[:num_remove],
        objective
    )
    booster = update_booster_with_metadata(booster, updated_md)

    # Same metric, now on the modified booster.
    preds_after = booster.predict(dtest)
    if not is_binary:
        mse_after = mean_squared_error(y_test, preds_after)
        print(f"After unlearning => MSE: {mse_after:.4f}")
    else:
        acc_after = accuracy_score(y_test, (preds_after > 0.5).astype(int))
        print(f"After unlearning => Accuracy: {acc_after:.4f}")


############################################
# 7. Example usage: pick a dataset and run #
############################################

# Example OpenML dataset IDs to try with the demo:
#   61   => Iris (the loader reduces it to a 2-class subset)
#   1476 => listed here as Airfoil Self-Noise (regression)
#           NOTE(review): confirm this ID on OpenML before relying on it.

dataset_id = 61  # Iris => objective='binary:logistic' (in 2-class mode)
# dataset_id = 1476  # Airfoil => objective='reg:squarederror'

demo_openml_unlearning(dataset_id=dataset_id)


ModuleNotFoundError: No module named 'openml'