## Load Datasets

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
import warnings
warnings.filterwarnings("ignore")

def load_breast_data():
    breast = load_breast_cancer()
    feature_names = list(breast.feature_names)
    X, y = pd.DataFrame(breast.data, columns=feature_names), breast.target
    dataset = {
        'problem': 'classification',
        'full': {
            'X': X,
            'y': y,
        },
    }
    return dataset


def load_adult_data():
    df = pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
        header=None)
    df.columns = [
        "Age", "WorkClass", "fnlwgt", "Education", "EducationNum",
        "MaritalStatus", "Occupation", "Relationship", "Race", "Gender",
        "CapitalGain", "CapitalLoss", "HoursPerWeek", "NativeCountry", "Income"
    ]
    # If income column is < 50K, set to 0, else set to 1
    df['Income'] = df['Income'].apply(lambda x: 0 if x == " <=50K" else 1)
    train_cols = df.columns[0:-1]
    label = df.columns[-1]
    X_df = df[train_cols]
    y_df = df[label]

    dataset = {
        'problem': 'classification',
        'full': {
            'X': X_df,
            'y': y_df,
        },
    }

    return dataset

def load_heart_data():
    # https://www.kaggle.com/ronitf/heart-disease-uci
    # df = pd.read_csv(r'C:\develop\data\heart-disease-uci\heart.csv')
    df = pd.read_csv(f"datasets/heart.csv")
    train_cols = df.columns[0:-1]
    label = df.columns[-1]
    X_df = df[train_cols]
    y_df = df[label]
    dataset = {
        'problem': 'classification',
        'full': {
            'X': X_df,
            'y': y_df,
        },
    }
    
    return dataset


def load_credit_data():
    # https://www.kaggle.com/mlg-ulb/creditcardfraud
    # df = pd.read_csv(r'C:\develop\data\creditcardfraud\creditcard.csv')
    df = pd.read_csv(f"datasets/creditcard.csv")
    train_cols = df.columns[0:-1]
    label = df.columns[-1]
    X_df = df[train_cols]
    y_df = df[label]
    dataset = {
        'problem': 'classification',
        'full': {
            'X': X_df,
            'y': y_df,
        },
    }
    
    return dataset


def load_telco_churn_data():
    # https://www.kaggle.com/blastchar/telco-customer-churn/downloads/WA_Fn-UseC_-Telco-Customer-Churn.csv/1
    # df = pd.read_csv(r'C:\develop\data\telco-customer-churn\WA_Fn-UseC_-Telco-Customer-Churn.csv')
    df = pd.read_csv(f"datasets/WA_Fn-UseC_-Telco-Customer-Churn.csv")
    train_cols = df.columns[1:-1] # First column is an ID
    label = df.columns[-1]
    X_df = df[train_cols]
    y_df = df[label] # 'Yes, No'
    y_df = y_df.apply(lambda x: 1 if x == 'Yes' else 0)
    dataset = {
        'problem': 'classification',
        'full': {
            'X': X_df,
            'y': y_df,
        },
    }
    
    return dataset

## Data Stats

In [37]:
def dataset_stats(dataset, dataset_name, domain, task):
    X = dataset['full']['X']
    y = dataset['full']['y']
    N = len(X)
    K = X.shape[1]
    # Ensure y is numeric (0 and 1) so that .mean() gives fraction of positives.
    pos_pct = y.mean() * 100  
    return {
         "Dataset": dataset_name,
         "Domain": domain,
         "N": N,
         "K": K,
         "Task": task,
         "%Pos": f"{pos_pct:.1f}%"
    }

# Load datasets
adult_data = load_adult_data()
telco_churn_data = load_telco_churn_data()
credit_data = load_credit_data()


# Compute statistics for each dataset.
stats_list = []
stats_list.append(dataset_stats(adult_data, "Adult Income", "Finance", "Clas"))
stats_list.append(dataset_stats(telco_churn_data, "Telco Churn", "Business", "Clas"))
stats_list.append(dataset_stats(credit_data, "Credit", "Finance", "Clas"))

stats_df = pd.DataFrame(stats_list)
print(stats_df)


        Dataset    Domain       N   K  Task   %Pos
0  Adult Income   Finance   32561  14  Clas  24.1%
1   Telco Churn  Business    7043  19  Clas  26.5%
2        Credit   Finance  284807  30  Clas   0.2%


## Benchmark Models

In [9]:
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedShuffleSplit, cross_validate

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.linear_model import SGDClassifier, LogisticRegression

from interpret.glassbox import ExplainableBoostingClassifier, APLRClassifier


def format_n(x):
    return "{0:.3f}".format(x)

def process_model(clf, name, X, y, n_splits=3):
    # Evaluate model
    ss = StratifiedShuffleSplit(n_splits=n_splits, test_size=0.25, random_state=1337)
    scores = cross_validate(
        clf, X, y, scoring='roc_auc', cv=ss,
        n_jobs=None, return_estimator=True
    )

    record = dict()
    record['model_name'] = name
    record['fit_time_mean'] = format_n(np.mean(scores['fit_time']))
    record['fit_time_std'] = format_n(np.std(scores['fit_time']))
    record['test_score_mean'] = format_n(np.mean(scores['test_score']))
    record['test_score_std'] = format_n(np.std(scores['test_score']))

    return record



def benchmark_models(dataset_name, X, y, ct=None, n_splits=3, random_state=1337):
    if ct is None:
        is_cat = np.array([dt.kind == 'O' for dt in X.dtypes])
        cat_cols = X.columns.values[is_cat]
        num_cols = X.columns.values[~is_cat]

        cat_ohe_step = ('ohe', OneHotEncoder(sparse_output=False,
                                             handle_unknown='ignore'))

        cat_pipe = Pipeline([cat_ohe_step])
        num_pipe = Pipeline([('identity', FunctionTransformer())])
        transformers = [
            ('cat', cat_pipe, cat_cols),
            ('num', num_pipe, num_cols)
        ]
        ct = ColumnTransformer(transformers=transformers)

    records = []

    summary_record = {}
    summary_record['dataset_name'] = dataset_name
    print()
    print('-' * 78)
    print(dataset_name)
    print('-' * 78)
    print(summary_record)
    print()

    pipe = Pipeline([
        ('ct', ct),
        ('std', StandardScaler()),
        ('lr', LogisticRegression(random_state=random_state)),
    ])
    record = process_model(pipe, 'lr', X, y, n_splits=n_splits)
    print(record)
    record.update(summary_record)
    records.append(record)

    pipe = Pipeline([
        ('ct', ct),
        # n_estimators updated from 10 to 100 due to sci-kit defaults changing in future versions
        ('rf-100', RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=random_state)),
    ])
    record = process_model(pipe, 'rf-100', X, y, n_splits=n_splits)
    print(record)
    record.update(summary_record)
    records.append(record)
    
    pipe = Pipeline([
        ('ct', ct),
        ('xgb', XGBClassifier(random_state=random_state, eval_metric='logloss')),
    ])
    record = process_model(pipe, 'xgb', X, y, n_splits=n_splits)
    print(record)
    record.update(summary_record)
    records.append(record)

    pipe = Pipeline([
        ('ct', ct),
        ('aplr', APLRClassifier(random_state=random_state, num_first_steps_with_linear_effects_only=2000)),
    ])
    record = process_model(pipe, 'aplr', X, y, n_splits=n_splits)
    print(record)
    record.update(summary_record)
    records.append(record)

    # No pipeline needed due to EBM handling string datatypes
    ebm_inter = ExplainableBoostingClassifier(n_jobs=-1, random_state=random_state)
    record = process_model(ebm_inter, 'ebm', X, y, n_splits=n_splits)
    print(record)
    record.update(summary_record)
    records.append(record)

    return records

In [3]:
results = []
n_splits = 3

### Breast Cancer Dataset

In [10]:
dataset = load_breast_data()
result = benchmark_models('breast-cancer', dataset['full']['X'], dataset['full']['y'], n_splits=n_splits)
results.append(result)


------------------------------------------------------------------------------
breast-cancer
------------------------------------------------------------------------------
{'dataset_name': 'breast-cancer'}

{'model_name': 'lr', 'fit_time_mean': '0.020', 'fit_time_std': '0.007', 'test_score_mean': '0.994', 'test_score_std': '0.006'}
{'model_name': 'rf-100', 'fit_time_mean': '0.237', 'fit_time_std': '0.038', 'test_score_mean': '0.992', 'test_score_std': '0.009'}
{'model_name': 'xgb', 'fit_time_mean': '0.284', 'fit_time_std': '0.316', 'test_score_mean': '0.992', 'test_score_std': '0.010'}
{'model_name': 'aplr', 'fit_time_mean': '14.560', 'fit_time_std': '0.914', 'test_score_mean': '0.993', 'test_score_std': '0.006'}
{'model_name': 'ebm', 'fit_time_mean': '174.740', 'fit_time_std': '104.506', 'test_score_mean': '0.994', 'test_score_std': '0.009'}


### Adult Dataset

In [18]:
dataset = load_adult_data()
result = benchmark_models('adult', dataset['full']['X'], dataset['full']['y'], n_splits=n_splits)
results.append(result)


------------------------------------------------------------------------------
adult
------------------------------------------------------------------------------
{'dataset_name': 'adult'}

{'model_name': 'lr', 'fit_time_mean': '0.280', 'fit_time_std': '0.027', 'test_score_mean': '0.907', 'test_score_std': '0.003'}
{'model_name': 'rf-100', 'fit_time_mean': '0.976', 'fit_time_std': '0.014', 'test_score_mean': '0.903', 'test_score_std': '0.002'}
{'model_name': 'xgb', 'fit_time_mean': '0.464', 'fit_time_std': '0.082', 'test_score_mean': '0.928', 'test_score_std': '0.001'}
{'model_name': 'aplr', 'fit_time_mean': '323.301', 'fit_time_std': '28.697', 'test_score_mean': '0.927', 'test_score_std': '0.002'}
{'model_name': 'ebm', 'fit_time_mean': '19.054', 'fit_time_std': '1.447', 'test_score_mean': '0.929', 'test_score_std': '0.002'}


### Credit Card Dataset

In [20]:
dataset = load_credit_data()
result = benchmark_models('credit-fraud', dataset['full']['X'], dataset['full']['y'], n_splits=n_splits)
results.append(result)


------------------------------------------------------------------------------
credit-fraud
------------------------------------------------------------------------------
{'dataset_name': 'credit-fraud'}

{'model_name': 'lr', 'fit_time_mean': '0.622', 'fit_time_std': '0.063', 'test_score_mean': '0.980', 'test_score_std': '0.003'}
{'model_name': 'rf-100', 'fit_time_mean': '54.921', 'fit_time_std': '2.337', 'test_score_mean': '0.950', 'test_score_std': '0.007'}
{'model_name': 'xgb', 'fit_time_mean': '1.822', 'fit_time_std': '0.028', 'test_score_mean': '0.983', 'test_score_std': '0.002'}
{'model_name': 'aplr', 'fit_time_mean': '2250.017', 'fit_time_std': '257.963', 'test_score_mean': '0.979', 'test_score_std': '0.007'}
{'model_name': 'ebm', 'fit_time_mean': '480.415', 'fit_time_std': '489.951', 'test_score_mean': '0.982', 'test_score_std': '0.005'}


### Telco Churn Dataset

In [17]:
dataset = load_telco_churn_data()
result = benchmark_models('telco-churn', dataset['full']['X'], dataset['full']['y'], n_splits=3)
results.append(result)


------------------------------------------------------------------------------
telco-churn
------------------------------------------------------------------------------
{'dataset_name': 'telco-churn'}

{'model_name': 'lr', 'fit_time_mean': '1.976', 'fit_time_std': '0.091', 'test_score_mean': '0.808', 'test_score_std': '0.014'}
{'model_name': 'rf-100', 'fit_time_mean': '3.332', 'fit_time_std': '0.272', 'test_score_mean': '0.824', 'test_score_std': '0.002'}
{'model_name': 'xgb', 'fit_time_mean': '3.421', 'fit_time_std': '0.695', 'test_score_mean': '0.822', 'test_score_std': '0.004'}
{'model_name': 'aplr', 'fit_time_mean': '38.068', 'fit_time_std': '3.964', 'test_score_mean': '0.849', 'test_score_std': '0.003'}
{'model_name': 'ebm', 'fit_time_mean': '492.665', 'fit_time_std': '74.669', 'test_score_mean': '0.853', 'test_score_std': '0.004'}


In [21]:
records = [item for result in results for item in result]
record_df = pd.DataFrame.from_records(records)[['dataset_name', 'model_name', 'test_score_mean', 'test_score_std']]
record_df.to_csv('ebm-perf-classification-overnight.csv')

### DP-EBM Model (GDP and Classic) for different EBM Values for the Adult Dataset

In [None]:
from interpret.privacy import DPExplainableBoostingClassifier

# Run DP-EBM with different epsilon values for the adult dataset
epsilon_values = [0.5, 0.1, 2.0, 4.0, 8.0]
compositions = ['classic', 'gdp']
results = []

dataset = load_adult_data()
X = dataset['full']['X']
y = dataset['full']['y']
is_cat = np.array([dt.kind == 'O' for dt in X.dtypes])
cat_cols = X.columns.values[is_cat]
num_cols = X.columns.values[~is_cat]

cat_ohe_step = ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
cat_pipe = Pipeline([cat_ohe_step])
num_pipe = Pipeline([('identity', FunctionTransformer())])

transformers = [
    ('cat', cat_pipe, cat_cols),
    ('num', num_pipe, num_cols)
]
ct = ColumnTransformer(transformers=transformers)

for epsilon in epsilon_values:
    for composition in compositions:
        pipe = Pipeline([
            ('ct', ct),
            ('std', StandardScaler()),
            ('ebm', DPExplainableBoostingClassifier(epsilon=epsilon, composition=composition, n_jobs=-1, random_state=1337)),
        ])
        record = process_model(pipe, f'dp-ebm-epsilon-{epsilon}-composition-{composition}', X, y, n_splits=3)
        results.append(record)
        print(record)

    record = process_model(pipe, f'dp-ebm-epsilon-{epsilon}-composition-{composition}', X, y, n_splits=3)
    results.append(record)
    print(record)

record_df = pd.DataFrame.from_records(results)[['model_name', 'test_score_mean', 'test_score_std']]
record_df.to_csv('dp-ebm-perf-classification.csv')


{'model_name': 'dp-ebm-epsilon-0.5-composition-classic', 'fit_time_mean': '5.993', 'fit_time_std': '0.212', 'test_score_mean': '0.826', 'test_score_std': '0.003'}
{'model_name': 'dp-ebm-epsilon-0.5-composition-gdp', 'fit_time_mean': '6.229', 'fit_time_std': '0.360', 'test_score_mean': '0.871', 'test_score_std': '0.003'}
{'model_name': 'dp-ebm-epsilon-0.5-composition-gdp', 'fit_time_mean': '5.641', 'fit_time_std': '0.243', 'test_score_mean': '0.871', 'test_score_std': '0.003'}
{'model_name': 'dp-ebm-epsilon-0.1-composition-classic', 'fit_time_mean': '5.455', 'fit_time_std': '0.048', 'test_score_mean': '0.774', 'test_score_std': '0.002'}
{'model_name': 'dp-ebm-epsilon-0.1-composition-gdp', 'fit_time_mean': '5.634', 'fit_time_std': '0.374', 'test_score_mean': '0.814', 'test_score_std': '0.004'}
{'model_name': 'dp-ebm-epsilon-0.1-composition-gdp', 'fit_time_mean': '5.425', 'fit_time_std': '0.017', 'test_score_mean': '0.814', 'test_score_std': '0.004'}
{'model_name': 'dp-ebm-epsilon-2.0-com