In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

from matplotlib import pyplot as plt
import seaborn as sns

from interpret.glassbox import ExplainableBoostingClassifier
import optuna #bayesian optimization
from sklearn.metrics import roc_auc_score, roc_curve
from interpret import show



## Data Preprocessing

### Address Missingness

In [2]:
# load in data
df_train = pd.read_csv('insurance_t.csv')
df_val = pd.read_csv('insurance_v.csv')

# Find columns with missing data in train and validation datasets
miss_t = [col for col in df_train.columns if df_train[col].isnull().sum() > 0]
miss_v = [col for col in df_val.columns if df_val[col].isnull().sum() > 0]

# Check if train and validation datasets are missing the same columns
print(miss_t == miss_v)

# Check what and how many values are missing in the train dataset
missing_counts_train = df_train[miss_t].isnull().sum()
print(missing_counts_train)

True
ACCTAGE     546
PHONE      1075
POS        1075
POSAMT     1075
INV        1075
INVBAL     1075
CC         1075
CCBAL      1075
CCPURC     1075
INCOME     1537
LORES      1537
HMVAL      1537
AGE        1702
CRSCORE     195
dtype: int64


In [3]:
# Define columns w/ missingness
continuous_cols = ["ACCTAGE", "PHONE", "POS", "POSAMT", "INVBAL", "CCBAL", "INCOME", "LORES", "HMVAL", "AGE", "CRSCORE"]
binary_cols = ["INV", "CC", "CCPURC"]

#replace ns's with 'Missing', 1's w/ 'Instance', 0' w/ 'Non-Instance' for train and val
for col in binary_cols:
    df_train[col] = df_train[col].replace({1: 'Instance', 0: 'Non-Instance'}).fillna('Missing')
    df_val[col] = df_val[col].replace({1: 'Instance', 0: 'Non-Instance'}).fillna('Missing')


#impute with median and create binary flag column
for col in continuous_cols:
    median_value = df_train[col].median()
    # Create a binary flag variable for imputation
    df_train[f'{col}_imputed'] = df_train[col].isnull().astype(int)
    #impute the missing values with the median
    df_train[col] = df_train[col].fillna(median_value)

#do the same for val
for col in continuous_cols:
    median_value = df_train[col].median()
    # Create a binary flag variable for imputation
    df_val[f'{col}_imputed'] = df_val[col].isnull().astype(int)
    #impute the missing values with the median
    df_val[col] = df_val[col].fillna(median_value)

### One-Hot Encode

In [4]:
# Separate binary columns and categorical columns for one-hot encoding
binary_columns = [
    "INAREA", "SDB", "MM", "IRA", "CD", "ATM", "SAV", "NSF", "DIRDEP", "DDA", "ACCTAGE_imputed", "PHONE_imputed", "POS_imputed", 
    "POSAMT_imputed", "INVBAL_imputed", "CCBAL_imputed", 
    "INCOME_imputed", "LORES_imputed", "HMVAL_imputed", 
    "AGE_imputed", "CRSCORE_imputed"
]

categorical_columns = [
    "INV", "CC", "CCPURC", "BRANCH"
]

# Apply one-hot encoding only to the specified categorical columns
df_train = pd.get_dummies(df_train, columns=categorical_columns, drop_first=False)
df_val = pd.get_dummies(df_val, columns=categorical_columns, drop_first=False)

# Label encode the MMCRED column in both datasets
label_encoder = LabelEncoder()
df_train['MMCRED'] = label_encoder.fit_transform(df_train['MMCRED'])
df_val['MMCRED'] = label_encoder.transform(df_val['MMCRED'])

# Ensure both train and validation have the same columns
df_val = df_val.reindex(columns=df_train.columns, fill_value=0)

# Convert binary columns to 0/1 (if needed)
for col in binary_columns:
    df_train[col] = df_train[col].astype(int)
    df_val[col] = df_val[col].astype(int)

In [5]:
# Define X as all columns except 'INS' and y as the 'INS' column
X_train = df_train.drop(columns=['INS'])
y_train = df_train['INS']

X_val = df_val.drop(columns=['INS'])
y_val = df_val['INS']

## Explainable Boosting Machine Model

The guide to Hyperparameters tuning is avaliable [here](https://interpret.ml/docs/hyperparameters.html)

In [7]:
# Define the objective function for Bayesian optimization
def objective(trial):
    # Hyperparameters to tune
    params = {
        'max_leaves': trial.suggest_categorical('max_leaves', [2,3]),
        'smoothing_rounds': trial.suggest_categorical('smoothing_rounds', [65, 70, 75, 80 ,85]),
        'interactions': trial.suggest_categorical('interactions', [0.0, 0.9, 0.95, 0.99, 5, 10]),
        'validation_size': trial.suggest_categorical('validation_size', [0.15, 0.2, 0.25, 0.3, 0.35])
    }
    
    # Train the ExplainableBoostingClassifier
    ebm = ExplainableBoostingClassifier(n_jobs=-1, random_state=88, **params)
    ebm.fit(X_train, y_train)
    
    # Evaluate on the validation set (or use cross-validation)
    y_pred_proba = ebm.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_pred_proba)
    
    return auc

# Create an Optuna study and optimize
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=88))
study.optimize(objective, n_trials=50, n_jobs=-1)

# Best hyperparameters and performance
print("Best Hyperparameters:", study.best_params)
print("Best AUC:", study.best_value)

# Train and evaluate the final model with best hyperparameters
best_params = study.best_params
final_model = ExplainableBoostingClassifier(n_jobs=-1, random_state=88, **best_params)
final_model.fit(X_train, y_train)
final_auc = roc_auc_score(y_val, final_model.predict_proba(X_val)[:, 1])
print("Final Test AUC: {:.3f}".format(final_auc))


[I 2024-11-22 12:30:14,096] A new study created in memory with name: no-name-22d02b9b-ce79-4131-91dd-5e93be99bc57
[I 2024-11-22 12:31:23,937] Trial 0 finished with value: 0.7946791828710296 and parameters: {'max_leaves': 2, 'smoothing_rounds': 85, 'interactions': 0.95, 'validation_size': 0.3}. Best is trial 0 with value: 0.7946791828710296.
[I 2024-11-22 12:31:31,255] Trial 7 finished with value: 0.7933841340921591 and parameters: {'max_leaves': 2, 'smoothing_rounds': 65, 'interactions': 0.9, 'validation_size': 0.2}. Best is trial 0 with value: 0.7946791828710296.
[I 2024-11-22 12:31:37,288] Trial 6 finished with value: 0.7939585194315828 and parameters: {'max_leaves': 2, 'smoothing_rounds': 65, 'interactions': 5, 'validation_size': 0.3}. Best is trial 0 with value: 0.7946791828710296.
[I 2024-11-22 12:31:38,624] Trial 1 finished with value: 0.7923835918880017 and parameters: {'max_leaves': 3, 'smoothing_rounds': 80, 'interactions': 10, 'validation_size': 0.25}. Best is trial 0 with va

Best Hyperparameters: {'max_leaves': 2, 'smoothing_rounds': 85, 'interactions': 10, 'validation_size': 0.3}
Best AUC: 0.7968567761867054
Final Test AUC: 0.797


In [10]:
best_ebm = ExplainableBoostingClassifier(max_leaves=2, smoothing_rounds=85, interactions=10, validation_size=0.3)
best_ebm.fit(X_train, y_train)

In [13]:
from interpret import set_visualize_provider
from interpret.provider import InlineProvider
set_visualize_provider(InlineProvider())

show(best_ebm.explain_global())

In [17]:
show(best_ebm.explain_local(X_val[:5], y_val[:5]), 0)