# Tabular Network (TabNet)

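TabNet is an attention-based deep learning architecture designed for tabular data. At each decision step it learns a sparse mask that selects which features to use, and these masks are what make its predictions interpretable. For comparison, Explainable Boosting Machines (EBMs) are tree-based models mainly used for processing tabular data. These inherently interpretable algorithms are a subset of Generalized Additive Models (GAMs) and can be as accurate as black-box models such as XGBoost. [1] For more information on EBMs, you can refer to [this link](https://interpret.ml/docs/ebm.html).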

In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score
# from new_data import process_csv
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch
import os
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import RFE
from interpret.glassbox import ExplainableBoostingClassifier,merge_ebms

from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression, Lasso, Ridge
import pickle
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt


## Data
The dataset used in this notebook is US-130, which contains clinical measurements for patients diagnosed with diabetes, collected from 1999 to 2008. For more information about this dataset, you can refer to [this link](https://archive.ics.uci.edu/dataset/296/diabetes+130-us+hospitals+for+years+1999-2008).

In [None]:
def _diagnosis_category(code):
    """Return the part of a diagnosis code string that precedes the decimal point.

    A trailing quote character is stripped first, so byte-literal style values
    such as "b'428'" and "b'428.0'" both reduce to "b'428" (the same result the
    previous ``x[:x.find(".")]`` slice produced for them).  Plain values without
    a dot, such as "428", are now returned whole; the old slice silently dropped
    their last character because ``str.find`` returns -1 when there is no dot.
    """
    return code.rstrip("'").partition(".")[0]


def process_csv(df):
    """Turn the raw diabetes readmission table into an all-numeric frame.

    Steps, in order:
      1. Add the binary target ``readmitted_binarized`` (1 when ``readmitted``
         equals ``"b'<30'"``, otherwise 0).
      2. Drop identifier columns and columns that are not used as features.
      3. Replace each ``age`` bracket with its midpoint.
      4. Truncate ``diag_1`` / ``diag_2`` / ``diag_3`` at the decimal point.
      5. Frequency-encode every object/category column, plus every other
         column with fewer than three distinct values; each such column
         ``c`` is replaced by ``encoded_c``.

    The target column is deliberately excluded from step 5.  It only has two
    distinct values, so it used to be frequency-encoded and renamed to
    ``encoded_readmitted_binarized``, which made any later lookup of
    ``readmitted_binarized`` fail and destroyed the 0/1 labels.

    Args:
        df: Raw table.  Must contain ``readmitted``, ``age``, ``diag_1``,
            ``diag_2``, ``diag_3`` and every column named in the drop list
            below.  It is not modified.

    Returns:
        A new DataFrame holding the encoded features and the
        ``readmitted_binarized`` target.

    Raises:
        KeyError: If an ``age`` value is not one of the known brackets, or a
            required column is missing.
    """
    target_column = "readmitted_binarized"

    # assign() returns a new frame, so the caller's DataFrame is left untouched.
    df = df.assign(**{target_column: (df["readmitted"] == "b'<30'").astype(int)})

    # dropping useless columns
    df = df.drop(
        [
            "Unnamed: 0",
            "encounter_id",
            "patient_nbr",
            "examide",
            "readmitted",
            "weight",
            "payer_code",
            "medical_specialty",
        ],
        axis=1,
    )

    # age transformation was extracted from: https://medium.com/analytics-vidhya/diabetes-130-us-hospitals-for-years-1999-2008-e18d69beea4d
    age_dic = {
        "b'[0-10)'": 5,
        "b'[10-20)'": 15,
        "b'[20-30)'": 25,
        "b'[30-40)'": 35,
        "b'[40-50)'": 45,
        "b'[50-60)'": 55,
        "b'[60-70)'": 65,
        "b'[70-80)'": 75,
        "b'[80-90)'": 85,
        "b'[90-100)'": 95,
    }
    # Direct indexing (not .get) so an unexpected bracket fails loudly.
    df["age"] = df["age"].apply(lambda bracket: age_dic[bracket])

    for diag_column in ("diag_1", "diag_2", "diag_3"):
        df[diag_column] = df[diag_column].apply(_diagnosis_category)

    categorical_columns = df.select_dtypes(include=["object", "category"]).columns.tolist()
    # https://www.kaggle.com/code/paulo100/tabtransformer-pytorch-dnn-with-attention-eda
    # Numeric columns with very few distinct values are treated as categorical
    # too -- except the target, which must stay a 0/1 label.
    threshold = 3
    categorical_columns += [
        col
        for col in df.columns
        if col != target_column
        and col not in categorical_columns
        and df[col].nunique() < threshold
    ]

    # Frequency encoding: each category is replaced by its relative frequency.
    for cat_column in categorical_columns:
        frequency_encoding = df[cat_column].value_counts(normalize=True).to_dict()
        df[f"encoded_{cat_column}"] = df[cat_column].map(frequency_encoding)
        df = df.drop(cat_column, axis=1)

    return df

In [None]:
# Load the raw CSV and run it through the preprocessing function.
data = pd.read_csv("/Users/ananyaraval/workspace/interpretability-bootcamp/data/US_130/diabetic_data.csv")
df = process_csv(data)

# Renumber the rows 0..n-1 so positional and label-based indexing agree.
df.index = pd.RangeIndex(len(df))

# Separate the target column from the feature columns.
y = df["readmitted_binarized"]
X = df.drop(columns="readmitted_binarized")

# Shuffle the row positions and cut them into k nearly equal folds.
k = 5
indices = np.random.permutation(len(X))
folds = np.array_split(indices, k)


In [None]:
# Train a TabNet classifier and report hold-out metrics.
#
# NOTE(review): TabNetClassifier is not imported in any visible cell; it is
# presumably pytorch_tabnet.tab_model.TabNetClassifier -- confirm and add the
# import alongside the other imports.

# Hold out the first shuffled fold for evaluation and train on the remaining
# folds (the fold index arrays come from the data-preparation cell).
test_idx = folds[0]
train_idx = np.concatenate(folds[1:])
X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

clf = TabNetClassifier()

# The classifier is fed plain arrays (.values) rather than DataFrames.
clf.fit(X_train.values, y_train.values, max_epochs=20)

y_pred = clf.predict(X_test.values)
# Column 1 of predict_proba is used as the score for the positive class.
y_prob = clf.predict_proba(X_test.values)[:, 1]

val_auc = roc_auc_score(y_test, y_prob)
val_f1 = f1_score(y_test, y_pred)
val_precision = precision_score(y_test, y_pred)
val_recall = recall_score(y_test, y_pred)

print("val_auc",val_auc)
print("val_f1",val_f1)
print("val_precision",val_precision)
print("val_recall",val_recall)

### Global and Local Explainability:

Global explainability refers to the features that are important to the model as a whole, while local explainability identifies the features that drive the prediction for a specific data point.





In [None]:
# Global explainability: display the fitted classifier's feature importances.
# NOTE(review): presumably one score per input feature, in the column order of
# X -- confirm against the library documentation.
clf.feature_importances_

As can be observed

In [None]:
# Local explainability: per-sample explanation matrix and masks.
# The classifier is given a plain array (.values), matching how fit, predict
# and predict_proba are called elsewhere in this notebook.
explain_matrix, masks = clf.explain(X_test.values)

NameError: name 'clf' is not defined

In [None]:
# Visualize the local feature importance: one panel per mask, each showing
# the first 50 rows of that mask.
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(20, 20))

for idx, ax in enumerate(axs):
    ax.imshow(masks[idx][:50])
    ax.set_title(f"mask {idx}")