# **Tutorial for Optimizing AUROC Loss on Tabular Data**

**Author**: Zhuoning Yuan \\

**Introduction**

In this tutorial, you will learn how to quickly train an MLP model by optimizing the **AUROC** score using our novel optimization methods on **Tabular Data** ([Credit Fraud](https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud)). For an image classification tutorial, please refer to [AUCM](https://github.com/Optimization-AI/LibAUC/blob/main/examples/02_Optimizing_AUROC_with_ResNet20_on_Imbalanced_CIFAR10.ipynb). After completing this tutorial, you should be able to use LibAUC to train your own models on your own datasets.

**Useful Resources**

* Website: https://libauc.org
* Github: https://github.com/Optimization-AI/LibAUC

**References**

If you find this tutorial helpful, please acknowledge our library and cite the following paper:
<pre>
@inproceedings{yuan2021large,
  title={Large-scale robust deep auc maximization: A new surrogate loss and empirical studies on medical image classification},
  author={Yuan, Zhuoning and Yan, Yan and Sonka, Milan and Yang, Tianbao},
  booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages={3040--3049},
  year={2021}
}
</pre>



In [None]:
!pip install libauc==1.2.0

# **Import LibAUC**

In [2]:
from libauc.losses import AUCMLoss
from libauc.optimizers import PESG
from libauc.sampler import DualSampler
from libauc.metrics import auroc
from libauc.models import MLP

import torch 
import numpy as np
from torch.utils.data import Dataset
from sklearn.metrics import roc_auc_score

# **Reproducibility**

In [3]:
def set_all_seeds(SEED):
    """Seed the NumPy and PyTorch random generators and pin cuDNN settings.

    Args:
        SEED: integer seed handed to both ``np.random.seed`` and
            ``torch.manual_seed``.
    """
    # cuDNN: request deterministic kernels and switch benchmark mode off.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    # Seed the two random number generators used in this notebook.
    np.random.seed(SEED)
    torch.manual_seed(SEED)

# **Loading and Preprocessing Credit Fraud Dataset**
Reference: https://www.tensorflow.org/tutorials/structured_data/imbalanced_data

In [4]:
import pandas as pd
import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Download the credit-card fraud table and report how imbalanced the classes are.
raw_df = pd.read_csv('https://storage.googleapis.com/download.tensorflow.org/data/creditcard.csv')
neg, pos = np.bincount(raw_df['Class'])
total = neg + pos
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(total, pos, 100 * pos / total))

# Work on a copy so `raw_df` keeps the untouched download.
cleaned_df = raw_df.copy()

# The `Time` column is not used as a feature.
del cleaned_df['Time']

# `Amount` covers a huge range, so replace it with its logarithm.
eps = 0.001  # 0 => 0.1¢ (keeps the log finite for zero amounts)
cleaned_df['Log Ammount'] = np.log(cleaned_df.pop('Amount') + eps)

# Shuffle and split with sklearn: 20% test, then 20% of the remainder for validation.
train_df, test_df = train_test_split(cleaned_df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# Pull the `Class` column out of every split as the label vector ...
train_labels, val_labels, test_labels = [
    np.array(frame.pop('Class')) for frame in (train_df, val_df, test_df)
]
bool_train_labels = train_labels != 0

# ... and keep the remaining columns as the feature matrix.
train_features, val_features, test_features = [
    np.array(frame) for frame in (train_df, val_df, test_df)
]

# Standardise features to zero mean / unit variance.
# The scaler is fit on the training split only, so validation and test
# statistics never leak into the model.
scaler = StandardScaler()
train_features = scaler.fit_transform(train_features)
val_features, test_features = [
    scaler.transform(split) for split in (val_features, test_features)
]

# Limit every standardised feature to the range [-5, 5].
train_features, val_features, test_features = [
    np.clip(split, -5, 5) for split in (train_features, val_features, test_features)
]

print('Training labels shape:', train_labels.shape)
print('Validation labels shape:', val_labels.shape)
print('Test labels shape:', test_labels.shape)

print('Training features shape:', train_features.shape)
print('Validation features shape:', val_features.shape)
print('Test features shape:', test_features.shape)


Examples:
    Total: 284807
    Positive: 492 (0.17% of total)

Training labels shape: (182276,)
Validation labels shape: (45569,)
Test labels shape: (56962,)
Training features shape: (182276, 29)
Validation features shape: (45569, 29)
Test features shape: (56962, 29)


# **Parameters**

In [30]:
# Seed passed to set_all_seeds() before the model is built.
SEED = 123

# tunable parameters
# BATCH_SIZE is shared by the DualSampler and by all three DataLoaders.
BATCH_SIZE = 2048
# lr, epoch_decay, weight_decay and margin are forwarded to the PESG optimizer.
lr = 0.1
epoch_decay = 0.002
weight_decay = 0
margin = 1.0

# sampling parameters
sampling_rate = 0.1 # e.g., with BATCH_SIZE = 2048 this targets about 0.1*2048 = 204 positive samples in each mini-batch

# **Loading Dataset**

In [31]:
class CreditFraudDataset(Dataset):
    """In-memory dataset pairing a feature array with a label array.

    Both arrays are cast to ``float32``. When ``shuffle`` is true the rows
    are permuted once, at construction time, with a fixed seed.

    Args:
        data: NumPy array of features, one sample per row.
        target: NumPy array of labels aligned with ``data``.
        shuffle: permute the samples with seed 123 when true.
    """

    def __init__(self, data, target, shuffle=False):
        order = np.arange(len(data))
        if shuffle:
            # Fixed seed gives the same permutation on every run.
            # Note that this reseeds NumPy's global random generator.
            np.random.seed(123)
            np.random.shuffle(order)
        features = data.astype(np.float32)
        labels = target.astype(np.float32)
        self.data = features[order]  # numpy array
        self.targets = labels[order]  # numpy array

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.data[index], self.targets[index]

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

# Wrap each split in a dataset; only the training split is shuffled.
trainDataset = CreditFraudDataset(train_features, train_labels, shuffle=True)
valDataset = CreditFraudDataset(val_features, val_labels)
testDataset = CreditFraudDataset(test_features, test_labels)

# Training batches are drawn through LibAUC's DualSampler, so the loader's
# own shuffling stays off.
sampler = DualSampler(trainDataset, BATCH_SIZE, sampling_rate=sampling_rate)
trainloader = torch.utils.data.DataLoader(
    trainDataset,
    batch_size=BATCH_SIZE,
    sampler=sampler,
    shuffle=False,
    num_workers=1,
    pin_memory=True,
)

# Validation and test data are read sequentially, in their stored order.
valloader = torch.utils.data.DataLoader(
    valDataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=1,
    pin_memory=False,
)
testloader = torch.utils.data.DataLoader(
    testDataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=1,
    pin_memory=False,
)

# **Creating models & AUC Optimizer**

In [32]:
import torch 
from torch import nn
import torch.nn.functional as F

# Multilayer Perceptron
class MLP(nn.Module):
    """Single-hidden-layer perceptron: Linear -> ReLU -> Dropout(0.5) -> Linear.

    Args:
        input_dim: number of input features.
        hidden_sizes: width of the hidden layer.
        num_classes: number of output units.
    """

    def __init__(self, input_dim=29, hidden_sizes=16, num_classes=1):
        super().__init__()
        self.hidden_sizes = hidden_sizes
        # Creation order is kept (hidden layer, dropout, output layer) so
        # seeded weight initialisation and print(model) stay the same.
        self.layers = nn.Linear(input_dim, hidden_sizes)
        self.dropout = nn.Dropout(p=0.5)
        self.classifer = nn.Linear(hidden_sizes, num_classes)

    def forward(self, x):
        """Return the raw output of the final linear layer for input ``x``."""
        hidden = F.relu(self.layers(x))
        return self.classifer(self.dropout(hidden))
  

In [33]:
# Seed first so the weight initialisation below is repeatable.
set_all_seeds(SEED)
model = MLP(input_dim=29, hidden_sizes=16, num_classes=1).cuda()
print(model)

# AUC-margin loss, optimised with PESG using the hyper-parameters set earlier.
loss_fn = AUCMLoss()
optimizer = PESG(
    model,
    loss_fn=loss_fn,
    lr=lr,
    margin=margin,
    epoch_decay=epoch_decay,
    weight_decay=weight_decay,
)

MLP(
  (layers): Linear(in_features=29, out_features=16, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (classifer): Linear(in_features=16, out_features=1, bias=True)
)


# **Training**

In [34]:
def evaluate(data, model):
    """Compute the ROC AUC of ``model`` over every batch yielded by ``data``.

    The model is switched to eval mode and is left in that mode on return.
    Gradient tracking is disabled for the forward passes, so no autograd
    graph is built during evaluation.

    Args:
        data: iterable of ``(features, targets)`` batches, such as a
            DataLoader. ``targets`` must be CPU tensors because ``.numpy()``
            is called on them directly.
        model: torch module that scores a batch of features.

    Returns:
        The value of ``roc_auc_score`` computed from all targets and all
        predictions concatenated across batches.
    """
    model.eval()

    # Send inputs to the device the model lives on. A model without
    # parameters falls back to CUDA, which is where the inputs were
    # always sent before.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cuda')

    pred = []
    true = []
    with torch.no_grad():
        for features, targets in data:
            y_pred = model(features.to(device))
            pred.append(y_pred.cpu().numpy())
            true.append(targets.numpy())

    true = np.concatenate(true)
    pred = np.concatenate(pred)
    return roc_auc_score(true, pred)

In [35]:
print('Start Training')
print('-' * 30)
for epoch in range(100):
    if epoch in (50, 75):
        # decrease learning rate by 10x & update regularizer
        optimizer.update_regularizer(decay_factor=10)

    # Scores and labels of every training batch seen in this epoch.
    train_pred = []
    train_true = []
    model.train()
    for data, targets in trainloader:
        data = data.cuda()
        targets = targets.cuda()
        # Raw model outputs go straight into the loss (no sigmoid is applied here).
        y_pred = model(data)
        loss = loss_fn(y_pred, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_pred.append(y_pred.detach().cpu().numpy())
        train_true.append(targets.detach().cpu().numpy())

    # Training AUC over the batches collected during this epoch.
    train_true = np.concatenate(train_true)
    train_pred = np.concatenate(train_pred)
    train_auc = roc_auc_score(train_true, train_pred)

    val_auc = evaluate(valloader, model)
    test_auc = evaluate(testloader, model)

    # print results (train_loss is the loss of the epoch's last mini-batch)
    print(
        "epoch: {}, train_loss: {:4f}, train_auc:{:4f}, val_auc:{:4f}, test_auc:{:4f},  lr:{:4f}".format(
            epoch, loss.item(), train_auc, val_auc, test_auc, optimizer.lr
        )
    )

Start Training
------------------------------
epoch: 0, train_loss: 0.027446, train_auc:0.871964, val_auc:0.946429, test_auc:0.940484,  lr:0.100000
epoch: 1, train_loss: 0.027886, train_auc:0.943964, val_auc:0.957875, test_auc:0.948275,  lr:0.100000
epoch: 2, train_loss: 0.022244, train_auc:0.954968, val_auc:0.964605, test_auc:0.955106,  lr:0.100000
epoch: 3, train_loss: 0.023176, train_auc:0.960363, val_auc:0.968141, test_auc:0.961279,  lr:0.100000
epoch: 4, train_loss: 0.021818, train_auc:0.963592, val_auc:0.970230, test_auc:0.965589,  lr:0.100000
epoch: 5, train_loss: 0.022460, train_auc:0.967376, val_auc:0.971485, test_auc:0.969102,  lr:0.100000
epoch: 6, train_loss: 0.020066, train_auc:0.966668, val_auc:0.972433, test_auc:0.972133,  lr:0.100000
epoch: 7, train_loss: 0.019405, train_auc:0.968375, val_auc:0.973234, test_auc:0.974988,  lr:0.100000
epoch: 8, train_loss: 0.018477, train_auc:0.970607, val_auc:0.973970, test_auc:0.977620,  lr:0.100000
epoch: 9, train_loss: 0.019867, trai