In [26]:
import os

import pandas as pd 
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder

from syft.frameworks.torch.dp import pate


import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
from torch.utils.data.dataset import random_split

In [27]:
mpl.rcParams['figure.figsize'] = (12, 10)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

In [39]:
DATA_PATH = "/ssd003/projects/pets/datasets/home_credit"
data = pd.read_csv(f"{DATA_PATH}/train.csv")
labels = data.pop("target")
data = data.to_numpy(dtype=np.float32)
labels = labels.to_numpy(dtype=np.int)
dataset = HomeCredit(data=data, labels=labels)

# Get train and validation size
train_size = int(len(dataset) * TRAIN_PERC)
val_size = len(dataset) - train_size
train_data, val_data = random_split(dataset, [train_size, val_size])

# Define dataloaders
t_loaders, s_loader = get_loaders(train_data, NUM_TEACHERS, BATCH_SIZE) # Teacher loaders, student loader
val_loader = DataLoader(val_data, batch_size=BATCH_SIZE, drop_last=True) # Loader to validate in Train Ensemble and Train Student Model

In [40]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
TRAIN_PERC = .9
BATCH_SIZE = 1024
NUM_TEACHERS = 15
TEACHER_EPOCHS = 20
STUDENT_EPOCHS = 20

In [41]:
class HomeCredit(Dataset): 
    """Dataset for Vertical Federated Learning"""

    def __init__(self, data, labels):
        """
        Args:
            data (Numpy Array) : Numpy Array with Features
            labels (Numpy Array) : Numpy Array with Labels. None if not available. 
        """
        self.data = data
        self.labels = labels
    
    def __getitem__(self, idx):
        """Return record single record"""
        features = self.data[idx].astype(np.float32)
        label = self.labels[idx]

        return features, label
    
    def __len__(self):
        """Return Length"""
        return len(self.data)

In [42]:
def get_loaders(data, num_teachers, batch_size):
    """ 
    Function to create data loaders for the Teacher Class.
    
    :param data: Numpy Array of the data 
    :param num_teacher: Number of teacher models 
    :param batch_size: Batch size for the dataloaders

    :return: Return teacher loaders and student loader (with actual labels)
    """ 
    loaders = []
    sample_size = len(data) // (num_teachers + 1)

    for i in range(num_teachers):
        indices = list(range(i*sample_size, (i+1)*sample_size))
        subset_data = Subset(data, indices)
        loader = DataLoader(subset_data, batch_size=batch_size)
        loaders.append(loader)
    
    return loaders[1:], loaders[0]

In [43]:
def student_loader(student_train_loader, labels):
    """ 
    Function to modify the student loader to include labels from teacher
    
    :param student_train_loader: The student loader with actual labels 
    :param labels: Labels from the teacher model

    :return: Return iterator  
    """ 
    # Use teacher to label data (discard actual labels)
    for i, (data, _) in enumerate(iter(student_train_loader)):
        yield data, torch.from_numpy(labels[i*len(data): (i+1)*len(data)])

In [44]:
#EDA
# Load Data and Labels
print(len(data_df.columns), len(data_df))

104 307511


In [45]:
data_df.head(5)

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,-0.577538,0.142129,-0.478095,-0.166149,-0.507465,-0.149452,1.50688,-0.456215,0.379837,0.579154,...,-0.090534,-0.024402,-0.022529,-0.018305,-0.08210023,-0.067957,-0.1805048,-0.313873,-0.3594746,-0.5176655
1,-0.577538,0.426792,1.72545,0.592677,1.600698,-1.25275,-0.166821,-0.460115,1.078697,1.790855,...,-0.090534,-0.024402,-0.022529,-0.018305,-0.08210023,-0.067957,-0.1805048,-0.313873,-0.3594746,-1.092866
2,-0.577538,-0.427196,-1.152888,-1.404676,-1.092389,-0.783451,-0.689509,-0.453299,0.206116,0.306869,...,-0.090534,-0.024402,-0.022529,-0.018305,-0.08210023,-0.067957,-0.1805048,-0.313873,-0.3594746,-1.092866
3,-0.577538,-0.142533,-0.71143,0.177869,-0.653696,-0.928991,-0.680114,-0.473217,-1.375829,0.369143,...,-0.090534,-0.024402,-0.022529,-0.018305,3.3367200000000005e-17,0.0,-3.6450320000000003e-17,0.0,-7.516682e-17,-3.831603e-16
4,-0.577538,-0.199466,-0.213734,-0.361755,-0.068772,0.56357,-0.892535,-0.47321,0.191639,-0.307263,...,-0.090534,-0.024402,-0.022529,-0.018305,-0.08210023,-0.067957,-0.1805048,-0.313873,-0.3594746,-1.092866


In [46]:
# lable balance 
label_bd = labels.value_counts()
print(label_bd[1]/sum(label_bd) * 100,"%")
label_bd

AttributeError: 'numpy.ndarray' object has no attribute 'value_counts'

In [37]:
# Form np arrays of labels and features.
# train_labels = np.array(train_data.pop('target'))
# val_labels = np.array(val_data.pop('target'))

train_features = np.array(train_df)
val_features = np.array(val_df)


NameError: name 'train_df' is not defined

In [38]:
scaler = StandardScaler()
train_features = scaler.fit_transform(train_features)

val_features = scaler.transform(val_features)

train_features = np.clip(train_features, -5, 5)
val_features = np.clip(val_features, -5, 5)


NameError: name 'StandardScaler' is not defined

In [None]:
from imblearn.over_sampling import SMOTE
oversample = SMOTE()

X_train_oversampled, y_oversampled = oversample.fit_resample(train_features, train_labels)

## MODELS


In [None]:
class HCModel(torch.nn.Module):
    """ 
    Model for Credit Bureau
    
    Attributes
    ----------
    feat_dim: 
        Dimensionality of Data
    Methods
    -------
    forward(x):
        Performs a forward pass through the Credit Bureau Model
    """
    def __init__(self, feat_dim): 
        super(HCModel, self).__init__()
        self.feat_dim = feat_dim
        self.layers = nn.Sequential(
            nn.Linear(self.feat_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 2),
            nn.Softmax(dim=1)
        )
    
    def forward(self, feat):
        pred = self.layers(feat)
        return pred

In [None]:
# Initialize models and otptimizers for teacher ensembles
models = [HCModel(feat_dim=data.shape[1]) for i in range(NUM_TEACHERS)]
opts = [torch.optim.Adam(model.parameters(), lr=.001,  betas=(0.9, 0.999)) for model in models]

In [None]:
def train_models(num_teachers, models, opts, train_loaders, val_loader):
    """ 
    Train the teacher models on the the training data and assess on validation set
    
    :param num_teacher: Number of teacher models 
    :param models A list of teacher models 
    :param opts A list of optimizers
    :param train_loaders A list of train data loaders 
    :param val_loader A validation loader

    :return: Return A list of train and validation losses for each epoch
    """ 
    train_losses = [[] for i in range(num_teachers)]
    val_losses = [[] for i in range(num_teachers)]
    for epoch in range(TEACHER_EPOCHS):
        train_loss = train_step(models, opts, train_loaders)
        val_loss = val_step(models, val_loader)
        avg_train_loss = sum(train_loss) / len(train_loss)
        avg_val_loss = sum(val_loss) / len(val_loss)
        print(f"Epoch: {str(epoch)}\t AVG Train Loss: {str(avg_train_loss)}\t AVG Val Loss: {str(avg_val_loss)}")


        for i in range(num_teachers):
            train_losses[i].append(train_loss[i])
            val_losses[i].append(val_loss[i])

    return train_losses, val_losses

In [None]:
def train_step(models, opts, train_loaders):
    """ 
    Train teacher ensembles for a single epoch
    
    :param models A list of teacher models 
    :param opts A list of optimizers
    :param train_loaders A list of train data loaders 

    :return: Return A list of train and validation losses for each teacher for each epoch
    """ 
    train_running_losses = [0 for i in range(len(models))]
    for i, (model, opt, loader) in enumerate(zip(models, opts, train_loaders)):
            model = model.to(DEVICE)
            for feat, lbl in loader:
                feat, lbl = feat.to(DEVICE), lbl.to(DEVICE)
                model.zero_grad()
                out = model(feat)
                criterion = nn.CrossEntropyLoss()
                loss = criterion(out, lbl)

                loss.backward()
                opt.step()
                train_running_losses[i] += loss.detach().cpu().item()
    
    return train_running_losses

In [None]:
def val_step(models, loader):
    """ 
    Validation teacher ensembles for a single epoch
    
    :param models A list of teacher models 
    :param loader Validation dataloader

    :return: A list of validation losses
    """ 
    val_loss = []
    for i, model in enumerate(models):
        outputs, labels, _ = predict(model, loader)
        criterion = nn.CrossEntropyLoss()
        loss = criterion(outputs, labels)
        val_loss.append(loss.cpu().item())

    return val_loss

In [None]:
def predict(model, loader):
    """ 
    Get predictions of single model on loader
    
    :param model A teacher model 
    :param loader A dataloader

    :return: output of the model, labels, index of predicted class
    """ 
    preds = torch.zeros(0, dtype=torch.long).to(DEVICE)
    labels = torch.zeros(0, dtype=torch.long).to(DEVICE)
    outputs = []
    model.to(DEVICE)
    model.eval()

    for feat, lbl in loader:
        feat, lbl = feat.to(DEVICE), lbl.to(DEVICE)
        with torch.no_grad():
            output = model(feat)
        outputs.append(output)
        ps = torch.argmax(torch.exp(output), dim=1)
        preds = torch.cat((preds, ps))
        labels = torch.cat((labels, lbl))
    outputs = torch.cat(outputs, dim=0)

    return outputs, labels, preds

In [None]:
def agg_teacher(models, loader, epsilon):
    """ 
    Get noisily aggregated prediction of teacher ensemble 
    
    :param models A list of teacher models 
    :param loader A dataloader
    :param epsilon A noise parameter for the laplace 

    :return: index of predictions, actual labels
    """ 
    preds = []
    for i, model in enumerate(models):
        _, _, pred = predict(model, loader)
        preds.append(pred.cpu().numpy())
    preds = np.stack(preds)

    labels = np.array([]).astype(int)
    for pred in np.transpose(preds):
        label_counts = np.bincount(pred, minlength=2)
        beta = 1 / epsilon

        for i in range(len(label_counts)):
            noise = np.random.laplace(0, beta, 1)
            label_counts[i] += noise

        new_label = np.argmax(label_counts)
        labels = np.append(labels, new_label)

    return preds, labels

In [None]:
def train_student(model, opt, train_loader, val_loader, epochs):
    """ 
    Train student on public dataset labelled with teach ensemble
    
    :param model Student model
    :param opt Optimizer for hte student model 
    :param train_loader Train dataloader
    :param val_loader Validation dataloader
    :epochs The number of epochs

    :return: list of train losses, list of validation losses, list of validation aucs
    """ 
    train_losses = []
    val_losses = []
    val_aucs = []

    for epoch in range(epochs):
        train_loss = train_student_step(model, opt, train_loader)
        val_loss, val_auc = val_student_step(model, val_loader)
        print(f"{str(epoch)}\t AVG Train Loss: {str(train_loss)}\t AVG Val Loss: {str(val_loss)} \t AVG AUC: {val_auc}")
 
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        val_aucs.append(val_auc)
        
    return train_losses, val_losses, val_aucs 


In [None]:
def train_student_step(model, opt, train_loader):
    """ 
    Train step on student model
    
    :param model Student model
    :param opt Optimizer for hte student model 
    :param train_loader Train dataloader


    :return: running loss from the step 
    """
    running_loss = 0
    model.to(DEVICE)
    model.train()
    for i, (feat, lbl) in enumerate(train_loader):
        feat, lbl = feat.to(DEVICE), lbl.to(DEVICE)
        model.zero_grad()
        out = model(feat)
        criterion = nn.CrossEntropyLoss()
        loss = criterion(out, lbl)

        loss.backward()
        opt.step()

        running_loss += loss.detach().cpu().item()
    
    return running_loss

In [None]:
def val_student_step(model, val_loader):
    """ 
    Validation step on student model
    
    :param model Student model
    :param val_loader A validation load


    :return: Average loss , Average auc
    """
    losses = []
    aucs = []
    model.to(DEVICE)
    model.eval()

    for i, (feat, lbl) in enumerate(val_loader):
        feat, lbl = feat.to(DEVICE), lbl.to(DEVICE)
        with torch.no_grad():
            out = model(feat)

        criterion = nn.CrossEntropyLoss()
        loss = criterion(out, lbl)
        auc = roc_auc_score(lbl.cpu().numpy(), out.cpu().numpy()[:, 1])

        losses.append(loss.cpu().item())
        aucs.append(auc)

    avg_auc = sum(aucs) / len(aucs)
    avg_loss = sum(losses) / len(losses)
    return avg_loss, avg_auc

In [None]:
train_losses, val_losses = train_models(NUM_TEACHERS, models, opts, t_loaders, val_loader)

In [None]:
student_model = HCModel(feat_dim=104)
optimizer = torch.optim.Adam(student_model.parameters(), lr=0.003)

train_losses, val_losses, val_aucs  = train_student(student_model, optimizer, s_loader, val_loader, STUDENT_EPOCHS)

In [None]:
f, axarr = plt.subplots(1, 1, figsize=(10, 10))
axarr.plot(val_aucs)