In [53]:
import os

import pandas as pd 
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder

from syft.frameworks.torch.dp import pate


import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
from torch.utils.data.dataset import random_split

In [54]:
# Default every figure in this notebook to a 12 x 10 inch canvas.
mpl.rcParams['figure.figsize'] = (12, 10)
# Colours of the default property cycle, kept for reuse when plotting.
colors = mpl.rcParams['axes.prop_cycle'].by_key()['color']

In [55]:
# Directory holding the Home Credit CSV files.
DATA_PATH = "/ssd003/projects/pets/datasets/home_credit"

# Keep the full frame (features + "target") under its own name: the EDA cells
# further down inspect `data_df`, which must exist on a fresh kernel.
data_df = pd.read_csv(f"{DATA_PATH}/train.csv")

# Derive features and labels without mutating `data_df`, so this cell can be
# re-run and the frame shown by the EDA cells never goes stale.
labels = data_df["target"]
data = data_df.drop(columns="target")

In [56]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

TRAIN_PERC = 0.9        # fraction of the dataset used for training (rest is validation)
BATCH_SIZE = 1024       # batch size shared by all data loaders
NUM_TEACHERS = 15       # number of teacher models in the ensemble
TEACHER_EPOCHS = 20     # presumably training epochs per teacher -- confirm at use site
STUDENT_EPOCHS = 20     # presumably training epochs for the student -- confirm at use site

In [57]:
class HomeCredit(Dataset):
    """Map-style dataset pairing each feature row with its label."""

    def __init__(self, data, labels):
        """
        Store the feature and label containers.

        Args:
            data (Numpy Array) : Features; indexing with a record index must
                yield an object that supports ``.astype``.
            labels (Numpy Array) : Labels, indexed with the same record index
                as ``data`` (``__getitem__`` always indexes it).
        """
        self.data = data
        self.labels = labels

    def __len__(self):
        """Number of records, taken from the feature container."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for record ``idx``; features are cast to float32."""
        return self.data[idx].astype(np.float32), self.labels[idx]

In [58]:
def get_loaders(data, num_teachers, batch_size):
    """
    Split a dataset into contiguous, equally sized shards and wrap each in a
    DataLoader: one shard for the student and one per teacher.

    The data is cut into ``num_teachers + 1`` shards of
    ``len(data) // (num_teachers + 1)`` records each (any remainder at the end
    is left unused). Shard 0 goes to the student, shards 1..num_teachers go to
    the teachers, so exactly ``num_teachers`` teacher loaders are returned.

    :param data: Indexable dataset (anything ``torch.utils.data.Subset`` accepts)
    :param num_teachers: Number of teacher models
    :param batch_size: Batch size for the dataloaders

    :return: ``(teacher_loaders, student_loader)`` -- a list with
             ``num_teachers`` loaders and the student loader (with actual labels)
    """
    # One extra shard is reserved for the student.
    num_shards = num_teachers + 1
    sample_size = len(data) // num_shards

    loaders = []
    # Build a loader for *every* shard; iterating only `num_teachers` times
    # would leave the last shard unused and return one teacher too few.
    for i in range(num_shards):
        indices = list(range(i * sample_size, (i + 1) * sample_size))
        loaders.append(DataLoader(Subset(data, indices), batch_size=batch_size))

    return loaders[1:], loaders[0]

In [59]:
def student_loader(student_train_loader, labels):
    """
    Re-label the student batches with labels produced by the teachers.

    Iterates over ``student_train_loader``, discards the actual labels and
    pairs each batch of features with the matching slice of ``labels``.

    :param student_train_loader: Iterable of ``(features, actual_labels)``
        batches. Batches must come in the same order as ``labels``
        (i.e. the loader must not shuffle).
    :param labels: Numpy array of teacher labels, one per record, in loader order

    :return: Generator yielding ``(features, teacher_labels)`` per batch
    """
    # Running position in `labels`. Deriving the offset from the current batch
    # length (i * len(batch)) is wrong whenever a batch is shorter than the
    # others, e.g. the final partial batch.
    offset = 0
    for data, _ in student_train_loader:
        batch_len = len(data)
        # Use teacher labels for this batch (actual labels are discarded).
        yield data, torch.from_numpy(labels[offset:offset + batch_len])
        offset += batch_len

In [60]:
# EDA
# Report the number of columns, then the number of rows, of data_df
print(data_df.shape[1], data_df.shape[0])

105 307511


In [61]:
# Preview the first five rows of data_df
data_df.head()

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,target
0,-0.577538,0.142129,-0.478095,-0.166149,-0.507465,-0.149452,1.50688,-0.456215,0.379837,0.579154,...,-0.024402,-0.022529,-0.018305,-0.08210023,-0.067957,-0.1805048,-0.313873,-0.3594746,-0.5176655,1
1,-0.577538,0.426792,1.72545,0.592677,1.600698,-1.25275,-0.166821,-0.460115,1.078697,1.790855,...,-0.024402,-0.022529,-0.018305,-0.08210023,-0.067957,-0.1805048,-0.313873,-0.3594746,-1.092866,0
2,-0.577538,-0.427196,-1.152888,-1.404676,-1.092389,-0.783451,-0.689509,-0.453299,0.206116,0.306869,...,-0.024402,-0.022529,-0.018305,-0.08210023,-0.067957,-0.1805048,-0.313873,-0.3594746,-1.092866,0
3,-0.577538,-0.142533,-0.71143,0.177869,-0.653696,-0.928991,-0.680114,-0.473217,-1.375829,0.369143,...,-0.024402,-0.022529,-0.018305,3.3367200000000005e-17,0.0,-3.6450320000000003e-17,0.0,-7.516682e-17,-3.831603e-16,0
4,-0.577538,-0.199466,-0.213734,-0.361755,-0.068772,0.56357,-0.892535,-0.47321,0.191639,-0.307263,...,-0.024402,-0.022529,-0.018305,-0.08210023,-0.067957,-0.1805048,-0.313873,-0.3594746,-1.092866,0


In [62]:
# Label balance: percentage of rows with label 1, followed by the raw counts
label_bd = labels.value_counts()
print(label_bd[1] / label_bd.sum() * 100, "%")
label_bd

8.072881945686495 %


0    282686
1     24825
Name: target, dtype: int64

In [63]:
# Convert features and labels to plain NumPy arrays.
# - np.asarray accepts both the pandas objects and already-converted arrays,
#   so re-running this cell does not fail on the reassigned names.
# - np.int64 replaces the `np.int` alias, which was removed in NumPy 1.24 and
#   raises AttributeError there.
data = np.asarray(data, dtype=np.float32)
labels = np.asarray(labels, dtype=np.int64)
dataset = HomeCredit(data=data, labels=labels)

# Get train and validation size
train_size = int(len(dataset) * TRAIN_PERC)
val_size = len(dataset) - train_size
# NOTE(review): random_split is unseeded here, so the split differs between runs.
train_data, val_data = random_split(dataset, [train_size, val_size])

# Define dataloaders
t_loaders, s_loader = get_loaders(train_data, NUM_TEACHERS, BATCH_SIZE) # Teacher loaders, student loader
# drop_last=True discards the final partial validation batch.
val_loader = DataLoader(val_data, batch_size=BATCH_SIZE, drop_last=True) # Loader to validate in Train Ensemble and Train Student Model

In [77]:
# Sanity check: display the repr of the student DataLoader created above.
s_loader

<torch.utils.data.dataloader.DataLoader at 0x7f1e6db54d30>

## MODELS


In [64]:
class HCModel(torch.nn.Module):
    """
    Small fully connected network producing two softmax outputs per record.

    Attributes
    ----------
    feat_dim:
        Number of input features per record
    layers:
        Sequential stack Linear(feat_dim, 32) -> ReLU -> Linear(32, 16) -> ReLU
        -> Linear(16, 2) -> Softmax over dim 1

    Methods
    -------
    forward(feat):
        Runs a batch of features through ``layers``
    """

    def __init__(self, feat_dim):
        super().__init__()
        self.feat_dim = feat_dim

        # Modules are created in network order, input to output.
        stack = [
            nn.Linear(feat_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 2),
            nn.Softmax(dim=1),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, feat):
        """Return the softmax output of the network for the batch ``feat``."""
        return self.layers(feat)

In [66]:
# Initialize models and optimizers for the teacher ensemble:
# one HCModel per teacher, each paired with its own Adam optimizer.
models = [HCModel(feat_dim=data.shape[1]) for _ in range(NUM_TEACHERS)]
opts = [
    torch.optim.Adam(teacher.parameters(), betas=(0.9, 0.999), lr=.001)
    for teacher in models
]