In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Restrict minor warnings
import warnings
warnings.filterwarnings('ignore')

# to display all outputs of one cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.options.display.max_columns = 100

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer as CTT
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from imblearn.combine import SMOTETomek

import torch
import torch.nn as nn
import tensorflow as tf
from torch.utils.data import Dataset, DataLoader

import syft as sy
from uuid import UUID
from uuid import uuid4



In [8]:
#!pip install src

In [9]:
from src.psi.util import Client, Server
from src.utils import add_ids
from src.utils.data_utils import id_collate_fn

ModuleNotFoundError: No module named 'src'

In [28]:
def get_classes_weights(class1_size, class2_size):
    if class1_size < class2_size:
        return [class2_size / class1_size, 1]
        factor2 = 1
    else:
        return [1, class1_size / class2_size]


In [29]:
class VerticalDataset(Dataset):
    """Dataset for Vertical Federated Learning"""

    def __init__(self, ids, data, labels=None):
        """
        Args:
            ids (Numpy Array) : Numpy Array with UUIDS
            data (Numpy Array) : Numpy Array with Features
            targets (Numpy Array) : Numpy Array with Labels. None if not available. 
        """
        self.ids = ids
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, index):
        """Return record single record"""
        feature = self.data[index].astype(np.float32)

        if self.labels is None:
            label  = None
        else:
            label = int(self.labels[index]) if self.labels is not None else None

        id = self.ids[index]

        # Return a tuple of non-None elements
        return (*filter(lambda x: x is not None, (feature, label, id)),)
    
    def get_ids(self):
        """Return a list of the ids of this dataset."""
        return [str(id_) for id_ in self.ids]
    
    def sort_by_ids(self):
        """
        Sort the dataset by IDs in ascending order
        """
        ids = self.get_ids()
        sorted_idxs = np.argsort(ids)


        self.data = self.data[sorted_idxs]

        if self.labels is not None:
            self.labels = self.labels[sorted_idxs]

        self.ids = self.ids[sorted_idxs]


In [30]:
class SinglePartitionDataLoader(DataLoader):
    """DataLoader for a single vertically-partitioned dataset"""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.collate_fn = id_collate_fn


In [31]:
class VerticalDataLoader:
    """Dataloader which batches data from a complete
    set of vertically-partitioned datasets
    i.e. the images dataset AND the labels dataset
    """

    def __init__(self, data1, data2, *args, **kwargs):

        self.dataloader1 = SinglePartitionDataLoader(
            data1, *args, **kwargs
        )
        self.dataloader2 = SinglePartitionDataLoader(
            data2, *args, **kwargs
        )

    def __iter__(self):
        """
        Zip Dataloaders 
        """
        return zip(self.dataloader1, self.dataloader2)

    def __len__(self):
        """
        Return length of dataset
        """
        return (len(self.dataloader1) + len(self.dataloader2)) // 2

    def drop_non_intersecting(self, intersection):
        """Remove elements and ids in the datasets that are not in the intersection."""
        self.dataloader1.dataset.data = self.dataloader1.dataset.data[intersection]
        self.dataloader1.dataset.ids = self.dataloader1.dataset.ids[intersection]

        self.dataloader1.dataset.labels = self.dataloader1.dataset.labels[intersection]
        self.dataloader2.dataset.ids = self.dataloader2.dataset.ids[intersection]

    def sort_by_ids(self) -> None:
        """
        Sort each dataset by ids
        """
        self.dataloader1.dataset.sort_by_ids()
        self.dataloader2.dataset.sort_by_ids()

In [32]:
class CaravanModel(torch.nn.Module):
    """ 
    Model for the caravan dataset
    
    Attributes
    ----------
    dim: 
        Dimensionality of Caravan Data (in-house + vendor)
    Methods
    -------
    forward(x):
        Performs a forward pass through the Caravan Model
    """
    def __init__(self, house_dim, vendor_dim): 
        super(CaravanModel, self).__init__()
        self.fused_input_dim = house_dim + vendor_dim
        self.layers = nn.Sequential(
            nn.Linear(self.fused_input_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 1),
            nn.Sigmoid()
        )
    
    def forward(self, house_feat, vendor_feat):
        feat = torch.cat([house_feat, vendor_feat], dim=1)
        pred = self.layers(feat)
        return pred

In [33]:
class VendorModel(torch.nn.Module):
    """ 
    Model for Vendor variables
    
    Attributes
    ----------
    dim: 
        Dimensionality of the vendor data
    Methods
    -------
    forward(x):
        Performs a forward pass through the Credit Bureau Model
    """
    
    def __init__(self, cb_dim): 
        super(VendorModel, self).__init__()
        self.cb_dim = cb_dim
        self.layers = torch.nn.Sequential(
            nn.Linear(self.cb_dim, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, 4),
            nn.Sigmoid()
        )
    
    def forward(self, cb_feat):
        pred = self.layers(cb_feat)
        return pred

In [34]:
class SplitNN:
    """
    A class representing SplitNN

    Attributes
    ----------
    caravan_model:  
        Model for Caravan Neural Network Module

    vendor_model:   
        Vendor Neural Network Module

    caravan_opt:  
        Optimizer for the Caravan Neural Network Module

    vendor_opt:   
        Optimizer for the Vendor Neural Network Module

    data: 
        A list storing intermediate computations at each index

    remote_tensors: 
        A list storing intermediate computations at each index (Computation from each model detached from global computation graph)

    Methods
    -------
    forward(x):
        Performs a forward pass through the SplitNN

    backward(): 
        Performs a backward pass through the SplitNN

    zero_grads():
        Zeros the gradients of all networks in SplitNN

    step():
        Updates the parameters of all networks in SplitNN
    """


    def __init__(self, caravan_model, vendor_model, caravan_opt, vendor_opt):
        self.caravan_model = caravan_model
        self.vendor_model = vendor_model
        self.caravan_opt = caravan_opt
        self.vendor_opt = vendor_opt
        self.data = []
        self.remote_tensors = []

    def forward(self, hc_x, cb_x):
        """
        Parameters
        ----------
        x:  
            Input Sample 
        """

        data = []
        remote_tensors = []

        # Forward pass through first model
        data.append(self.cb_model(cb_x))

        # if location of data is the same as location of the subsequent model
        if data[-1].location == self.hc_model.location:
            # store computation in remote tensor array 
            # Gradients will be only computed backward upto the point of detachment
            remote_tensors.append(data[-1].detach().requires_grad_())
        else:
            # else move data to location of subsequent model and store computation in remote tensor array 
            # Gradients will be only computed backward upto the point of detachment
            remote_tensors.append(
                data[-1].detach().move(self.hc_model.location).requires_grad_()
            )

        # Get and return final output of model
        data.append(self.hc_model(hc_x, remote_tensors[-1]))

        self.data = data 
        self.remote_tensors = remote_tensors
        return data[-1]

    def backward(self):
        # if location of data is the same as detatched data 
        if self.remote_tensors[0].location == self.data[0].location:
            # Store gradients from remote_tensor 
            grads = self.remote_tensors[0].grad.copy()
        else:
            # Move gradients to lovation of Store grad
            grads = self.remote_tensors[0].grad.copy().move(self.data[0].location)

        self.data[0].backward(grads)

    def zero_grads(self):
        """
        Parameters
        ----------
        """
        self.cb_opt.zero_grad()
        self.hc_opt.zero_grad()


    def step(self):
        """
        Parameters
        ----------
        """
        self.cb_opt.step()
        self.hc_opt.step()



## Create datasets

In [19]:
fn = "/ssd003/projects/pets/datasets/caravan-insurance-challenge.csv"
df = pd.read_csv(fn)

In [20]:
df.shape

(9822, 87)

In [21]:
train = df[df['ORIGIN']=='train']
val = df[df['ORIGIN']=='test']

_ = train.pop('ORIGIN')
_ = val.pop('ORIGIN')

X_train = train
X_val = val
y_train = train.pop('CARAVAN')
y_val = val.pop('CARAVAN')

In [45]:
# get numerical columns
cat_cols = ['MOSTYPE','MOSHOOFD']
num_cols = list(X_train.columns.values[43:])

In [23]:
# convert dataframe to numpy arrays
# create 2 datasets: one categorial, one numerical
X_train_num = np.array(X_train[num_cols])
X_train_cat = np.array(X_train[cat_cols])
y_train = np.array(y_train)

X_val_num = np.array(X_val[cat_cols])
X_val_cat = np.array(X_val[cat_cols])
y_val = np.array(y_val)


In [24]:
X_train_num.shape, X_train_cat.shape, y_train.shape

((5822, 42), (5822, 2), (5822,))

In [25]:
X_val_num.shape, X_val_cat.shape, y_val.shape

((4000, 2), (4000, 2), (4000,))

In [26]:
# Get UID Column
uuids = np.array([uuid4() for _ in range(len(X_train))])
uuids_val = np.array([uuid4() for _ in range(len(X_val))])

In [35]:
# create VerticalDatasets
num_dataset = VerticalDataset(ids=uuids, data=X_train_num, labels=y_train)
cat_dataset = VerticalDataset(ids=uuids, data=X_train_cat, labels=y_train)

num_dataset_val = VerticalDataset(ids=uuids_val, data=X_val_num, labels=y_val)
cat_dataset_val = VerticalDataset(ids=uuids_val, data=X_val_cat, labels=y_val)

In [36]:
## Initialize Train Dataloader 
dataloader = VerticalDataLoader(num_dataset, cat_dataset, batch_size=512)

# Compute private set intersection
client_items = dataloader.dataloader1.dataset.get_ids()
server_items = dataloader.dataloader2.dataset.get_ids()
 
client = Client(client_items)
server = Server(server_items)

setup, response = server.process_request(client.request, len(client_items))
intersection = client.compute_intersection(setup, response)

# Order data
dataloader.drop_non_intersecting(intersection)
dataloader.sort_by_ids()

NameError: name 'id_collate_fn' is not defined

In [37]:
## Initialize Validation Dataloader 
val_dataloader = VerticalDataLoader(num_dataset_val, cat_dataset_val, batch_size=512)

# Compute private set intersection
val_client_items = val_dataloader.dataloader1.dataset.get_ids()
val_server_items = val_dataloader.dataloader2.dataset.get_ids()

val_client = Client(val_client_items)
val_server = Server(val_server_items)

val_setup, val_response = val_server.process_request(val_client.request, len(val_client_items))
val_intersection = val_client.compute_intersection(val_setup, val_response)

# Order data
val_dataloader.drop_non_intersecting(val_intersection)
val_dataloader.sort_by_ids()

NameError: name 'id_collate_fn' is not defined

## Training

In [38]:
# device to train on
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [42]:
# check relative weights of classes
class1_size = df[df['CARAVAN'] == 1].shape[0]
class0_size = df[df['CARAVAN'] == 0].shape[0]
print(class1_size, class0_size)
weights = get_classes_weights(class1_size, class0_size)
print(weights)
weights = torch.tensor(weights).to(device)
#criterion = nn.?(weight=weights)

586 9236
[15.761092150170649, 1]


In [48]:
# Training globals 
epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Iniatialize Caravan numerical Model and Optimizer
caravan_model = CaravanModel(len(num_cols), len(cat_cols))
caravan_opt = torch.optim.Adam(caravan_model.parameters(), lr=.001,  betas=(0.9, 0.999))

# Iniatialize Credit Bureau Model and Optmizer
vendor_model = VendorModel(len(cat_cols))
vendor_opt = torch.optim.Adam(vendor_model.parameters(), lr=.001,  betas=(0.9, 0.999))

# Define Split Neural Network
splitNN = SplitNN(caravan_model, vendor_model, caravan_opt, vendor_opt)
criterion = torch.nn.BCELoss(weight=weights)