In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Restrict minor warnings
import warnings
warnings.filterwarnings('ignore')

# to display all outputs of one cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.options.display.max_columns = 100

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer as CTT
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from imblearn.combine import SMOTETomek

import torch
import torch.nn as nn
import tensorflow as tf
from torch.utils.data import Dataset, DataLoader

import syft as sy
from uuid import UUID
from uuid import uuid4



In [64]:
from src.psi.util import Client, Server
from src.utils import add_ids
from src.utils.data_utils import id_collate_fn

ModuleNotFoundError: No module named 'src'

In [41]:
def get_classes_weights(class1_size, class2_size):
    if class1_size < class2_size:
        return [class2_size / class1_size, 1]
        factor2 = 1
    else:
        return [1, class1_size / class2_size]


In [59]:
class VerticalDataset(Dataset):
    """Dataset for Vertical Federated Learning"""

    def __init__(self, ids, data, labels=None):
        """
        Args:
            ids (Numpy Array) : Numpy Array with UUIDS
            data (Numpy Array) : Numpy Array with Features
            targets (Numpy Array) : Numpy Array with Labels. None if not available. 
        """
        self.ids = ids
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, index):
        """Return record single record"""
        feature = self.data[index].astype(np.float32)

        if self.labels is None:
            label  = None
        else:
            label = int(self.labels[index]) if self.labels is not None else None

        id = self.ids[index]

        # Return a tuple of non-None elements
        return (*filter(lambda x: x is not None, (feature, label, id)),)
    
    def get_ids(self):
        """Return a list of the ids of this dataset."""
        return [str(id_) for id_ in self.ids]
    
    def sort_by_ids(self):
        """
        Sort the dataset by IDs in ascending order
        """
        ids = self.get_ids()
        sorted_idxs = np.argsort(ids)


        self.data = self.data[sorted_idxs]

        if self.labels is not None:
            self.labels = self.labels[sorted_idxs]

        self.ids = self.ids[sorted_idxs]


In [60]:
class SinglePartitionDataLoader(DataLoader):
    """DataLoader for a single vertically-partitioned dataset"""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.collate_fn = id_collate_fn


In [61]:
class VerticalDataLoader:
    """Dataloader which batches data from a complete
    set of vertically-partitioned datasets
    i.e. the images dataset AND the labels dataset
    """

    def __init__(self, data1, data2, *args, **kwargs):

        self.dataloader1 = SinglePartitionDataLoader(
            data1, *args, **kwargs
        )
        self.dataloader2 = SinglePartitionDataLoader(
            data2, *args, **kwargs
        )

    def __iter__(self):
        """
        Zip Dataloaders 
        """
        return zip(self.dataloader1, self.dataloader2)

    def __len__(self):
        """
        Return length of dataset
        """
        return (len(self.dataloader1) + len(self.dataloader2)) // 2

    def drop_non_intersecting(self, intersection):
        """Remove elements and ids in the datasets that are not in the intersection."""
        self.dataloader1.dataset.data = self.dataloader1.dataset.data[intersection]
        self.dataloader1.dataset.ids = self.dataloader1.dataset.ids[intersection]

        self.dataloader1.dataset.labels = self.dataloader1.dataset.labels[intersection]
        self.dataloader2.dataset.ids = self.dataloader2.dataset.ids[intersection]

    def sort_by_ids(self) -> None:
        """
        Sort each dataset by ids
        """
        self.dataloader1.dataset.sort_by_ids()
        self.dataloader2.dataset.sort_by_ids()

## Create datasets

In [2]:
fn = "/ssd003/projects/pets/datasets/caravan-insurance-challenge.csv"
df = pd.read_csv(fn)

In [3]:
df.shape

(9822, 87)

In [17]:
train = df[df['ORIGIN']=='train']
val = df[df['ORIGIN']=='test']

_ = train.pop('ORIGIN')
_ = val.pop('ORIGIN')

X_train = train
X_val = val
y_train = train.pop('CARAVAN')
y_val = val.pop('CARAVAN')

In [50]:
# get numerical columns
cat_cols = ['MOSTYPE','MOSHOOFD']
num_cols = list(X_train.columns.values[43:])

In [51]:
# convert dataframe to numpy arrays
# create 2 datasets: one categorial, one numerical
X_train_num = np.array(X_train[num_cols])
X_train_cat = np.array(X_train[cat_cols])
y_train = np.array(y_train)

X_val_num = np.array(X_val[cat_cols])
X_val_cat = np.array(X_val[cat_cols])
y_val = np.array(y_val)


In [56]:
X_train_num.shape, X_train_cat.shape, y_train.shape

((5822, 42), (5822, 2), (5822,))

In [57]:
X_val_num.shape, X_val_cat.shape, y_val.shape

((4000, 2), (4000, 2), (4000,))

In [66]:
# Get UID Column
uuids = np.array([uuid4() for _ in range(len(X_train))])
uuids_val = np.array([uuid4() for _ in range(len(X_val))])

In [63]:
# create VerticalDatasets
num_dataset = VerticalDataset(ids=uuids, data=X_train_num, labels=y_train)
cat_dataset = VerticalDataset(ids=uuids, data=X_train_cat, labels=y_train)

num_dataset_val = VerticalDataset(ids=uuids_val, data=X_val_num, labels=y_val)
cat_dataset_val = VerticalDataset(ids=uuids_val, data=X_val_cat, labels=y_val)

In [65]:
## Initialize Train Dataloader 
dataloader = VerticalDataLoader(num_dataset, cat_dataset, batch_size=512)

# Compute private set intersection
client_items = dataloader.dataloader1.dataset.get_ids()
server_items = dataloader.dataloader2.dataset.get_ids()
 
client = Client(client_items)
server = Server(server_items)

setup, response = server.process_request(client.request, len(client_items))
intersection = client.compute_intersection(setup, response)

# Order data
dataloader.drop_non_intersecting(intersection)
dataloader.sort_by_ids()

NameError: name 'id_collate_fn' is not defined

In [None]:
## Initialize Validation Dataloader 
val_dataloader = VerticalDataLoader(num_dataset_val, cat_dataset_val, batch_size=512)

# Compute private set intersection
val_client_items = val_dataloader.dataloader1.dataset.get_ids()
val_server_items = val_dataloader.dataloader2.dataset.get_ids()

val_client = Client(val_client_items)
val_server = Server(val_server_items)

val_setup, val_response = val_server.process_request(val_client.request, len(val_client_items))
val_intersection = val_client.compute_intersection(val_setup, val_response)

# Order data
val_dataloader.drop_non_intersecting(val_intersection)
val_dataloader.sort_by_ids()

## Training

In [24]:
# device to train on
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [46]:
# check relative weights of classes
class1_size = df[df['CARAVAN'] == 1].shape[0]
class0_size = df[df['CARAVAN'] == 0].shape[0]
print(class1_size, class0_size)
weights = get_classes_weights(class1_size, class0_size)
print(weights)


586 9236
[15.761092150170649, 1]
