In [85]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Restrict minor warnings
import warnings
warnings.filterwarnings('ignore')

# to display all outputs of one cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.options.display.max_columns = 100

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer as CTT
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import GridSearchCV


from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from imblearn.combine import SMOTETomek

import torch
torch.manual_seed(0)
import torch.nn as nn
import tensorflow as tf
from torch.utils.data import Dataset, DataLoader

import syft as sy
from uuid import UUID
from uuid import uuid4


<torch._C.Generator at 0x7fcd0ecaf590>

In [86]:
from src.psi.util import Client, Server
from src.utils import add_ids
from src.utils.data_utils import id_collate_fn

# Create datasets

In [87]:
fn = "/ssd003/projects/pets/datasets/caravan-insurance-challenge.csv"
df1 = pd.read_csv(fn)

In [88]:
df1.shape

(9822, 87)

In [89]:
categorical_cols = ['MOSTYPE','MOSHOOFD']
df = pd.get_dummies(df1, columns = categorical_cols)

In [90]:
train = df[df['ORIGIN']=='train']
val = df[df['ORIGIN']=='test']

_ = train.pop('ORIGIN')
_ = val.pop('ORIGIN')

X_train = train
X_val = val
y_train = train.pop('CARAVAN')
y_val = val.pop('CARAVAN')

In [91]:
X_train.shape, y_train.shape

((5822, 133), (5822,))

In [92]:
X_val.shape, y_val.shape

((4000, 133), (4000,))

In [93]:
X_train = np.array(X_train)
y_train = np.array(y_train)

X_val = np.array(X_val)
y_val = np.array(y_val)

In [94]:
# TODO ?
scaler = StandardScaler() # change?
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

In [95]:
class dataset(Dataset):
    def __init__(self, x, y):
        self.x = torch.tensor(x, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
        self.length = self.x.shape[0]
    
    def __getitem__(self,idx):
        return self.x[idx], self.y[idx]
    
    def __len__(self):
        return self.length


In [96]:
dataset_train = dataset(X_train, y_train)
dataset_val = dataset(X_val, y_val)
train_dim = X_train.shape[1]

In [97]:
# DataLoader
# batch_size to tune
dataloader_train = DataLoader(dataset_train, batch_size=512, shuffle=False)
dataloader_val = DataLoader(dataset_val, batch_size=512, shuffle=False)

# Model

In [98]:
class IntactModel(torch.nn.Module):
    """ 
    Model for the Intact dataset
    
    Attributes
    ----------
    dim: 
        Dimensionality of Intact Data
    Methods
    -------
    forward(x):
        Performs a forward pass through the Intact Model
    """
    def __init__(self, input_dim): 
        super(IntactModel, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 1),
            nn.Sigmoid()
        )
    
    def forward(self, feat):
        pred = self.layers(feat)
        return pred

# Initialize and Configure Models

In [99]:
def get_classes_weights(class1_size, class2_size):
    if class1_size < class2_size:
        return [class2_size / class1_size, 1]
        factor2 = 1
    else:
        return [1, class1_size / class2_size]

In [100]:
# device to train on
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [101]:
# check relative weights of classes
class1_size = df[df['CARAVAN'] == 1].shape[0]
class0_size = df[df['CARAVAN'] == 0].shape[0]
print(class1_size, class0_size)
weights = get_classes_weights(class1_size, class0_size)
print(weights)
weights = torch.tensor(weights[0]).to(device)

586 9236
[15.761092150170649, 1]


In [102]:
model = IntactModel(train_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=.001,  betas=(0.9, 0.999))
criterion = nn.BCELoss(weight=weights)

# Training

In [103]:
def train_step(dataloader, model, optmizer, criterion):
    running_loss = 0
    for X, y in dataloader:
       
#        print(y.shape)
#        print(y.unsqueeze(1))
        X, y = X.to(device), y.to(device)
        
        # Zero our grads
        optimizer.zero_grad()
    
        # Make a prediction
        pred = model(X)

        # Figure out how much we missed by
        loss = criterion(pred, y.unsqueeze(1))

        # Backprop the loss on the end layer
        loss.backward()
    
        # Change the weights
        optimizer.step()
        
        # Accumulate Loss
        running_loss += loss.item()

    return running_loss

In [104]:
def val_step(dataloader, model, optimizer, criterion):
    running_loss = 0
    exs = 0 
    correct = 0
    aucs = []
    f1s = []
    for (X, y) in dataloader:
        # Send data and labels to machine model is on
        y  = y.to(device)
        X = X.to(device)     
   
        # Make a prediction
        with torch.no_grad():
            pred = model.forward(X).squeeze()
        
        #Calcualte Loss
        loss = criterion(pred, y)
        
        # Put back on cpu
        pred = pred.cpu().float()
        y = y.cpu().int()

        #Calculate AUC
        thresh_pred = (pred > .5).float()
        thresh_pred = thresh_pred.int()

        # Fix Me: Undefined for batches with all-same labels...
        auc = roc_auc_score(y, pred)
        f1 = f1_score(y, thresh_pred)

        # Calculate Accuracy Components
        num_exs = X.shape[0]
        num_correct = torch.sum(thresh_pred == y).item()

        # Accumulate loss, accuracy and auc
        exs += num_exs
        correct += num_correct
        running_loss += loss.item()
        aucs.append(auc)
        f1s.append(f1)

    auc = np.mean(np.array(aucs))
    f1 = np.mean(np.array(f1s))
    accuracy = correct / exs

    return f1, accuracy, running_loss

In [106]:
metric_names = ["Train Loss", "Validation Loss", "Accuracy", "F1"]
metrics = {metric:[] for metric in metric_names}
epochs = 3

# Train Loop
for i in range(epochs):

    # Train Step
    model.train()
    train_loss = train_step(dataloader_train, model, optimizer, criterion)

    # Train Step
    model.eval()
    f1, accuracy, val_loss = val_step(dataloader_val, model, optimizer, criterion)
    
    # Log metrics
    print(f"Epoch: {i} \t F1: {f1}")
    metrics["Train Loss"].append(train_loss)
    metrics["Validation Loss"].append(val_loss)
    metrics["Accuracy"].append(accuracy)
    metrics["F1"].append(f1)

IntactModel(
  (layers): Sequential(
    (0): Linear(in_features=133, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=16, bias=True)
    (3): ReLU()
    (4): Linear(in_features=16, out_features=1, bias=True)
    (5): Sigmoid()
  )
)

IntactModel(
  (layers): Sequential(
    (0): Linear(in_features=133, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=16, bias=True)
    (3): ReLU()
    (4): Linear(in_features=16, out_features=1, bias=True)
    (5): Sigmoid()
  )
)

Epoch: 0 	 F1: 0.0


IntactModel(
  (layers): Sequential(
    (0): Linear(in_features=133, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=16, bias=True)
    (3): ReLU()
    (4): Linear(in_features=16, out_features=1, bias=True)
    (5): Sigmoid()
  )
)

IntactModel(
  (layers): Sequential(
    (0): Linear(in_features=133, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=16, bias=True)
    (3): ReLU()
    (4): Linear(in_features=16, out_features=1, bias=True)
    (5): Sigmoid()
  )
)

Epoch: 1 	 F1: 0.0


IntactModel(
  (layers): Sequential(
    (0): Linear(in_features=133, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=16, bias=True)
    (3): ReLU()
    (4): Linear(in_features=16, out_features=1, bias=True)
    (5): Sigmoid()
  )
)

IntactModel(
  (layers): Sequential(
    (0): Linear(in_features=133, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=16, bias=True)
    (3): ReLU()
    (4): Linear(in_features=16, out_features=1, bias=True)
    (5): Sigmoid()
  )
)

Epoch: 2 	 F1: 0.0
