In [1]:
# Import necessary libraries
import torch
import pandas as pd
from torch.nn import functional as F
from torch import nn, optim
from opacus import PrivacyEngine
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from torch.utils.data import TensorDataset, DataLoader
import os
# Importing train, test split library
from sklearn.model_selection import train_test_split

In [2]:
baseline_file_path = "../../../data/data_baseline.csv"
data_baseline = pd.read_csv(baseline_file_path)

In [3]:
# Load the data
X_train = pd.read_csv('../../../data/X_train.csv')
y_train = pd.read_csv('../../../data/y_train.csv')
X_test = pd.read_csv('../../../data/X_test.csv')
y_test = pd.read_csv('../../../data/y_test.csv')

In [4]:
# Create a StandardScaler object
scaler = StandardScaler()

# Fit and transform the training data using the scaler
X_train_scaled = scaler.fit_transform(X_train.drop(['id'], axis=1))
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.drop(['id'], axis=1).columns, index=X_train.index)

# Transform the testing data using the same scaler
X_test_scaled = scaler.transform(X_test.drop(['id'], axis=1))
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_train.drop(['id'], axis=1).columns, index=X_test.index)

In [5]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((5748, 25), (1437, 25), (5748, 2), (1437, 2))

In [6]:
X_train_scaled.shape, X_test_scaled.shape, y_train.shape, y_test.shape

((5748, 24), (1437, 24), (5748, 2), (1437, 2))

In [7]:
y_train.head()

Unnamed: 0.1,Unnamed: 0,two_year_recid
0,5375,1
1,3605,0
2,29,1
3,2420,1
4,5641,1


In [8]:
# check type of y_train
type(y_train)

pandas.core.frame.DataFrame

In [9]:
y_train.set_index('Unnamed: 0', inplace=True)
y_train = y_train.squeeze()

y_test.set_index('Unnamed: 0', inplace=True)
y_test = y_test.squeeze()

In [10]:
# Remove the name of the index
y_train.index.name = None
y_test.index.name = None

In [11]:
# check type of y_test
type(y_test)

pandas.core.series.Series

In [12]:
y_test.head()

3613    0
4233    0
904     0
5365    1
5705    0
Name: two_year_recid, dtype: int64

In [13]:
# Convert Pandas DataFrames to PyTorch tensors
X_train_tensor = torch.tensor(X_train_scaled.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)

X_test_tensor = torch.tensor(X_test_scaled.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

In [14]:
# Set the random seed for reproducibility
torch.manual_seed(10000)

# Define the MLP model
class MLPModel(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2,hidden_size3,output_size):
        super(MLPModel, self).__init__()
        self.layer1 = nn.Linear(input_size, hidden_size1)
        self.relu1 = nn.ReLU()
        self.layer2 = nn.Linear(hidden_size1, hidden_size2)
        self.relu2 = nn.ReLU()
        self.layer3 = nn.Linear(hidden_size2, hidden_size3)
        self.relu3 = nn.ReLU()
        self.output_layer = nn.Linear(hidden_size3, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.layer1(x)
        x = self.relu1(x)
        x = self.layer2(x)
        x = self.relu2(x)
        x = self.layer3(x)
        x = self.relu3(x)
        x = self.output_layer(x)
        x = self.sigmoid(x)
        return x

In [15]:
# Initialize the model, loss function, and optimizer
input_size = X_train_scaled.shape[1]
hidden_size1 = 8
hidden_size2 = 8
hidden_size3 = 4
output_size = 1

In [16]:
# Load the saved model
model_file_path = "../../../model/mlp_model.pth"

# Load the saved model
model = MLPModel(input_size, hidden_size1, hidden_size2, hidden_size3, output_size)
model.load_state_dict(torch.load(model_file_path))

<All keys matched successfully>

In [17]:
# Binary Cross Entropy Loss for binary classification
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [18]:
# Convert data to PyTorch DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [19]:
def evaluate_model_accuracy(model, X_test_tensor, y_test_tensor):
    with torch.no_grad():
        model.eval()
        y_pred_prob = model(X_test_tensor)
        y_pred_test = (y_pred_prob >= 0.5).float()
        accuracy = (y_pred_test == y_test_tensor).float().mean()
    return accuracy

In [20]:
accuracy_baseline = evaluate_model_accuracy(model, X_test_tensor, y_test_tensor)
print(f'Test Accuracy of baseline model: {accuracy_baseline.item():.7f}')

Test Accuracy of baseline model: 0.6882394


In [21]:
model = MLPModel(input_size, hidden_size1, hidden_size2,hidden_size3, output_size)
criterion = nn.BCELoss()  # Binary Cross Entropy Loss for binary classification
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

In [22]:
# Define PrivacyEngine parameters
DELTA = 1e-5
EPSILON = 8.0
EPOCHS = 10
MAX_GRAD_NORM = 1.0

In [23]:
# Convert data to PyTorch DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [24]:
def train_model_with_privacy(model, criterion, optimizer, train_loader, num_epochs, DELTA, EPSILON, EPOCHS, MAX_GRAD_NORM):
    # Wrap the model with PrivacyEngine
    privacy_engine = PrivacyEngine()
    model, optimizer, train_loader = privacy_engine.make_private_with_epsilon(
        module=model,
        optimizer=optimizer,
        data_loader=train_loader,
        target_delta=DELTA,
        target_epsilon=EPSILON, 
        epochs=EPOCHS,
        max_grad_norm=MAX_GRAD_NORM,
    )

    # Training loop
    for epoch in range(num_epochs):
        for X_batch, y_batch in train_loader:
            # Forward pass
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item()}')

    return model

In [25]:
model = train_model_with_privacy(model, criterion, optimizer, train_loader, num_epochs=3,
                                 DELTA=DELTA, EPSILON=EPSILON, EPOCHS=EPOCHS, MAX_GRAD_NORM=MAX_GRAD_NORM)

  z = np.log((np.exp(t) + q - 1) / q)


Epoch [1/3], Loss: 0.6607841849327087
Epoch [2/3], Loss: 0.6228288412094116
Epoch [3/3], Loss: 0.623069167137146


In [26]:
accuracy_dp = evaluate_model_accuracy(model, X_test_tensor, y_test_tensor)
print(f'Test Accuracy with Differential Privacy: {accuracy_dp.item():.7f}')

Test Accuracy with Differential Privacy: 0.6736256


In [27]:
# Define new PrivacyEngine parameters for increased privacy
DELTA = 1e-5
EPSILON = 4.0  # Decrease epsilon for more privacy
EPOCHS = 10
MAX_GRAD_NORM = 0.5  # Decrease max_grad_norm for more privacy

In [28]:
model = MLPModel(input_size, hidden_size1, hidden_size2,hidden_size3, output_size)
criterion = nn.BCELoss()  # Binary Cross Entropy Loss for binary classification
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
model2 = train_model_with_privacy(model, criterion, optimizer, train_loader, num_epochs=3,
                                 DELTA=DELTA, EPSILON=EPSILON, EPOCHS=EPOCHS, MAX_GRAD_NORM=MAX_GRAD_NORM)

Epoch [1/3], Loss: 0.7041042447090149
Epoch [2/3], Loss: 0.8140561580657959
Epoch [3/3], Loss: 0.716059148311615


In [29]:
accuracy_dp2 = evaluate_model_accuracy(model, X_test_tensor, y_test_tensor)
print(f'Test Accuracy with increased Differential Privacy: {accuracy_dp2.item():.7f}')

Test Accuracy with increased Differential Privacy: 0.5497564
