In [None]:
# date: 04.29.24

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torchvision.models import resnet18,resnet50,vgg16
import numpy as np

# 1. Load CIFAR-10 dataset
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=32, shuffle=True, num_workers=2)

testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=32, shuffle=False, num_workers=2)

num_classes = 10

def one_hot_encode(labels, num_classes):
    '''
    Convert an array of labels to a one-hot encoded matrix.

    Args:
    - labels (Tensor): A 1D tensor containing the labels.
    - num_classes (int): The number of classes.

    Returns:
    - Tensor: A matrix where each row is the one-hot encoded version of the corresponding label.
    '''
    # Create an empty tensor filled with zeros
    device = labels.device  # Get the device of the labels tensor
    one_hot = torch.zeros(labels.size(0), num_classes, device=device)
    # one_hot = torch.zeros(labels.size(0), num_classes)

    # Fill the locations corresponding to the labels with ones
    one_hot.scatter_(1, labels.unsqueeze(1), 1)

    return one_hot

# 2. Feature extraction using pretrained model
class FeatureExtractor(nn.Module):
    def __init__(self):
        super(FeatureExtractor, self).__init__()
        model = resnet18(pretrained=True)
        self.features = nn.Sequential(*list(model.children())[:-1])

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        return x

def l2_regularization(model):
    l2_reg = 0.0
    for param in model.parameters():
        l2_reg += torch.norm(param, p=2)
    return l2_reg

#  def additional_regularization(b, w):
    # return torch.dot(b, w.view(-1))

def normalize_columns(tensor):
    """
    Normalize the columns of the input tensor such that the diagonal entries
    of tensor.T @ tensor are 1.
    """
    norms = tensor.norm(p=2, dim=0)  # Compute L2-norm for each column
    return tensor / norms

def bound_norm(data, upper_bound):
    norms = data.norm(dim=1, keepdim=True)
    scale = upper_bound / norms
    scale[scale > 1] = 1  # Only scale vectors with norm > upper_bound
    return data * scale

feature_extractor = FeatureExtractor().cuda()
feat_dim = 512

# Extract features for entire training set
train_features = []
train_labels_list = []
for inputs, labels in trainloader:
    inputs = inputs.cuda()
    features = feature_extractor(inputs).detach().cpu()
    train_features.append(features)
    train_labels_list.append(labels)

train_features = torch.cat(train_features, dim=0)
train_labels = torch.cat(train_labels_list, dim=0)

print(train_features.shape)

# train_features =  normalize_columns(train_features)
train_features =  bound_norm(train_features,1)

# train_feature_mean = train_features.mean(dim=0, keepdim=True)
# train_feature_variance = train_features.var(dim=0, keepdim=True)

# print(train_feature_mean.shape)
# # print(train_feature_variance.shape)

# train_features = (train_features-train_feature_mean)/train_feature_variance

# 3. Linear classifier without bias
class Classifier(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(Classifier, self).__init__()
        self.fc = nn.Linear(input_dim, output_dim, bias=False)

    def forward(self, x):
        return self.fc(x)

classifier = Classifier(feat_dim, num_classes).cuda()

# 4. Quadratic loss
# def quadratic_loss(outputs, targets, classifier, mu):
#     return torch.mean(0.5*(outputs - targets)**2 + 0.5*mu*torch.norm((classifier.fc.weight))**2)

def quadratic_loss(outputs, targets):
    return torch.mean(0.5*(outputs - targets)**2)

optimizer = optim.Adam(classifier.parameters(), lr=0.001)


classifier_r = Classifier(feat_dim, num_classes).cuda()
optimizer_r = optim.Adam(classifier_r.parameters(), lr=0.001)

mu = 0.000001
psi = 0.01

# b = torch.from_numpy(np.random.exponential(size=512)).float().cuda()

# 5. Train the classifier using the pre-extracted features
for epoch in range(10):
    for i in range(0, len(train_features), 32):
        inputs = train_features[i:i+32].cuda()
        labels = train_labels[i:i+32].cuda()
        # labels_onehot = torch.zeros(labels.size(0), num_classes).cuda().scatter_(1, labels.view(-1, 1), 1)
        labels_onehot = one_hot_encode(labels.cuda(), num_classes)

        outputs = classifier(inputs)
        # loss = quadratic_loss(outputs, labels_onehot, classifier, mu) + psi* additional_regularization(b, classifier.fc.weight)
        loss = quadratic_loss(outputs, labels_onehot)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % 3200 == 0:  # Print training loss every 100 batches
            print(f"Epoch {epoch + 1}, Batch {i // 32 + 1}, Loss: {loss.item()}")

# 6. Evaluate the classifier on the test set using pre-extracted features
classifier.eval()
correct = 0
total = 0

test_features = []
test_labels_list = []
for inputs, labels in testloader:
    inputs = inputs.cuda()
    features = feature_extractor(inputs).detach().cpu()
    test_features.append(features)
    test_labels_list.append(labels)

test_features = torch.cat(test_features, dim=0)
test_labels = torch.cat(test_labels_list, dim=0)

# test_features = (test_features-train_feature_mean)/train_feature_variance


Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:13<00:00, 12885175.64it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 160MB/s]
  self.pid = os.fork()


torch.Size([50000, 512])
Epoch 1, Batch 1, Loss: 0.05045027285814285
Epoch 1, Batch 101, Loss: 0.03767799213528633
Epoch 1, Batch 201, Loss: 0.036080148071050644
Epoch 1, Batch 301, Loss: 0.031091976910829544
Epoch 1, Batch 401, Loss: 0.03137672692537308
Epoch 1, Batch 501, Loss: 0.0308985598385334
Epoch 1, Batch 601, Loss: 0.02739039622247219
Epoch 1, Batch 701, Loss: 0.026187393814325333
Epoch 1, Batch 801, Loss: 0.03371585160493851
Epoch 1, Batch 901, Loss: 0.025045275688171387
Epoch 1, Batch 1001, Loss: 0.028023332357406616
Epoch 1, Batch 1101, Loss: 0.029338065534830093
Epoch 1, Batch 1201, Loss: 0.029069507494568825
Epoch 1, Batch 1301, Loss: 0.026306474581360817
Epoch 1, Batch 1401, Loss: 0.026881495490670204
Epoch 1, Batch 1501, Loss: 0.025451797991991043
Epoch 2, Batch 1, Loss: 0.02591143548488617
Epoch 2, Batch 101, Loss: 0.023996198549866676
Epoch 2, Batch 201, Loss: 0.029319990426301956
Epoch 2, Batch 301, Loss: 0.024861324578523636
Epoch 2, Batch 401, Loss: 0.0256155021488

In [None]:
print(train_features.size())
print(type(train_features))

torch.Size([50000, 512])
<class 'torch.Tensor'>


In [None]:
from torch.utils.data import DataLoader, random_split
# from torch.utils.data import Dataset, Subset

# remain_size = int(0.6 * len(trainset))
# forget_size = len(trainset) - remain_size


forget_class = 5

forget_indices = torch.where(train_labels == forget_class)[0]

# # Split the dataset into two parts: class 5 and the remaining classes
# remaining_indices = torch.where(train_labels != forget_class)[0]

# forget_indices = torch.tensor(forget_indices)
# remaining_indices = torch.tensor(remaining_indices)


# print(trainset)
# print(forget_indices)

# remaining_features, forget_features =  random_split(train_features, [remain_size, forget_size])
# remaining_indices = remaining_features.indices
# remaining_labels = train_labels[remaining_indices]
# forget_indices = forget_features.indices
# forget_labels = train_labels[forget_indices]
# print(type(remaining_features))

num_forget_samples = int(0.8 * len(forget_indices))

# Randomly select indices for the forget set
forget_indices_sub = torch.randperm(len(forget_indices))[:num_forget_samples]

# Remaining indices for class 5 (10%)
remaining_indices_class5 = forget_indices[list(set(range(len(forget_indices))) - set(forget_indices_sub))]
# remaining_indices_class5 = list(set(range(len(forget_indices))) - set(forget_indices_sub))

forget_indices = forget_indices[forget_indices_sub]



# Remaining indices for other classes
remaining_indices_other_classes = torch.where(train_labels != forget_class)[0]

# Concatenate indices for the remaining set
remaining_indices = torch.cat((remaining_indices_class5, remaining_indices_other_classes))


forget_features = train_features[forget_indices]
remaining_features = train_features[remaining_indices]

forget_labels = train_labels[forget_indices]
remaining_labels = train_labels[remaining_indices]


tensors_rf = [remaining_features[i] for i in range(len(remaining_features))]
tensors_rl = [remaining_labels[i] for i in range(len(remaining_labels))]
remaining_features = torch.stack(tensors_rf)
remaining_labels = torch.stack(tensors_rl)
print(type(remaining_labels))
print(remaining_labels.size())

tensors_ff = [forget_features[i] for i in range(len(forget_features))]
tensors_fl = [forget_labels[i] for i in range(len(forget_labels))]
forget_features = torch.stack(tensors_ff)
forget_labels = torch.stack(tensors_fl)

<class 'torch.Tensor'>
torch.Size([50000])


In [None]:
def extract_features(model, features):
    model.eval()
    out = []
    labels = []
    with torch.no_grad():
      for i in range(0, len(features), 32):
        inputs = features[i:i+32].cuda()
        outputs = model(inputs)
        # print(outputs)
        # outputs.append(outputs.cpu().numpy())
        out.append(outputs.cpu().numpy())
    return out

class BinaryClassifier(nn.Module):
    def __init__(self, input_size):
        super(BinaryClassifier, self).__init__()
        self.fc = nn.Linear(input_size, 1)

    def forward(self, x):
        return self.fc(x)

def train_binary_classifier(model, D_r_features, D_r_labels, D_test_features, D_test_labels):
    # Create binary labels for binary classification
    D_r_binary_labels = np.ones_like(D_r_labels)
    D_test_binary_labels = np.zeros_like(D_test_labels)

    # Use model outputs on D_r as features for training
    # binary_train_features = D_r_features
    # binary_train_labels = D_r_binary_labels

    binary_train_features = np.vstack((D_r_features, D_test_features))
    binary_train_labels = np.concatenate((D_r_binary_labels, D_test_binary_labels)).flatten()

    # Move binary classifier and data to GPU
    binary_train_features_tensor = torch.tensor(binary_train_features, dtype=torch.float32).cuda()
    binary_train_labels_tensor = torch.tensor(binary_train_labels, dtype=torch.float32).cuda()

    # Initialize binary classifier
    binary_classifier = BinaryClassifier(binary_train_features.shape[1]).cuda()

    # Define binary cross-entropy loss and optimizer
    binary_criterion = nn.BCEWithLogitsLoss()
    binary_optimizer = optim.SGD(binary_classifier.parameters(), lr=0.001, momentum=0.9)

    # Training loop for binary classifier
    num_binary_epochs = 50  # You can adjust the number of epochs
    for epoch in range(num_binary_epochs):
        binary_classifier.train()
        binary_optimizer.zero_grad()
        binary_outputs = binary_classifier(binary_train_features_tensor)
        binary_loss = binary_criterion(binary_outputs.squeeze(dim=1), binary_train_labels_tensor)
        binary_loss.backward()
        binary_optimizer.step()

    return binary_classifier, binary_train_features_tensor, binary_train_labels_tensor

In [None]:
# for i in range(0, len(forget_features)):
forget_inputs = forget_features.cuda()
pred_forget_labels = forget_labels.cuda()
        # labels_onehot = torch.zeros(labels.size(0), num_classes).cuda().scatter_(1, labels.view(-1, 1), 1)
forget_labels_onehot = one_hot_encode(forget_labels.cuda(), num_classes)
forget_outputs = classifier(forget_inputs)
# loss_f = quadratic_loss(forget_outputs, forget_labels_onehot, classifier, mu) + psi* additional_regularization(b, classifier.fc.weight)
loss_f = quadratic_loss(forget_outputs, forget_labels_onehot)
print(loss_f)

tensor(0.0247, device='cuda:0', grad_fn=<MeanBackward0>)


In [None]:
classifier.eval()
correct = 0
total = 0

with torch.no_grad():
    for i in range(0, len(test_features), 32):
        inputs = test_features[i:i+32].cuda()
        labels = test_labels[i:i+32].cuda()

        outputs = classifier(inputs)
        _, predicted = outputs.max(1)

        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

print('Accuracy on the test set using original model: %d %%' % (100 * correct / total))


classifier.eval()
correct = 0
total = 0



with torch.no_grad():
    for i in range(0, len(remaining_features), 32):
        inputs = remaining_features[i:i+32].cuda()
        labels = remaining_labels[i:i+32].cuda()

        outputs = classifier(inputs)
        _, predicted = outputs.max(1)

        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

print('Accuracy on the remaining train set using original model: %d %%' % (100 * correct / total))


classifier.eval()
correct = 0
total = 0



with torch.no_grad():
    for i in range(0, len(forget_features), 32):
        inputs = forget_features[i:i+32].cuda()
        labels = forget_labels[i:i+32].cuda()

        outputs = classifier(inputs)
        _, predicted = outputs.max(1)

        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

print('Accuracy on the forget train set using original model: %d %%' % (100 * correct / total))

Accuracy on the test set using original model: 77 %
Accuracy on the remaining train set using original model: 77 %
Accuracy on the forget train set using original model: 73 %


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix
import random

def cm_score(estimator, X, y):
    y_pred = estimator.predict(X)
    cnf_matrix = confusion_matrix(y, y_pred)

    FP = cnf_matrix[0][1]
    FN = cnf_matrix[1][0]
    TP = cnf_matrix[0][0]
    TN = cnf_matrix[1][1]


    # Sensitivity, hit rate, recall, or true positive rate
    TPR = TP/(TP+FN)
    # Specificity or true negative rate
    TNR = TN/(TN+FP)
    # Precision or positive predictive value
    PPV = TP/(TP+FP)
    # Negative predictive value
    NPV = TN/(TN+FN)
    # Fall out or false positive rate
    FPR = FP/(FP+TN)
    # False negative rate
    FNR = FN/(TP+FN)
    # False discovery rate
    FDR = FP/(TP+FP)

    # Overall accuracy
    ACC = (TP+TN)/(TP+FP+FN+TN)
    print (f"FPR:{FPR:.2f}, FNR:{FNR:.2f}, FP{FP:.2f}, TN{TN:.2f}, TP{TP:.2f}, FN{FN:.2f}")
    return ACC

def evaluate_attack_model(sample_loss,
                          members,
                          n_splits = 100,
                          random_state = None):
  """Computes the cross-validation score of a membership inference attack.
  Args:
    sample_loss : array_like of shape (n,).
      objective function evaluated on n samples.
    members : array_like of shape (n,),
      whether a sample was used for training.
    n_splits: int
      number of splits to use in the cross-validation.
    random_state: int, RandomState instance or None, default=None
      random state to use in cross-validation splitting.
  Returns:
    score : array_like of size (n_splits,)
  """

  unique_members = np.unique(members)
  if not np.all(unique_members == np.array([0, 1])):
    raise ValueError("members should only have 0 and 1s")

  attack_model = LogisticRegression()
  cv = StratifiedShuffleSplit(
      n_splits=n_splits, random_state=random_state)
  return cross_val_score(attack_model, sample_loss, members, cv=cv, scoring=cm_score)


def membership_inference_attack(model, test_f, test_l, forget_f, forget_l, seed):

  fgt_cls = list(np.unique(forget_l))
  indices = [i in fgt_cls for i in test_l]
  test_f = test_f[indices]
  test_l = test_l[indices]
  cr = nn.CrossEntropyLoss(reduction='none')

  test_losses = []
  forget_losses = []
  bs=128
  with torch.no_grad():
    for i in range(0, len(test_f), bs):
        inputs = test_f[i:i+bs].cuda()
        labels = test_l[i:i+bs].cuda()
        labels_onehot = one_hot_encode(labels.cuda(), 10)
        outputs = model(inputs)
        loss = cr(outputs, labels_onehot)
        # print(loss)
        # loss = quadratic_loss(outputs, labels_onehot)
        test_losses = test_losses + list(loss.cpu().detach().numpy())
        # test_losses.append(loss.cpu().detach().numpy())
    for i in range(0, len(forget_f), bs):
        inputs = forget_f[i:i+bs].cuda()
        labels = forget_l[i:i+bs].cuda()
        labels_onehot = one_hot_encode(labels.cuda(), 10)
        outputs = model(inputs)
        loss = cr(outputs, labels_onehot)
        # print(loss)
        # loss = quadratic_loss(outputs, labels_onehot)
        forget_losses = forget_losses + list(loss.cpu().detach().numpy())
        # forget_losses.append(loss.cpu().detach().numpy())

  np.random.seed(seed)
  random.seed(seed)
  if len(forget_losses) > len(test_losses):
      forget_losses = list(random.sample(forget_losses, len(test_losses)))
  elif len(test_losses) > len(forget_losses):
      test_losses = list(random.sample(test_losses, len(forget_losses)))

  t_labels = [0]*len(test_losses)
  f_labels = [1]*len(forget_losses)
  features = np.array(test_losses + forget_losses).reshape(-1,1)
  labels = np.array(t_labels + f_labels).reshape(-1)
  # features = np.clip(features, -100, 100)
  score = evaluate_attack_model(features, labels, n_splits=5, random_state=seed)

  return score



score = membership_inference_attack(classifier, test_features, test_labels, forget_features, forget_labels, 2023)

print(np.mean(score))












FPR:0.22, FNR:0.16, FP24.00, TN86.00, TP76.00, FN14.00
FPR:0.14, FNR:0.11, FP14.00, TN89.00, TP86.00, FN11.00
FPR:0.20, FNR:0.11, FP23.00, TN90.00, TP77.00, FN10.00
FPR:0.14, FNR:0.08, FP15.00, TN93.00, TP85.00, FN7.00
FPR:0.23, FNR:0.05, FP29.00, TN96.00, TP71.00, FN4.00
0.849


In [None]:
import numpy as np

model_MI = classifier

remaining_activations = extract_features(model_MI, remaining_features)
remaining_activations = np.vstack(remaining_activations)

test_activations = extract_features(model_MI, test_features)
test_activations = np.vstack(test_activations)

forget_activations = extract_features(model_MI, forget_features)
forget_activations = np.vstack(forget_activations)

train_activations = extract_features(model_MI, train_features)
train_activations = np.vstack(train_activations)


# binary_classifier, binary_train_features_tensor, binary_train_labels_tensor = train_binary_classifier(model_MI, remaining_activations, remaining_labels, test_activations, test_labels)
# binary_classifier, binary_train_features_tensor, binary_train_labels_tensor = train_binary_classifier(model_MI, forget_activations, forget_labels, test_activations, test_labels)

binary_classifier, binary_train_features_tensor, binary_train_labels_tensor = train_binary_classifier(model_MI, train_activations, train_labels, test_activations, test_labels)

from sklearn import svm

binary_model = svm.SVC(kernel='linear')

binary_model.fit(binary_train_features_tensor.cpu(), binary_train_labels_tensor.cpu())

binary_test_features_tensor = torch.tensor(forget_activations, dtype=torch.float32).cuda()
binary_test_outputs = binary_classifier(binary_test_features_tensor)
# binary_test_outputs = binary_model.predict(binary_test_features_tensor.cpu())
binary_predictions = (torch.sigmoid(torch.Tensor(binary_test_outputs)) > 0.5).cpu().numpy().astype(np.int)

# Evaluate the results
accuracy_binary = np.mean(binary_predictions == 1)  # Assuming D_f is correctly classified as 1
# accuracy_binary = np.mean(binary_test_outputs == 1)
print(f"Original attack success on D_f: {accuracy_binary}")

In [None]:
print(binary_train_labels_tensor)

In [None]:


for epoch in range(20):
    for i in range(0, len(remaining_features), 32):
        inputs = remaining_features[i:i+32].cuda()
        r_labels = remaining_labels[i:i+32].cuda()
        # labels_onehot = torch.zeros(r_labels.size(0), num_classes).cuda().scatter_(1, r_labels.view(-1, 1), 1)
        labels_onehot_r = one_hot_encode(r_labels.cuda(), num_classes)

        outputs = classifier_r(inputs)
        # loss = quadratic_loss(outputs, labels_onehot_r, classifier, mu) + psi* additional_regularization(b, classifier.fc.weight)
        loss = quadratic_loss(outputs, labels_onehot_r)

        optimizer_r.zero_grad()
        loss.backward()
        optimizer_r.step()

        if i % 3200 == 0:  # Print training loss every 100 batches
            print(f"Epoch {epoch + 1}, Batch {i // 32 + 1}, Loss: {loss.item()}")

Epoch 1, Batch 1, Loss: 0.05118412524461746
Epoch 1, Batch 101, Loss: 0.0001047474579536356
Epoch 1, Batch 201, Loss: 0.0386105440557003
Epoch 1, Batch 301, Loss: 0.03160007670521736
Epoch 1, Batch 401, Loss: 0.027521003037691116
Epoch 1, Batch 501, Loss: 0.02831871621310711
Epoch 1, Batch 601, Loss: 0.025489067658782005
Epoch 1, Batch 701, Loss: 0.02509658969938755
Epoch 1, Batch 801, Loss: 0.026556620374321938
Epoch 1, Batch 901, Loss: 0.026451706886291504
Epoch 1, Batch 1001, Loss: 0.02388654090464115
Epoch 1, Batch 1101, Loss: 0.02197069302201271
Epoch 1, Batch 1201, Loss: 0.023602019995450974
Epoch 1, Batch 1301, Loss: 0.024699201807379723
Epoch 1, Batch 1401, Loss: 0.02371923439204693
Epoch 1, Batch 1501, Loss: 0.024827508255839348
Epoch 2, Batch 1, Loss: 0.0670236349105835
Epoch 2, Batch 101, Loss: 0.005145589355379343
Epoch 2, Batch 201, Loss: 0.02523031271994114
Epoch 2, Batch 301, Loss: 0.025309771299362183
Epoch 2, Batch 401, Loss: 0.022235265001654625
Epoch 2, Batch 501, Lo

In [None]:
classifier_r.eval()
correct = 0
total = 0

with torch.no_grad():
    for i in range(0, len(test_features), 32):
        inputs = test_features[i:i+32].cuda()
        labels = test_labels[i:i+32].cuda()

        outputs = classifier_r(inputs)
        _, predicted = outputs.max(1)

        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

print('Accuracy on the test set using remaining set retrain model: %d %%' % (100 * correct / total))


classifier_r.eval()
correct = 0
total = 0



with torch.no_grad():
    for i in range(0, len(remaining_features), 32):
        inputs = remaining_features[i:i+32].cuda()
        labels = remaining_labels[i:i+32].cuda()

        outputs = classifier_r(inputs)
        _, predicted = outputs.max(1)

        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

print('Accuracy on the remaining train set using remaining set retrain model: %d %%' % (100 * correct / total))


classifier_r.eval()
correct = 0
total = 0



with torch.no_grad():
    for i in range(0, len(forget_features), 32):
        inputs = forget_features[i:i+32].cuda()
        labels = forget_labels[i:i+32].cuda()

        outputs = classifier_r(inputs)
        _, predicted = outputs.max(1)

        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

print('Accuracy on the forget train set using remaining set retrain model: %d %%' % (100 * correct / total))

Accuracy on the test set using remaining set retrain model: 71 %
Accuracy on the remaining train set using remaining set retrain model: 72 %
Accuracy on the forget train set using remaining set retrain model: 0 %


In [None]:
score = membership_inference_attack(classifier_r, test_features, test_labels, forget_features, forget_labels, 2023)

print(np.mean(score))

FPR:0.00, FNR:0.00, FP0.00, TN100.00, TP100.00, FN0.00
FPR:0.00, FNR:0.00, FP0.00, TN100.00, TP100.00, FN0.00
FPR:0.00, FNR:0.00, FP0.00, TN100.00, TP100.00, FN0.00
FPR:0.00, FNR:0.00, FP0.00, TN100.00, TP100.00, FN0.00
FPR:0.00, FNR:0.00, FP0.00, TN100.00, TP100.00, FN0.00
1.0


In [None]:
model_MI = classifier_r

# remaining_activations = extract_features(model_MI, remaining_features)
# remaining_activations = np.vstack(remaining_activations)

# test_activations = extract_features(model_MI, test_features)
# test_activations = np.vstack(test_activations)

forget_activations = extract_features(model_MI, forget_features)
forget_activations = np.vstack(forget_activations)


# binary_classifier = train_binary_classifier(model_MI, remaining_activations, remaining_labels, test_activations, test_labels)

binary_test_features_tensor = torch.tensor(forget_activations, dtype=torch.float32).cuda()
binary_test_outputs = binary_classifier(binary_test_features_tensor)
# binary_test_outputs = binary_model.predict(binary_test_features_tensor.cpu())
binary_predictions = (torch.sigmoid(torch.Tensor(binary_test_outputs)) > 0.5).cpu().numpy().astype(np.int)

# Evaluate the results
accuracy_binary = np.mean(binary_predictions == 1)  # Assuming D_f is correctly classified as 1
# accuracy_binary = np.mean(binary_test_outputs == 1)
print(f"retrain attack success on D_f: {accuracy_binary}")

In [None]:
for name, param in classifier.named_parameters():
  print(name)
  if name == 'fc.weight':
    W = param.data
  # else:
  #   b = param
  print(f'Parameter {name}, shape {param.shape}')
  print(param.data.size())

# W = param.data.T
print(W.shape)
print(type(W))

fc.weight
Parameter fc.weight, shape torch.Size([10, 512])
torch.Size([10, 512])
torch.Size([10, 512])
<class 'torch.Tensor'>


In [None]:
H_r = remaining_features.T@remaining_features/len(train_features)
# H_r = remaining_features.T@remaining_features/len(remaining_features)
H_f = forget_features.T@forget_features/len(train_features)
# H_f = forget_features.T@forget_features/len(forget_features)
H = train_features.T@train_features/len(train_features)
print(H_r)
print(H_f)
print(H)
# print(H_f)

tensor([[0.0018, 0.0012, 0.0011,  ..., 0.0012, 0.0014, 0.0011],
        [0.0012, 0.0019, 0.0013,  ..., 0.0013, 0.0015, 0.0012],
        [0.0011, 0.0013, 0.0018,  ..., 0.0012, 0.0013, 0.0011],
        ...,
        [0.0012, 0.0013, 0.0012,  ..., 0.0020, 0.0014, 0.0012],
        [0.0014, 0.0015, 0.0013,  ..., 0.0014, 0.0025, 0.0014],
        [0.0011, 0.0012, 0.0011,  ..., 0.0012, 0.0014, 0.0019]])
tensor([[3.5598e-04, 1.4816e-04, 1.5738e-04,  ..., 1.1561e-04, 2.0425e-04,
         1.3400e-04],
        [1.4816e-04, 1.5064e-04, 1.1564e-04,  ..., 7.1538e-05, 1.2019e-04,
         8.6385e-05],
        [1.5738e-04, 1.1564e-04, 1.6808e-04,  ..., 7.9275e-05, 1.2939e-04,
         9.1316e-05],
        ...,
        [1.1561e-04, 7.1538e-05, 7.9275e-05,  ..., 9.0230e-05, 8.7953e-05,
         7.0204e-05],
        [2.0425e-04, 1.2019e-04, 1.2939e-04,  ..., 8.7953e-05, 2.0897e-04,
         1.0375e-04],
        [1.3400e-04, 8.6385e-05, 9.1316e-05,  ..., 7.0204e-05, 1.0375e-04,
         1.2707e-04]])
tensor

In [None]:
# forget_labels_onehot = torch.zeros(forget_labels.size(0), num_classes).cuda().scatter_(1, labels.view(-1, 1), 1)
forget_labels_onehot = one_hot_encode(forget_labels.cuda(), num_classes)


# if len(forget_labels.shape) == 1:
#    forget_labels = forget_labels.unsqueeze(1)

# forget_labels_onehot = torch.zeros(forget_labels.size(0), num_classes)
# forget_labels_onehot.cuda().scatter_(1, forget_labels.cuda(), 1)

print(forget_labels[16])
print(forget_labels_onehot[16])

forget_features = torch.tensor(forget_features).cuda()
print(type(forget_features))
print(type(forget_labels_onehot))
print(type(W))
grad_f = (forget_features.T@(forget_features@W.T-forget_labels_onehot.cuda())/len(train_features))
# grad_f = (forget_features.T@(forget_features@W.T-forget_labels_onehot.cuda())/len(forget_features))
print(grad_f.shape)


tensor(5)
tensor([0., 0., 0., 0., 0., 1., 0., 0., 0., 0.], device='cuda:0')
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
torch.Size([512, 10])


  forget_features = torch.tensor(forget_features).cuda()


In [None]:
grad_f = torch.tensor(grad_f)
hess_grad = torch.linalg.inv(H_r).cuda()@grad_f

# hess_grad = len(remaining_features)*torch.linalg.inv(H).cuda()@grad_f/len(train_features)
# hess_grad = torch.linalg.inv(H).cuda()@grad_f
W_new = W.T+hess_grad.cuda()
# print(W_new)
pretrained_dict = classifier.state_dict()
pretrained_dict['fc.weight'] = W_new.T
classifier.load_state_dict(pretrained_dict)


  grad_f = torch.tensor(grad_f)


<All keys matched successfully>

In [None]:
classifier.eval()
correct = 0
total = 0

with torch.no_grad():
    for i in range(0, len(test_features), 32):
        inputs = test_features[i:i+32].cuda()
        labels = test_labels[i:i+32].cuda()

        outputs = classifier(inputs)
        _, predicted = outputs.max(1)

        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

print('Accuracy on the test set using actual scrubbed model: %d %%' % (100 * correct / total))


classifier.eval()
correct = 0
total = 0



with torch.no_grad():
    for i in range(0, len(remaining_features), 32):
        inputs = remaining_features[i:i+32].cuda()
        labels = remaining_labels[i:i+32].cuda()

        outputs = classifier(inputs)
        _, predicted = outputs.max(1)

        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

print('Accuracy on the remaining train set using actual scrubbed model: %d %%' % (100 * correct / total))


classifier.eval()
correct = 0
total = 0



with torch.no_grad():
    for i in range(0, len(forget_features), 32):
        inputs = forget_features[i:i+32].cuda()
        labels = forget_labels[i:i+32].cuda()

        outputs = classifier(inputs)
        _, predicted = outputs.max(1)

        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

print('Accuracy on the forget train set using actual scrubbed model: %d %%' % (100 * correct / total))

Accuracy on the test set using actual scrubbed model: 75 %
Accuracy on the remaining train set using actual scrubbed model: 76 %
Accuracy on the forget train set using actual scrubbed model: 47 %


In [None]:
score = membership_inference_attack(classifier, test_features, test_labels, forget_features, forget_labels, 2023)

print(np.mean(score))

FPR:0.38, FNR:0.00, FP61.00, TN100.00, TP39.00, FN0.00
FPR:0.39, FNR:0.00, FP63.00, TN100.00, TP37.00, FN0.00
FPR:0.37, FNR:0.00, FP59.00, TN100.00, TP41.00, FN0.00
FPR:0.38, FNR:0.00, FP60.00, TN100.00, TP40.00, FN0.00
FPR:0.30, FNR:0.00, FP43.00, TN100.00, TP57.00, FN0.00
0.7140000000000001


In [None]:
model_MI = classifier

# remaining_activations = extract_features(model_MI, remaining_features)
# remaining_activations = np.vstack(remaining_activations)

# test_activations = extract_features(model_MI, test_features)
# test_activations = np.vstack(test_activations)

forget_activations = extract_features(model_MI, forget_features)
forget_activations = np.vstack(forget_activations)


# binary_classifier = train_binary_classifier(model_MI, remaining_activations, remaining_labels, test_activations, test_labels)

binary_test_features_tensor = torch.tensor(forget_activations, dtype=torch.float32).cuda()
binary_test_outputs = binary_classifier(binary_test_features_tensor)
# binary_test_outputs = binary_model.predict(binary_test_features_tensor.cpu())
binary_predictions = (torch.sigmoid(torch.Tensor(binary_test_outputs)) > 0.5).cpu().numpy().astype(np.int)

# Evaluate the results
accuracy_binary = np.mean(binary_predictions == 1)  # Assuming D_f is correctly classified as 1
# accuracy_binary = np.mean(binary_test_outputs == 1)
print(f"Original scrubbed attack success on D_f: {accuracy_binary}")

NameError: name 'binary_classifier' is not defined

In [None]:
# if len(train_labels.shape) == 1:
#     train_labels = train_labels.unsqueeze(1)

# train_labels_onehot = torch.zeros(train_labels.size(0), num_classes)
# train_labels_onehot.scatter_(1, train_labels, 1)
train_labels_onehot = one_hot_encode(train_labels.cuda(), num_classes)
# print(one_hot.shape)  # torch.Size([50000, 10])
# train_labels_onehot = torch.zeros(train_labels.size(0), 10).cuda().scatter_(1, labels.view(-1, 1), 1)
print(train_labels.shape)
Y_train = train_labels_onehot.T@train_labels_onehot/len(train_features)
Y = Y_train.cpu().numpy()
print(Y_train)
W = W.cpu().numpy()

torch.Size([50000])
tensor([[0.1000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.1000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.1000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.1000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.1000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.1000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.1000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.1000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.1000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        

In [None]:
# import cvxpy as cp
# import numpy as np

# # Create constant matrices
# # C = np.array([[2, -1], [-1, 3]])
# # A1 = np.array([[1, 0], [0, 0]])
# # A2 = np.array([[0, 0], [0, 1]])
# d=512
# lambda_max_value = d
# # Define variables
# X = cp.Variable((d, d), symmetric=True)

# # Define constraints
# constraints = [X >> 0]
# constraints += [X >> H_f.cpu().numpy()]
# # constraints += [X - lambda_max_value * torch.eye(d).numpy() << 0]
# # constraints +=[cp.trace(X) <= 1]
# constraints +=[cp.trace(X) >= 0]
# # constraints += [X[i, i] == 1 for i in range(d)]
# #constraints += [cp.trace(w@X.T@X@w.T) == len(y_train)]

# # Define objective
# # objective = cp.Minimize(cp.trace((w@X@w.T-Y_train)))

# objective = cp.Minimize(cp.trace((W@X@W.T/len(train_features)-Y)))
# # objective = cp.Minimize(cp.trace((Y_train-w@X.T@y_train)))

# # objective = cp.Minimize(cp.norm(W@X@W.T/len(train_dataset)-Y, 'fro')**2)

# # Define problem
# problem = cp.Problem(objective, constraints)

# # Solve problem
# problem.solve(qcp = True)
# # problem.solve()

# print("Optimal value: ", problem.value)
# print("Optimal variable X: ")
# # print(X.value)
# err = X.value-H_f.cpu().numpy()-H_r.cpu().numpy()
# # print(X.value)
# print(np.sqrt(np.trace(np.dot(err.T,err)))/d)





In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import cvxpy as cp
# import numpy as np

true_hessian = H
dim = 512
# Perturbations and observed gradient changes
num_perturbations = 500
delta_w = torch.randn(num_perturbations, dim) * 0.01  # Small perturbations
delta_L = []

# optimal_w = next(model.parameters()).detach().clone()
for dw in delta_w:
    # print(W.shape)
    # dw[np.newaxis, :]
    dw = np.reshape(dw, (1, 512))
    # print(dw.shape)
    W_noisy = W + dw.numpy()
    # print(W_noisy.shape)
    W_noisy = torch.tensor(W_noisy)
    # pretrained_dict = model.state_dict()
    # pretrained_dict['fc.weight'] = W_noisy
    # model.load_state_dict(pretrained_dict)
    classifier.fc.weight.data =  W_noisy.cuda()
    # outputs = model(X_train)
    # loss_perturbed = criterion(outputs, y_train.float())
    outputs = classifier(forget_features)
    # print(outputs.shape)
    # print(forget_labels.shape)
    # loss_perturbed = quadratic_loss(outputs, forget_labels_onehot, classifier, mu) + psi* additional_regularization(b, classifier.fc.weight)
    loss_perturbed = quadratic_loss(outputs, forget_labels_onehot)
    # delta_L.append(loss_perturbed.item() - loss.item())
    delta_L.append(loss_perturbed.item() - loss_f.item())

delta_L = np.array(delta_L)
delta_L = delta_L.reshape(-1, 1)

# CVXPY problem to estimate Hessian
d = dim
X = cp.Variable((d, d), symmetric=True)

# Create a list to hold our quadratic forms for each perturbation
delta_w_matrix = np.stack([dw.numpy() for dw in delta_w])

# Calculate the quadratic forms more efficiently
quadratic_forms_vectorized = 0.5 * cp.sum(cp.multiply(delta_w_matrix @ X, delta_w_matrix), axis=1)

objective = cp.Minimize(cp.sum_squares(delta_L.flatten() - quadratic_forms_vectorized))

# Assuming the same constraints
# constraints = [X >> 0, cp.trace(X) <= 1, cp.trace(X) >= 0, X >> H_f]
constraints = [X >> 0, cp.trace(X) >= 0, X >> H_f]
# constraints += [H[i, i] <= 0.25 for i in range(d)]
prob = cp.Problem(objective, constraints)
prob.solve()

# Estimated Hessian
H_value = X.value

# Compute difference between true and estimated Hessian
hessian_diff = np.linalg.norm(H - H_value, 'fro')

print("True Hessian:")
print(true_hessian)
print("\nEstimated Hessian:")
print(H_value)

print("\nDifference (Frobenius norm):", hessian_diff/d**2)

True Hessian:
tensor([[0.0018, 0.0012, 0.0011,  ..., 0.0012, 0.0014, 0.0011],
        [0.0012, 0.0019, 0.0013,  ..., 0.0013, 0.0015, 0.0012],
        [0.0011, 0.0013, 0.0018,  ..., 0.0012, 0.0013, 0.0011],
        ...,
        [0.0012, 0.0013, 0.0012,  ..., 0.0020, 0.0014, 0.0012],
        [0.0014, 0.0015, 0.0013,  ..., 0.0014, 0.0025, 0.0014],
        [0.0011, 0.0012, 0.0011,  ..., 0.0012, 0.0014, 0.0019]])

Estimated Hessian:
[[ 0.34028893  0.02220291  0.02177161 ...  0.01165315  0.01164646
   0.01131745]
 [ 0.02220291  0.33128896  0.02959077 ...  0.01495659 -0.00177099
   0.01300776]
 [ 0.02177161  0.02959077  0.31818157 ...  0.00868902  0.00576731
   0.01626491]
 ...
 [ 0.01165315  0.01495659  0.00868902 ...  0.31652271 -0.00684098
   0.01006389]
 [ 0.01164646 -0.00177099  0.00576731 ... -0.00684098  0.32839508
  -0.00221488]
 [ 0.01131745  0.01300776  0.01626491 ...  0.01006389 -0.00221488
   0.31349571]]

Difference (Frobenius norm): 3.240337849929511e-05


In [None]:
print(X.value)

[[ 0.34028893  0.02220291  0.02177161 ...  0.01165315  0.01164646
   0.01131745]
 [ 0.02220291  0.33128896  0.02959077 ...  0.01495659 -0.00177099
   0.01300776]
 [ 0.02177161  0.02959077  0.31818157 ...  0.00868902  0.00576731
   0.01626491]
 ...
 [ 0.01165315  0.01495659  0.00868902 ...  0.31652271 -0.00684098
   0.01006389]
 [ 0.01164646 -0.00177099  0.00576731 ... -0.00684098  0.32839508
  -0.00221488]
 [ 0.01131745  0.01300776  0.01626491 ...  0.01006389 -0.00221488
   0.31349571]]


In [None]:
from torch.autograd import Variable
# H_r_aprx = X.value/len(train_features) - H_f.cpu().numpy()
H_r_aprx = X.value - H_f.cpu().numpy()
H_r_aprx = Variable(torch.Tensor(H_r_aprx).float()).cuda()
H_aprx = Variable(torch.Tensor(X.value).float()).cuda()
hess_grad_aprx = torch.linalg.inv(H_r_aprx)@grad_f

# hess_grad_aprx = len(remaining_features)*torch.linalg.inv(H_aprx)@grad_f/len(train_features)
# hess_grad_aprx = torch.linalg.inv(H_aprx)@grad_f
W = Variable(torch.Tensor(W).float()).cuda()
print(type(hess_grad_aprx))
print(type(W))
W_new_aprx = W.T + hess_grad_aprx.cuda()
print(type(W_new_aprx))
pretrained_dict = classifier.state_dict()
pretrained_dict['fc.weight'] = W_new_aprx.T
classifier.load_state_dict(pretrained_dict)

<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>


<All keys matched successfully>

In [None]:
classifier.eval()
correct = 0
total = 0

with torch.no_grad():
    for i in range(0, len(test_features), 32):
        inputs = test_features[i:i+32].cuda()
        labels = test_labels[i:i+32].cuda()

        outputs = classifier(inputs)
        _, predicted = outputs.max(1)

        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

print('Accuracy on the test set using approximate scrubbed model: %d %%' % (100 * correct / total))


classifier.eval()
correct = 0
total = 0



with torch.no_grad():
    for i in range(0, len(remaining_features), 32):
        inputs = remaining_features[i:i+32].cuda()
        labels = remaining_labels[i:i+32].cuda()

        outputs = classifier(inputs)
        _, predicted = outputs.max(1)

        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

print('Accuracy on the remaining train set using approximate scrubbed model: %d %%' % (100 * correct / total))


classifier.eval()
correct = 0
total = 0



with torch.no_grad():
    for i in range(0, len(forget_features), 32):
        inputs = forget_features[i:i+32].cuda()
        labels = forget_labels[i:i+32].cuda()

        outputs = classifier(inputs)
        _, predicted = outputs.max(1)

        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

print('Accuracy on the forget train set using approximate scrubbed model: %d %%' % (100 * correct / total))

Accuracy on the test set using approximate scrubbed model: 75 %
Accuracy on the remaining train set using approximate scrubbed model: 76 %
Accuracy on the forget train set using approximate scrubbed model: 46 %


In [None]:
score = membership_inference_attack(classifier, test_features, test_labels, forget_features, forget_labels, 2023)

print(np.mean(score))

FPR:0.37, FNR:0.00, FP58.00, TN100.00, TP42.00, FN0.00
FPR:0.38, FNR:0.00, FP60.00, TN100.00, TP40.00, FN0.00
FPR:0.36, FNR:0.00, FP57.00, TN100.00, TP43.00, FN0.00
FPR:0.37, FNR:0.00, FP58.00, TN100.00, TP42.00, FN0.00
FPR:0.30, FNR:0.00, FP42.00, TN100.00, TP58.00, FN0.00
0.725


In [None]:
model_MI = classifier

# remaining_activations = extract_features(model_MI, remaining_features)
# remaining_activations = np.vstack(remaining_activations)

# test_activations = extract_features(model_MI, test_features)
# test_activations = np.vstack(test_activations)

forget_activations = extract_features(model_MI, forget_features)
forget_activations = np.vstack(forget_activations)


# binary_classifier = train_binary_classifier(model_MI, remaining_activations, remaining_labels, test_activations, test_labels)

binary_test_features_tensor = torch.tensor(forget_activations, dtype=torch.float32).cuda()
binary_test_outputs = binary_classifier(binary_test_features_tensor)
# binary_test_outputs = binary_model.predict(binary_test_features_tensor.cpu())
binary_predictions = (torch.sigmoid(torch.tensor(binary_test_outputs)) > 0.5).cpu().numpy().astype(np.int)

# Evaluate the results
accuracy_binary = np.mean(binary_predictions == 1)  # Assuming D_f is correctly classified as 1
# accuracy_binary = np.mean(binary_test_outputs == 1)
print(f"Approx scrubbed attack success on D_f: {accuracy_binary}")

NameError: name 'binary_classifier' is not defined