In [None]:
pip install transformers torch datasets peft evaluate opacus accelerate -U

In [None]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    PeftType,
    LoraConfig,
)
from datasets import load_dataset
import evaluate
import torch
from torch.utils.data import DataLoader
from torch.optim import SGD
from torch import nn

from opacus.privacy_engine import PrivacyEngine

from tqdm import tqdm

# --- Experiment configuration -------------------------------------------------
# Hugging Face Hub checkpoint used for both the tokenizer and the classifier.
model_name_or_path = "prajjwal1/bert-tiny"
# Number of passes over the training split.
num_epochs = 5
# Learning rate handed to the SGD optimizers.
lr = 0.01
# Batch size used by the training and validation DataLoaders.
batch_size = 1024

# Seed torch's default generator so runs that draw from it can be repeated.
seed = 42
torch.manual_seed(seed)

In [None]:
# Load the tokenizer that belongs to the configured checkpoint; padding goes on the right.
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    padding_side="right",
)
# If the tokenizer reports no pad token id, fall back to its end-of-sequence id.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id


def preprocess_data(examples):
    """Tokenize the ``"sentence"`` entry of ``examples`` with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        examples: Mapping that provides the text under the ``"sentence"`` key.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    return tokenizer(
        examples["sentence"],
        max_length=128,
        padding="max_length",
        truncation=True,
    )

In [None]:
def load_cleaned_data(task):
    """Load and tokenize the dataset selected by ``task``.

    Args:
        task: Integer menu choice: 1 = SST2, 2 = QNLI, 3 = MNLI, 4 = QQP.
            Only SST2 is implemented so far.

    Returns:
        The tokenized GLUE SST-2 dataset with the ``idx`` and ``sentence``
        columns removed, ``label`` renamed to ``labels`` and the output
        format set to torch.

    Raises:
        NotImplementedError: For the known but not yet implemented tasks (2-4).
        ValueError: For any other value of ``task``.
    """
    if task == 1:
        print("SST2 Dataset")

        # Load SST-2 dataset
        dataset = load_dataset("glue", "sst2")
        tokenized_data = dataset.map(preprocess_data, batched=True)
        # Drop the raw text and index columns, keeping the tokenizer outputs and the label.
        tokenized_data = tokenized_data.remove_columns(["idx", "sentence"])
        # Rename so batches can be passed to the model as keyword arguments.
        tokenized_data = tokenized_data.rename_column("label", "labels")
        tokenized_data.set_format("torch")
        return tokenized_data

    # Fail loudly instead of returning None, which would only surface later
    # as an opaque error when the caller indexes the result.
    not_implemented = {2: "QNLI", 3: "MNLI", 4: "QQP"}
    if task in not_implemented:
        name = not_implemented[task]
        print(f"{name} Dataset")
        raise NotImplementedError(f"Loading the {name} dataset is not implemented yet")

    print("Invalid Dataset")
    raise ValueError(f"Unknown task {task!r}; expected 1 (SST2), 2 (QNLI), 3 (MNLI) or 4 (QQP)")


In [None]:
# Ask which GLUE task to run; load_cleaned_data currently only implements option 1 (SST2).
taskChoice = int(input("Enter \n1 for SST2, 2 for QNLI, 3 for MNLI and 4 for QQP: "))

tokenized_data = load_cleaned_data(taskChoice)

# Training batches are reshuffled on every pass over the data.
train_dataloader = DataLoader(tokenized_data['train'], shuffle=True, batch_size=batch_size)
# Accuracy does not depend on example order, so keep evaluation order fixed.
val_dataloader = DataLoader(tokenized_data['validation'], shuffle=False, batch_size=batch_size)

The dataset is tokenized, cleaned, and split into training and validation sets.

In [None]:
peft_config = LoraConfig(task_type="SEQ_CLS",  r=8, lora_alpha=32, lora_dropout=0.1)

# Device both models are placed on (GPU when available).
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# get_peft_model injects the adapters into the model it receives, so wrapping the
# same base model twice would make the non-private and the private model share
# every weight. Each run therefore gets its own freshly loaded base model.
model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, return_dict=True)
dp_base_model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, return_dict=True)

lora_model = get_peft_model(model, peft_config).to(device)
lora_dp_model = get_peft_model(dp_base_model, peft_config).to(device)

lora_model.print_trainable_parameters()
lora_dp_model.print_trainable_parameters()


The LoRA model is configured via the Hugging Face PEFT API. Its hyperparameters have been selected as given in the PEFT blog.

In [None]:
# Define optimizer and loss function
# One plain SGD optimizer per model: `optimizer` for the non-private run,
# `dp_optimizer` for the run that is later made private.
optimizer = SGD(lora_model.parameters(), lr=lr)
dp_optimizer = SGD(lora_dp_model.parameters(), lr=lr)
loss_fn = nn.CrossEntropyLoss()

# Linear decay without warm-up over the whole non-private run. The schedule is
# bound to `optimizer` only, so it has no effect on `dp_optimizer`.
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Move each trained model explicitly instead of relying on both wrappers
# sharing one base model; the trailing ';' keeps the cell from echoing the model.
lora_model.to(device)
lora_dp_model.to(device);

In [None]:
def evaluate_model(model,eval_dataloader,task):
    """Compute the accuracy of ``model`` over ``eval_dataloader``.

    Args:
        model: Model called with each batch as keyword arguments; its output
            must expose ``logits``.
        eval_dataloader: Iterable of dict batches that include a ``"labels"`` entry.
        task: Accepted for interface compatibility; not used here.

    Returns:
        The result of the ``accuracy`` metric's ``compute()`` call.
    """
    accuracy = evaluate.load("accuracy")
    model.eval()

    # No gradients are needed for scoring.
    with torch.no_grad():
        for raw_batch in eval_dataloader:
            inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}
            predicted = torch.argmax(model(**inputs).logits, dim=-1)
            accuracy.add_batch(predictions=predicted, references=inputs["labels"])

    return accuracy.compute()

### Without Differential Privacy

In [None]:
def trainModel(model,optimizer,train_dataloader,val_dataloader,loss_fn,lr_scheduler,tqdm,task,epochs=5,dp=False):
    """Train ``model`` for ``epochs`` epochs and print validation accuracy after each one.

    Args:
        model: Model called with each batch as keyword arguments; its output
            must expose ``logits``.
        optimizer: Optimizer stepped once per batch.
        train_dataloader: Iterable of dict batches that include ``"labels"``.
        val_dataloader: Batches handed to ``evaluate_model`` after every epoch.
        loss_fn: Callable taking ``(logits, labels)`` and returning the loss.
        lr_scheduler: Scheduler stepped once per batch, right after the optimizer.
        tqdm: Progress-bar wrapper applied to ``train_dataloader``.
        task: Forwarded unchanged to ``evaluate_model``.
        epochs: Number of passes over ``train_dataloader``.
        dp: Only selects the "with"/"without" wording of the progress message.

    Returns:
        None. Progress is reported through ``print``.
    """
    privacy_label = 'with' if dp else 'without'

    for epoch_idx in range(epochs):
        model.train()

        for inputs in tqdm(train_dataloader):
            # Forward pass on the target device.
            inputs = {key: value.to(device) for key, value in inputs.items()}
            logits = model(**inputs).logits
            batch_loss = loss_fn(logits, inputs["labels"])

            # Backward pass, parameter update, schedule update, then clear gradients.
            batch_loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        # Score the validation split at the end of every epoch.
        with torch.no_grad():
            val_accuracy = evaluate_model(model, val_dataloader, task)
        print(f"Epoch {epoch_idx+1}, Validation Accuracy {privacy_label} DP: {val_accuracy}")

    print("Training complete!")

In [None]:
# Train for exactly the number of epochs the scheduler length was computed from.
trainModel(lora_model,optimizer,train_dataloader,val_dataloader,loss_fn,lr_scheduler,tqdm,"sst2",epochs=num_epochs)

### With Differential Privacy

In [None]:
privacy_engine = PrivacyEngine()

# Put the model in training mode before wrapping it.
# NOTE(review): Opacus is expected to reject modules that are not in training mode - confirm.
lora_dp_model.train()

# Wrap model, optimizer and training loader so that training targets the
# requested (epsilon, delta) budget over the configured number of epochs.
lora_dp_model, dp_optimizer, dataloader = privacy_engine.make_private_with_epsilon(
    module=lora_dp_model,
    optimizer=dp_optimizer,
    data_loader=train_dataloader,
    target_epsilon=3,
    # delta = 1 / number of training examples
    target_delta=1 / tokenized_data['train'].num_rows,
    # Must equal the number of epochs actually trained, so take it from the config.
    epochs=num_epochs,
    max_grad_norm=0.2,
)



In [None]:
import numpy as np

# `lr_scheduler` is bound to the non-private `optimizer`, so stepping it here
# would never change the learning rate used by `dp_optimizer`. Build a linear
# schedule that drives the DP optimizer instead.
dp_lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=dp_optimizer,
    num_warmup_steps=0,
    num_training_steps=len(dataloader) * num_epochs,
)
# Same delta that is passed to the privacy engine; used to report the spent budget.
dp_delta = 1 / tokenized_data['train'].num_rows

# `num_epochs` must match the `epochs` value given to make_private_with_epsilon.
for epoch in range(num_epochs):
    lora_dp_model.train()
    losses = []
    for step, batch in enumerate(tqdm(dataloader)):

        dp_optimizer.zero_grad()
        # Forward pass
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = lora_dp_model(**batch)
        loss = loss_fn(outputs.logits, batch["labels"])
        losses.append(loss.item())

        # Backward pass and update
        loss.backward()
        dp_optimizer.step()
        dp_lr_scheduler.step()

    train_epoch_loss = np.mean(losses)
    print(f"Epoch {epoch+1}, mean training loss: {train_epoch_loss:.4f}")

    # Evaluate on validation set
    with torch.no_grad():
        val_accuracy = evaluate_model(lora_dp_model, val_dataloader, "sst2")
        print(f"Epoch {epoch+1}, Validation Accuracy: {val_accuracy}")

    # Report the privacy budget consumed so far.
    epsilon_spent = privacy_engine.get_epsilon(dp_delta)
    print(f"Epoch {epoch+1}, privacy spent: epsilon = {epsilon_spent:.2f} at delta = {dp_delta:.2e}")

print("Training complete")