# Lightweight Fine-Tuning Project

TODO: In this cell, describe your choices for each of the following

* PEFT technique: LoRa
* Model: distilbert-base-uncased
* Evaluation approach: Comparing Accuracy
* Fine-tuning dataset: sms_spam

## Loading and Evaluating a Foundation Model

TODO: In the cells below, load your chosen pre-trained Hugging Face model and evaluate its performance prior to fine-tuning. This step includes loading an appropriate tokenizer and dataset.

In [1]:
! pip install -q "datasets==2.15.0"
! pip install -U accelerate
! pip install -U transformers
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [41]:
# Load the sms_spam dataset
# See: https://huggingface.co/datasets/sms_spam

from datasets import load_dataset

# The sms_spam dataset only has a train split, so we use the train_test_split method to split it into train and test
dataset = load_dataset("sms_spam", split="train").select(range(100)).train_test_split(
    test_size=0.4, shuffle=True, seed=42
)

splits = ["train", "test"]

# View the dataset characteristics
dataset["train"]

Dataset({
    features: ['sms', 'label'],
    num_rows: 60
})

In [42]:
from transformers import AutoTokenizer

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Let's use a lambda function to tokenize all the examples
tokenized_dataset = {}
for split in splits:
    tokenized_dataset[split] = dataset[split].map(
        lambda x: tokenizer(x["sms"], truncation=True), batched=True
    )

# Inspect the available columns in the dataset
tokenized_dataset["train"]

Map: 100%|██████████| 60/60 [00:00<00:00, 4142.32 examples/s]
Map: 100%|██████████| 40/40 [00:00<00:00, 3856.03 examples/s]


Dataset({
    features: ['sms', 'label', 'input_ids', 'attention_mask'],
    num_rows: 60
})

In [43]:

from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label={0: "not spam", 1: "spam"},
    label2id={"not spam": 0, "spam": 1},
)

# Unfreeze all the model parameters.
# Hint: Check the documentation at https://huggingface.co/transformers/v4.2.2/training.html
for param in model.parameters():
    param.required_grad = True

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [44]:

import numpy as np
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}


# The HuggingFace Trainer class handles the training and eval loop for PyTorch for us.
# Read more about it here https://huggingface.co/docs/transformers/main_classes/trainer
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="./original_spam_not_spam",
        # Set the learning rate
        learning_rate = 2e-5,
        # Set the per device train batch size and eval batch size
        per_device_train_batch_size = 16,
        per_device_eval_batch_size = 16,
        # Evaluate and save the model after each epoch
        evaluation_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=1,
        weight_decay=0.01,
    ),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.662624,0.775


TrainOutput(global_step=4, training_loss=0.7166260480880737, metrics={'train_runtime': 2.4775, 'train_samples_per_second': 24.218, 'train_steps_per_second': 1.615, 'total_flos': 967632951120.0, 'train_loss': 0.7166260480880737, 'epoch': 1.0})

In [45]:
trainer.evaluate()
print(model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

## Performing Parameter-Efficient Fine-Tuning

TODO: In the cells below, create a PEFT model from your loaded model, run a training loop, and save the PEFT model weights.

In [46]:
from peft import LoraConfig, get_peft_model, AutoPeftModelForSequenceClassification
from transformers import AutoModelForSequenceClassification

# Set the PEFT configuration
config = LoraConfig(
    r=8,  # Number of attention heads to replace
    lora_alpha=32,  # Scaling factor for the Lora layers
    #target_modules=["query", "key", "value", "dense"],
    target_modules=["q_lin", "v_lin", "k_lin", "out_lin"],
    inference_mode=False,
    lora_dropout=0.1,  # Dropout rate for the Lora layers
    task_type="SEQ_CLS",  # Type of task (e.g., CAUSAL_LM, SEQ2SEQ)
)


model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label={0: "not spam", 1: "spam"},
    label2id={"not spam": 0, "spam": 1},
)

# Create the PEFT model
peft_model = get_peft_model(model, config)

# Print the trainable parameters
peft_model.print_trainable_parameters()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 887,042 || all params: 67,842,052 || trainable%: 1.307510568813573


In [47]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}


# The HuggingFace Trainer class handles the training and eval loop for PyTorch for us.
# Read more about it here https://huggingface.co/docs/transformers/main_classes/trainer
peft_trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="./peft_spam_not_spam",
        # Set the learning rate
        learning_rate = 5e-5,
        # Set the per device train batch size and eval batch size
        per_device_train_batch_size = 16,
        per_device_eval_batch_size = 16,
        # Evaluate and save the model after each epoch
        evaluation_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=1,
        weight_decay=0.01,
    ),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

# Train the PEFT model
peft_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.559011,0.925


TrainOutput(global_step=4, training_loss=0.6328960657119751, metrics={'train_runtime': 0.95, 'train_samples_per_second': 63.157, 'train_steps_per_second': 4.21, 'total_flos': 987538173600.0, 'train_loss': 0.6328960657119751, 'epoch': 1.0})

In [48]:
peft_trainer.evaluate()

{'eval_loss': 0.5590114593505859,
 'eval_accuracy': 0.925,
 'eval_runtime': 0.0847,
 'eval_samples_per_second': 472.112,
 'eval_steps_per_second': 35.408,
 'epoch': 1.0}

In [49]:

# Save the PEFT model weights
peft_model.save_pretrained("./peft_model_weights")

In [50]:
peft_trainer.save_model("./peft_model")

## Performing Inference with a PEFT Model

TODO: In the cells below, load the saved PEFT model weights and evaluate the performance of the trained PEFT model. Be sure to compare the results to the results from prior to fine-tuning.

In [51]:
from peft import PeftConfig, PeftModel
# load pretrained model with peft
peft_model_path = "peft_model_weights"

config = PeftConfig.from_pretrained(peft_model_path) # load config
print(config)


LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='distilbert-base-uncased', revision=None, task_type='SEQ_CLS', inference_mode=True, r=8, target_modules={'v_lin', 'out_lin', 'k_lin', 'q_lin'}, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={})


In [52]:
from peft import AutoPeftModelForSequenceClassification
lora_model = AutoPeftModelForSequenceClassification.from_pretrained(peft_model_path,  num_labels=2)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [53]:

# Evaluate the PEFT model
peft_trainer.evaluate()

# Compare the results to the results from prior to fine-tuning
original_accuracy = trainer.state.log_history[-1]["eval_accuracy"]
peft_accuracy = peft_trainer.state.log_history[-1]["eval_accuracy"]
print(f"Original Accuracy: {original_accuracy}")
print(f"PEFT Accuracy: {peft_accuracy}")


Original Accuracy: 0.775
PEFT Accuracy: 0.925
