# Lightweight Fine-Tuning Project

In this cell, we describe our choices for each of the following

* PEFT technique: [Low-Rank Adaptation (LoRA)](https://huggingface.co/docs/peft/main/en/conceptual_guides/lora)
* Model: [DistilBERT base model (uncased)](https://huggingface.co/distilbert/distilbert-base-uncased)
* Evaluation approach: There is no class imbalance in the chosen dataset so binary classification accuracy on the Test split was selected as the evaluation metric.
* Fine-tuning dataset: [imdb](https://huggingface.co/datasets/stanfordnlp/imdb)

## Loading and Evaluating a Foundation Model

In the cells below, we load our chosen pre-trained Hugging Face model and evaluate its performance prior to fine-tuning. This step includes loading an appropriate tokenizer and dataset.

In [1]:
# Import the datasets and transformers packages
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
from datasets import load_dataset
from transformers import set_seed
seed = 21
set_seed(seed)
# Load the train and test splits of the imdb dataset
splits = ['train', 'test']
ds = {split: ds for split, ds in zip(splits, load_dataset("imdb", split=splits))}

# Thin out the dataset to make it run faster for this example
for split in splits:
    ds[split] = ds[split].shuffle(seed=seed).select(range(500))

# Show the dataset
ds

  from .autonotebook import tqdm as notebook_tqdm


{'train': Dataset({
     features: ['text', 'label'],
     num_rows: 500
 }),
 'test': Dataset({
     features: ['text', 'label'],
     num_rows: 500
 })}

In [2]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    """Preprocess the imdb dataset by returning tokenized examples."""
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_ds = {}
for split in splits:
    tokenized_ds[split] = ds[split].map(preprocess_function, batched=True)

# Show the first example of the tokenized training set
print(tokenized_ds["train"][0]["input_ids"])



[101, 2054, 2081, 1996, 2434, 6359, 12851, 4569, 2001, 2009, 2001, 2081, 2011, 2111, 2007, 2053, 5166, 2040, 2020, 2074, 2108, 11333, 17413, 2005, 1037, 3232, 1997, 2420, 1012, 1012, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 2023, 2001, 2242, 2007, 1037, 5166, 1010, 2021, 2009, 2074, 2347, 1005, 1056, 2004, 2172, 4569, 1012, 2198, 2004, 7629, 1997, 5922, 2155, 4476, 2003, 2941, 2437, 2019, 3947, 2182, 2000, 2022, 21699, 1010, 2021, 2002, 2003, 3569, 2011, 20342, 5889, 1010, 10036, 2569, 3896, 1998, 4895, 11263, 10695, 2100, 18201, 2015, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 1996, 5436, 1012, 2852, 1012, 6080, 7389, 2063, 1006, 2004, 7629, 1007, 12976, 2013, 1037, 2413, 3827, 1998, 7288, 2002, 2003, 2183, 2000, 2404, 1037, 9811, 2121, 2006, 1996, 6106, 1997, 2605, 1012, 1012, 1012, 1996, 5394, 1010, 2010, 2413, 6513, 1998, 1996, 21025, 2480, 5302, 1011, 2066, 1000, 18001, 20856, 1000, 5630, 2027, 2024, 2183, 2000, 2644, 2032, 1012, 1012, 1012, 1026, 7987, 101

In [3]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    id2label={0: "NEGATIVE", 1: "POSITIVE"},  # For converting predictions to strings
    label2id={"NEGATIVE": 0, "POSITIVE": 1},
)

# Freeze all the parameters of the base model
# Hint: Check the documentation at https://huggingface.co/transformers/v4.2.2/training.html
for param in model.base_model.parameters():
    param.requires_grad = False
print(model)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [4]:
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}

training_args = TrainingArguments("test_trainer")
training_args.per_device_train_batch_size = 8
training_args.per_device_eval_batch_size = 8
training_args.evaluation_strategy = "epoch"
training_args.save_strategy = "epoch"
training_args.seed = seed
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

trainer.evaluate()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'eval_loss': 0.6928027272224426,
 'eval_accuracy': 0.526,
 'eval_runtime': 6.1431,
 'eval_samples_per_second': 81.392,
 'eval_steps_per_second': 10.255}

## Performing Parameter-Efficient Fine-Tuning

In the cells below, we create a 2 PEFT models from our loaded model by changing only the rank in LoRA configuration, run the training loops, and save the PEFT models' weights.

In [5]:
from peft import LoftQConfig, LoraConfig, get_peft_model, TaskType
# Define the LoRA Configuration
lora_config = LoraConfig(
    r=8, # Rank Number
    lora_alpha=16, # Scaling Factor
    target_modules=["q_lin", "k_lin","v_lin"], # Which layer to apply LoRA, usually only apply on MultiHead Attention Layer
    bias="lora_only",
    task_type=TaskType.SEQ_CLS, # Seqence to Classification Task
    modules_to_save=["classifier"], # Ensure that classifier parameters are also trained and serialized
)
lora_model = get_peft_model(model, lora_config)
print(lora_model.print_trainable_parameters())

trainable params: 827,138 || all params: 67,768,324 || trainable%: 1.2205
None


In [6]:
lora_training_args = TrainingArguments(
        output_dir="./results/distilbert_lora",
        logging_dir="./logs/distilbert_lora",
        learning_rate=2e-3,
        # Reduce the batch size if you don't have enough memory
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=5,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        load_best_model_at_end=True,
        seed=seed
    )
trainer = Trainer(
    model=lora_model,
    args=lora_training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)
trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4999,0.477982,0.842
2,0.3039,0.377013,0.844
3,0.1013,0.497831,0.858
4,0.0399,0.682289,0.858
5,0.0098,0.721156,0.86


Checkpoint destination directory ./results/distilbert_lora/checkpoint-63 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/distilbert_lora/checkpoint-126 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/distilbert_lora/checkpoint-189 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/distilbert_lora/checkpoint-252 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/distilbert_lora/checkpoint-315 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=315, training_loss=0.19097421244969442, metrics={'train_runtime': 96.286, 'train_samples_per_second': 25.964, 'train_steps_per_second': 3.272, 'total_flos': 337414748160000.0, 'train_loss': 0.19097421244969442, 'epoch': 5.0})

In [7]:
lora_model.save_pretrained("distilbert_lora_rank8")
# trainer.save_model("distilbert_lora_rank8")
trainer.evaluate()

{'eval_loss': 0.3701898753643036,
 'eval_accuracy': 0.854,
 'eval_runtime': 6.2033,
 'eval_samples_per_second': 80.602,
 'eval_steps_per_second': 10.156,
 'epoch': 5.0}

In [11]:
# Define the LoRA Configuration
lora_config_r16 = LoraConfig(
    r=16, # Rank Number
    lora_alpha=16, # Scaling Factor
    target_modules=["q_lin", "k_lin","v_lin"], # Which layer to apply LoRA, usually only apply on MultiHead Attention Layer
    bias="lora_only",
    task_type=TaskType.SEQ_CLS, # Seqence to Classification Task
    modules_to_save=["classifier"], # Ensure that classifier parameters are also trained and serialized
)
lora_model_r16 = get_peft_model(model, lora_config_r16)
print(lora_model_r16.print_trainable_parameters())

trainable params: 1,048,322 || all params: 67,989,508 || trainable%: 1.5419
None


In [12]:
lora_training_args = TrainingArguments(
        output_dir="./results/distilbert_lora_r16",
        logging_dir="./logs/distilbert_lora_r16",
        learning_rate=2e-3,
        # Reduce the batch size if you don't have enough memory
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=5,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        load_best_model_at_end=True,
        seed=seed
    )
trainer = Trainer(
    model=lora_model_r16,
    args=lora_training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)
trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4659,0.397507,0.852
2,0.2694,0.319861,0.866
3,0.083,0.432665,0.892
4,0.0091,0.611161,0.87
5,0.0024,0.576023,0.87


Checkpoint destination directory ./results/distilbert_lora_r16/checkpoint-63 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/distilbert_lora_r16/checkpoint-126 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/distilbert_lora_r16/checkpoint-189 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/distilbert_lora_r16/checkpoint-252 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/distilbert_lora_r16/checkpoint-315 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=315, training_loss=0.16595655268146878, metrics={'train_runtime': 96.3035, 'train_samples_per_second': 25.96, 'train_steps_per_second': 3.271, 'total_flos': 339113441280000.0, 'train_loss': 0.16595655268146878, 'epoch': 5.0})

In [13]:
lora_model_r16.save_pretrained("distilbert_lora_rank16")
# trainer.save_model("distilbert_lora_rank16")
trainer.evaluate()

{'eval_loss': 0.31921297311782837,
 'eval_accuracy': 0.876,
 'eval_runtime': 6.1645,
 'eval_samples_per_second': 81.11,
 'eval_steps_per_second': 10.22,
 'epoch': 5.0}

## QLoRA
QLoRA is for improving performance when training quantized models. 
That would entail finetune benchmarking with and without LoftQ initalization. 
Moreover, for LoftQ to work best, it is recommended to target as many layers (additional trainable parameters) with LoRA as possible. (https://huggingface.co/docs/peft/en/developer_guides/quantization)
Because of this additional computational load, such benchmarking was not performed.

## Performing Inference with a PEFT Model

In the cells below, we load the saved PEFT model weights and evaluate the performance of the trained PEFT models. These evaluations indicate that rank 8 LoRA performs slightly better than rank 16 LoRA for our case (keeping all other configurations the same).
The performance of both fine-tuned models is much higher (>80% accuracy) as compared to that of original model (52.6% accuracy).

In [8]:
from peft import AutoPeftModelForSequenceClassification
lora_model_trained = AutoPeftModelForSequenceClassification.from_pretrained("distilbert_lora_rank8")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
training_args = TrainingArguments("test_trainer")
training_args.per_device_train_batch_size = 8
training_args.per_device_eval_batch_size = 8
training_args.evaluation_strategy = "epoch"
training_args.save_strategy = "epoch"
training_args.seed = seed
trainer = Trainer(
    model=lora_model_trained,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

trainer.evaluate()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'eval_loss': 0.42082056403160095,
 'eval_accuracy': 0.832,
 'eval_runtime': 6.0819,
 'eval_samples_per_second': 82.212,
 'eval_steps_per_second': 10.359}

In [14]:
lora_model_trained = AutoPeftModelForSequenceClassification.from_pretrained("distilbert_lora_rank16")
training_args = TrainingArguments("test_trainer")
training_args.per_device_train_batch_size = 8
training_args.per_device_eval_batch_size = 8
training_args.evaluation_strategy = "epoch"
training_args.save_strategy = "epoch"
training_args.seed = seed
trainer = Trainer(
    model=lora_model_trained,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

trainer.evaluate()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'eval_loss': 0.392722487449646,
 'eval_accuracy': 0.82,
 'eval_runtime': 6.0868,
 'eval_samples_per_second': 82.145,
 'eval_steps_per_second': 10.35}