# [Parameter-Efficient Fine-Tuning (PEFT)](https://huggingface.co/docs/peft/quicktour)

PEFT methods selectively adjust a small set of additional model parameters while keeping the majority of the pre-trained LLM's parameters unchanged. This significantly reduces computational and storage requirements and addresses the problem of catastrophic forgetting often seen during full fine-tuning. Additionally, PEFT methods outperform traditional fine-tuning in situations with limited data and demonstrate superior generalization to out-of-domain scenarios.

In [3]:
!pip install peft
# !pip install evaluate
# !pip install datasets

Collecting peft
  Downloading peft-0.15.0-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.15.0-py3-none-any.whl (410 kB)
Installing collected packages: peft
Successfully installed peft-0.15.0


In [None]:
import transformers
transformers.__version__

  from .autonotebook import tqdm as notebook_tqdm


'4.49.0'

In [4]:
import peft
peft.__version__

'0.15.0'

In [18]:
import torch, torchdata, torchtext
from torch import nn
import torch.nn.functional as F
from tqdm import tqdm
import random, math, time
from torch.autograd import Variable
import operator

import os
import torch
# Set GPU device
# os.environ["CUDA_VISIBLE_DEVICES"] = "1"

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

#make our work comparable if restarted the kernel
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

cuda


| Techniques     | All Params  | Trainable Params | Trainable % |
|----------------|-------------|------------------|-------------|
| BitFit         | 124,808,448 | 102,144          | 0.082       |
| Adapter        | 124,808,448 | 894,528          | 0.714       |
| Prompt Tuning  | 124,808,448 | 6,144            | 0.004       |
| Prefix  Tuning | 124,808,448 | 368,640          | 0.295       |
| LoRA           | 124,808,448 | 294,912          | 0.236       |

In [6]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def get_nb_trainable_parameters():
    r"""
    Returns the number of trainable parameters and number of all parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # if using DS Zero 3 and the weights are initialized empty
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        # Due to the design of 4bit linear layers from bitsandbytes
        # one needs to multiply the number of parameters by 2 to get
        # the correct number of parameters
        if param.__class__.__name__ == "Params4bit":
            num_params = num_params * 2

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params

    return trainable_params, all_param


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params, all_param = get_nb_trainable_parameters()

    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}")

## BitFit

<img src = "../figures/bitfit.pbm" width=700>

In [7]:
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, GPT2TokenizerFast

model_name_or_path = "gpt2"
tokenizer_name_or_path = "gpt2"

model = AutoModelForCausalLM.from_pretrained('gpt2')

In [8]:
# Freeze all parameters except biases
for name, param in model.named_parameters():
    if 'bias' not in name:
        param.requires_grad = False

In [9]:
print_trainable_parameters(model)

trainable params: 102144 || all params: 124439808 || trainable%: 0.08208305818022477


## Adapter

You can read more [Adapter paper](https://arxiv.org/pdf/1902.00751).

<img src = "../figures/adapter.webp" width=500>

In [7]:
# Caution!!!! Adapter-transformers have depreacted and conflicted 
# adapter-transformers required transformers version 4.28.1 which is quite old already
# there is no need to use anymore
# !pip install adapter-transformers
from transformers.adapters import GPT2AdapterModel
from transformers import AutoConfig

model_name_or_path = "gpt2"
config = AutoConfig.from_pretrained(model_name_or_path)
model = GPT2AdapterModel(config)
model.freeze_model()
model.add_causal_lm_head(
    head_name = 'lm_head',
)
model.add_adapter(adapter_name = 'adapter-name')
model

GPT2AdapterModel(
  (shared_parameters): ModuleDict()
  (transformer): GPT2Model(
    (shared_parameters): ModuleDict()
    (invertible_adapters): ModuleDict()
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): MergedLinear(
            in_features=768, out_features=2304, bias=True
            (loras): ModuleDict()
          )
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
          (prefix_tuning): PrefixTuningShim(
            (prefix_gates): ModuleDict()
            (pool): PrefixTuningPool(
              (prefix_tunings): ModuleDict()
            )
          )
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
  

In [10]:
print_trainable_parameters(model)

trainable params: 102144 || all params: 124439808 || trainable%: 0.08208305818022477


## Prompt Tuning
[Soft prompt tuning](https://arxiv.org/pdf/2104.08691) (Lester et al. 2021) concatenates the embeddings of the input tokens with a trainable tensor that can be optimized via backpropagation to improve the modeling performance on a target task. These soft prompts are continuous vectors in the model's embedding space that get optimized during training. Unlike traditional discrete prompts that use natural language tokens, soft prompts have no inherent meaning but learn to elicit the desired behavior from the frozen model through gradient descent. The technique is particularly effective for multi-task scenarios since each task requires storing only a small prompt vector (typically a few hundred parameters) rather than a full model copy. This approach not only maintains a minimal memory footprint but also enables rapid task switching by simply swapping prompt vectors without any model reloading.



<img src = "../figures/prompt.webp" width=500>

In [11]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import (
    get_peft_model, 
    PromptTuningInit, 
    PromptTuningConfig, 
    TaskType, 
    )

model_name_or_path = "gpt2"
tokenizer_name_or_path = "gpt2"

# Load base model
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)

peft_config = PromptTuningConfig(
    peft_type="PROMPT_TUNING",
    task_type=TaskType.CAUSAL_LM,
    num_virtual_tokens=20,
    prompt_tuning_init=PromptTuningInit.TEXT,
    prompt_tuning_init_text="Classify if the tweet is a complaint or not:",
    tokenizer_name_or_path=model_name_or_path,
)

# Create prompt-tunable model
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 15,360 || all params: 124,455,168 || trainable%: 0.0123


## Prefix Tuning
To add trainable tensors to each transformer block instead of only the input embeddings, as in soft prompt tuning. Also, we obtain the soft prompt embedding via fully connected layers

<img src = "../figures/prefix_adapterhub.png" width=300>

In [12]:
from transformers import AutoModel, AutoModelForCausalLM
from peft import get_peft_model, PrefixTuningConfig, TaskType

model_name_or_path = "gpt2"
tokenizer_name_or_path = "gpt2"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)

peft_config = PrefixTuningConfig(
    task_type=TaskType.CAUSAL_LM, 
    inference_mode=False, 
    num_virtual_tokens=20
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 368,640 || all params: 124,808,448 || trainable%: 0.2954


## LoRA (Low-Rank Adaptation)

LoRA works by adding pairs of rank decomposition matrices to transformer layers, typically focusing on attention weights. During inference, these adapter weights can be merged with the base model, resulting in no additional latency overhead. LoRA is particularly useful for adapting large language models to specific tasks or domains while keeping resource requirements manageable. You can read more about LoRA in the [LoRA paper](https://arxiv.org/pdf/2106.09685).

LoRA works by adding pairs of rank decomposition matrices to transformer layers, typically focusing on attention weights. During inference, these adapter weights can be merged with the base model, resulting in no additional latency overhead. LoRA is particularly useful for adapting large language models to specific tasks or domains while keeping resource requirements manageable.

<img src = "../figures/lora-2.png" width=500>

In [13]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, TaskType, get_peft_model

model_name_or_path = "gpt2"
tokenizer_name_or_path = "gpt2"

# Load base model
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)

peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM, 
    inference_mode=False, 
    r=8, 
    lora_alpha=32, 
    lora_dropout=0.1,
    bias="none",
)

# Create prompt-tunable model
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 294,912 || all params: 124,734,720 || trainable%: 0.2364




In [14]:
from transformers import AutoModelForCausalLM
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType

model_name_or_path = "gpt2"
tokenizer_name_or_path = "gpt2"

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, 
    inference_mode=False, 
    r=8, 
    lora_alpha=32, 
    lora_dropout=0.1
)


model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 294,912 || all params: 124,734,720 || trainable%: 0.2364


### Training with LoRA

[explore more](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling)

### Inference

## Comparison to Other Methods

When compared to other PEFT approaches, prompt tuning stands out for its efficiency. While LoRA offers low parameter counts and memory usage but requires loading adapters for task switching, prompt tuning achieves even lower resource usage and enables immediate task switching. Full fine-tuning, in contrast, demands significant resources and requires separate model copies for different tasks.

| Method | Parameters | Memory | Task Switching |
|--------|------------|---------|----------------|
| Prompt Tuning | Very Low | Minimal | Easy |
| LoRA | Low | Low | Requires Loading |
| Full Fine-tuning | High | High | New Model Copy |

When implementing prompt tuning, start with a small number of virtual tokens (8-16) and increase only if the task complexity demands it. Text initialization typically yields better results than random initialization, especially when using task-relevant text. The initialization strategy should reflect the complexity of your target task.

Training requires slightly different considerations than full fine-tuning. Higher learning rates often work well, but careful monitoring of prompt token gradients is essential. Regular validation on diverse examples helps ensure robust performance across different scenarios.


In [None]:
from datasets import load_dataset

# Load the simplified GoEmotions dataset
dataset = load_dataset("google-research-datasets/go_emotions", "simplified")
print(dataset)

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5427
    })
})


In [None]:
from transformers import AutoTokenizer

# Load the GPT-2 tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    # Tokenize the text
    tokenized = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
    
    # Convert labels to a 2D tensor (batch size × number of labels)
    tokenized["labels"] = [[float(label) for label in labels] for labels in examples["labels"]]
    
    return tokenized

In [5]:
tokenized_datasets = dataset.map(tokenize_function, batched=True)
print(tokenized_datasets)

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map: 100%|██████████| 43410/43410 [00:02<00:00, 21281.47 examples/s]
Map: 100%|██████████| 5426/5426 [00:00<00:00, 21111.00 examples/s]
Map: 100%|██████████| 5427/5427 [00:00<00:00, 20394.47 examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'id', 'input_ids', 'attention_mask'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels', 'id', 'input_ids', 'attention_mask'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels', 'id', 'input_ids', 'attention_mask'],
        num_rows: 5427
    })
})





In [16]:
train_dataset[0]

{'text': "My favourite food is anything I didn't have to cook myself.",
 'labels': [27],
 'id': 'eebbqej',
 'input_ids': [3666,
  12507,
  2057,
  318,
  1997,
  314,
  1422,
  470,
  423,
  284,
  4255,
  3589,
  13,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,

In [13]:
# Split the dataset into train and eval sets
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]

In [7]:
from transformers import GPT2Config, GPT2ForSequenceClassification

def initialize_student_model(teacher_model, layers_to_copy):
    """
    Initialize a 6-layer student model using specific layers from the 12-layer teacher model.
    
    Args:
        teacher_model: The 12-layer GPT-2 teacher model.
        layers_to_copy: List of layer indices to copy from the teacher model (e.g., [1, 3, 5, 7, 9, 11] for odd layers).
    
    Returns:
        student_model: The 6-layer student model initialized with the specified layers.
    """
    # Define a 6-layer GPT-2 configuration
    config = GPT2Config.from_pretrained("gpt2", num_hidden_layers=6, num_labels=28)  # 28 emotions in GoEmotions
    
    # Load the 6-layer GPT-2 model
    student_model = GPT2ForSequenceClassification(config)
    
    # Copy weights from the teacher model to the student model
    for i, layer_idx in enumerate(layers_to_copy):
        # Copy transformer layer weights
        student_model.transformer.h[i].load_state_dict(teacher_model.transformer.h[layer_idx].state_dict())
    
    # Copy embedding and positional embedding weights
    student_model.transformer.wte.load_state_dict(teacher_model.transformer.wte.state_dict())
    student_model.transformer.wpe.load_state_dict(teacher_model.transformer.wpe.state_dict())
    
    # Copy the classification head weights (if applicable)
    if hasattr(teacher_model, "classifier"):
        student_model.classifier.load_state_dict(teacher_model.classifier.state_dict())
    
    return student_model

In [6]:
from torch import nn
from transformers import GPT2ForSequenceClassification

class GPT2ForMultiLabelSequenceClassification(GPT2ForSequenceClassification):
    def __init__(self, config):
        super().__init__(config)
        # Replace the default classification head with a multi-label head
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask=None, labels=None):
        # Get the outputs from the base GPT-2 model
        outputs = self.transformer(input_ids, attention_mask=attention_mask)
        
        # Get the hidden states of the last layer
        hidden_states = outputs.last_hidden_state
        
        # Get the pooled output (CLS token for classification)
        pooled_output = hidden_states[:, 0, :]
        
        # Pass through the classification head
        logits = self.classifier(pooled_output)
        
        # Apply sigmoid for multi-label classification
        logits = self.sigmoid(logits)
        
        # Compute the loss if labels are provided
        loss = None
        if labels is not None:
            loss_fct = nn.BCELoss()  # Binary Cross-Entropy Loss for multi-label classification
            loss = loss_fct(logits, labels)
        
        return {"loss": loss, "logits": logits}

In [8]:
# Define a 6-layer GPT-2 configuration
config = GPT2Config.from_pretrained("gpt2", num_hidden_layers=6, num_labels=28)  # 28 emotions in GoEmotions

# Load the 6-layer GPT-2 model with the custom multi-label head
student_model_odd = GPT2ForMultiLabelSequenceClassification(config)
student_model_even = GPT2ForMultiLabelSequenceClassification(config)

In [9]:
from peft import LoraConfig, TaskType, get_peft_model

# Define LoRA configuration
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # Sequence classification task
    inference_mode=False,
    r=8,  # Rank of the low-rank matrices
    lora_alpha=32,  # Scaling factor
    lora_dropout=0.1,  # Dropout rate
    bias="none",  # No bias
)

# Apply LoRA to the Odd Layer student model
student_model_odd = get_peft_model(student_model_odd, peft_config)
student_model_odd.print_trainable_parameters()

# Apply LoRA to the Even Layer student model
student_model_even = get_peft_model(student_model_even, peft_config)
student_model_even.print_trainable_parameters()

trainable params: 190,492 || all params: 82,146,104 || trainable%: 0.2319
trainable params: 190,492 || all params: 82,146,104 || trainable%: 0.2319




In [10]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="gpt2-lora-6layer-goemotions-simplified",
    learning_rate=1e-3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    fp16=True,
)






In [None]:
# 
# 
# 

In [5]:
from transformers import GPT2Config, GPT2ForSequenceClassification

# Define a 6-layer GPT-2 configuration
config = GPT2Config.from_pretrained("gpt2", num_hidden_layers=6, num_labels=28)  # 28 emotions in GoEmotions

# Load the 6-layer GPT-2 model
student_model = GPT2ForSequenceClassification(config)

In [6]:
from peft import LoraConfig, TaskType, get_peft_model

# Define LoRA configuration
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # Sequence classification task
    inference_mode=False,
    r=8,  # Rank of the low-rank matrices
    lora_alpha=32,  # Scaling factor
    lora_dropout=0.1,  # Dropout rate
    bias="none",  # No bias
)

# Apply LoRA to the 6-layer student model
student_model = get_peft_model(student_model, peft_config)

# Print trainable parameters
student_model.print_trainable_parameters()

trainable params: 168,960 || all params: 82,103,040 || trainable%: 0.2058




In [7]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="gpt2-lora-6layer-goemotions-simplified",
    learning_rate=1e-3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    fp16=True,
)

In [8]:
from transformers import Trainer

trainer = Trainer(
    model=student_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [3]:
# Import necessary libraries
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
import torch
import evaluate
import numpy as np





In [4]:
# Load GoEmotions dataset
dataset = load_dataset("google-research-datasets/go_emotions", "simplified")
num_labels = 28  # GoEmotions has 27 emotions + neutral

In [5]:
# Initialize tokenizer and model
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [6]:
# Function to process multi-label to single-label
def process_labels(example):
    # Take the first label as the primary emotion
    example['labels'] = example['labels'][0] if example['labels'] else 0  # Default to 0 (neutral) if empty
    return example

In [7]:
# Create 6-layer student model configurations
def create_student_model(initialization_type="lora"):
    base_model = AutoModelForSequenceClassification.from_pretrained(
        model_name, 
        num_labels=num_labels,
        dropout=0.1,
        attention_dropout=0.1
    )
    
    num_layers = len(base_model.distilbert.transformer.layer)  # Should be 6
    
    if initialization_type == "lora":
        peft_config = LoraConfig(
            task_type="SEQ_CLS",
            inference_mode=False,
            r=8,
            lora_alpha=32,
            lora_dropout=0.1,
            target_modules=["q_lin", "v_lin"]
        )
        model = get_peft_model(base_model, peft_config)
    
    elif initialization_type == "odd":
        pretrained = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            dropout=0.1,
            attention_dropout=0.1
        )
        odd_mapping = [1, 3, 5]
        for i, target_idx in enumerate(range(0, num_layers, 2)):
            if i < len(odd_mapping):
                base_model.distilbert.transformer.layer[target_idx] = pretrained.distilbert.transformer.layer[odd_mapping[i]]
        model = base_model
    
    elif initialization_type == "even":
        pretrained = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            dropout=0.1,
            attention_dropout=0.1
        )
        even_mapping = [0, 2, 4]
        for i, target_idx in enumerate(range(0, num_layers, 2)):
            if i < len(even_mapping):
                base_model.distilbert.transformer.layer[target_idx] = pretrained.distilbert.transformer.layer[even_mapping[i]]
        model = base_model
    
    return model

In [8]:
# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)

# Prepare dataset
# First process labels, then tokenize
tokenized_datasets = dataset.map(process_labels)
tokenized_datasets = tokenized_datasets.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['text', 'id'])
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Split dataset
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["test"]

Map: 100%|██████████| 5426/5426 [00:00<00:00, 8032.91 examples/s]


In [9]:
# Define metrics
metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = metric.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")["f1"]
    return {"accuracy": acc, "f1": f1}

In [None]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    fp16=True if torch.cuda.is_available() else False,
    report_to="none"
)

In [None]:
# Function to train and evaluate model
def train_and_evaluate(model, name):
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    
    print(f"\nTraining {name} model...")
    trainer.train()
    
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"{name} - Trainable params: {trainable_params} || All params: {total_params}")
    
    eval_results = trainer.evaluate()
    print(f"{name} - Evaluation results: {eval_results}")
    return eval_results

# Train and compare all three configurations
models = {
    "LoRA": create_student_model("lora"),
    "Odd Layer": create_student_model("odd"),
    "Even Layer": create_student_model("even")
}

results = {}
for name, model in models.items():
    model.to(device)
    results[name] = train_and_evaluate(model, name)

# Compare results
print("\nPerformance Comparison:")
print("| Model | Accuracy | F1 Score | Trainable Parameters |")
print("|-------|----------|----------|---------------------|")
for name, result in results.items():
    trainable_params = sum(p.numel() for p in models[name].parameters() if p.requires_grad)
    print(f"| {name} | {result['eval_accuracy']:.4f} | {result['eval_f1']:.4f} | {trainable_params} |")

# Save the best model
best_model = models["LoRA"]
best_model.save_pretrained("./best_lora_model")
tokenizer.save_pretrained("./best_lora_model")

Map: 100%|██████████| 43410/43410 [00:01<00:00, 38231.09 examples/s]
Map: 100%|██████████| 5426/5426 [00:00<00:00, 30473.60 examples/s]
Map: 100%|██████████| 5427/5427 [00:00<00:00, 31927.28 examples/s]
Map: 100%|██████████| 43410/43410 [00:01<00:00, 28352.71 examples/s]
Map: 100%|██████████| 5426/5426 [00:00<00:00, 24453.64 examples/s]
Map: 100%|██████████| 5427/5427 [00:00<00:00, 26187.17 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']


Training LoRA model...


  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.8585,1.749838,0.488299,0.41359
2,1.6418,1.606364,0.521835,0.476836
3,1.5595,1.578487,0.52939,0.486098


LoRA - Trainable params: 759580 || All params: 67734584


LoRA - Evaluation results: {'eval_loss': 1.5784870386123657, 'eval_accuracy': 0.529390086604017, 'eval_f1': 0.4860976967543385, 'eval_runtime': 8.7276, 'eval_samples_per_second': 621.824, 'eval_steps_per_second': 38.957, 'epoch': 3.0}


  trainer = Trainer(



Training Odd Layer model...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.6001,1.519223,0.551502,0.524302
2,1.3463,1.440848,0.56956,0.557456
3,1.1605,1.438388,0.571955,0.557387


Odd Layer - Trainable params: 66975004 || All params: 66975004


Odd Layer - Evaluation results: {'eval_loss': 1.4383876323699951, 'eval_accuracy': 0.5719550396167311, 'eval_f1': 0.557387104127604, 'eval_runtime': 8.6893, 'eval_samples_per_second': 624.559, 'eval_steps_per_second': 39.128, 'epoch': 3.0}


  trainer = Trainer(



Training Even Layer model...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.5037,1.441789,0.571402,0.548567
2,1.2563,1.389252,0.577667,0.567097
3,1.0672,1.40071,0.582458,0.570338


Even Layer - Trainable params: 66975004 || All params: 66975004


Even Layer - Evaluation results: {'eval_loss': 1.3892524242401123, 'eval_accuracy': 0.5776672194582643, 'eval_f1': 0.5670966472203001, 'eval_runtime': 9.0121, 'eval_samples_per_second': 602.193, 'eval_steps_per_second': 37.727, 'epoch': 3.0}

Performance Comparison:
| Model | Accuracy | F1 Score | Trainable Parameters |
|-------|----------|----------|---------------------|
| LoRA | 0.5294 | 0.4861 | 759580 |
| Odd Layer | 0.5720 | 0.5574 | 66975004 |
| Even Layer | 0.5777 | 0.5671 | 66975004 |


('./best_lora_model\\tokenizer_config.json',
 './best_lora_model\\special_tokens_map.json',
 './best_lora_model\\vocab.txt',
 './best_lora_model\\added_tokens.json',
 './best_lora_model\\tokenizer.json')

## Test the model

In [13]:
# Import necessary libraries
from peft import PeftModel
from collections import Counter

# Load the trained LoRA model and tokenizer
model_path = "app/best_lora_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
base_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=28)
model = PeftModel.from_pretrained(base_model, model_path)
model = model.to(device)
model.eval()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): DistilBertForSequenceClassification(
      (distilbert): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0-5): 6 x TransformerBlock(
              (attention): DistilBertSdpaAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=76

In [15]:
# Load GoEmotions dataset
dataset = load_dataset("google-research-datasets/go_emotions", "simplified")
test_dataset = dataset["test"]

# Emotion labels from GoEmotions
emotion_labels = [
    "admiration", "amusement", "anger", "annoyance", "approval", "caring", "confusion", "curiosity",
    "desire", "disappointment", "disapproval", "disgust", "embarrassment", "excitement", "fear",
    "gratitude", "grief", "joy", "love", "nervousness", "optimism", "pride", "realization",
    "relief", "remorse", "sadness", "surprise", "neutral"
]

In [16]:
# 1. Single Example Inference
def predict_emotion(text):
    inputs = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)
        prediction = torch.argmax(outputs.logits, dim=-1).item()
    
    return emotion_labels[prediction]

# Test single example
sample_text = "I can't believe how amazing this day turned out!"
print("\n1. Single Example Inference:")
print(f"Text: {sample_text}")
print(f"Predicted Emotion: {predict_emotion(sample_text)}")


1. Single Example Inference:
Text: I can't believe how amazing this day turned out!
Predicted Emotion: surprise


In [19]:
# 2. Batch Inference
def batch_predict(texts):
    inputs = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
    
    return [emotion_labels[pred] for pred in predictions]

# Test batch inference on 5 random test samples
print("\n2. Batch Inference (5 random test samples):")
random_samples = test_dataset.shuffle(seed=SEED).select(range(5))
sample_texts = random_samples['text']
true_labels = [emotion_labels[label[0]] if label else "neutral" for label in random_samples['labels']]
predicted_labels = batch_predict(sample_texts)

for text, true, pred in zip(sample_texts, true_labels, predicted_labels):
    print(f"Text: {text}")
    print(f"True Emotion: {true}")
    print(f"Predicted Emotion: {pred}")
    print()



2. Batch Inference (5 random test samples):
Text: It still wouldn't make sense. The problem isn't the looks
True Emotion: disapproval
Predicted Emotion: neutral

Text: It's actually pretty cool in here today, not hot at all
True Emotion: admiration
Predicted Emotion: admiration

Text: Funny part is There ISNT a wall (but maybe should be) and there ARE already plenty of "Sensible" Gun laws
True Emotion: amusement
Predicted Emotion: amusement

Text: Hahaha so I’m no longer going to worry about it then lol
True Emotion: amusement
Predicted Emotion: amusement

Text: So. Fucking. Tone. Deaf.
True Emotion: annoyance
Predicted Emotion: anger



In [20]:
# 3. Emotion Distribution Analysis
def get_emotion_distribution(dataset, num_samples=100):
    samples = dataset.shuffle(seed=SEED).select(range(num_samples))
    texts = samples['text']
    predictions = batch_predict(texts)
    
    dist = Counter(predictions)
    total = sum(dist.values())
    return {emotion: count/total for emotion, count in dist.items()}

print("\n3. Emotion Distribution Analysis (100 samples):")
dist = get_emotion_distribution(test_dataset)
for emotion, freq in sorted(dist.items(), key=lambda x: x[1], reverse=True):
    print(f"{emotion}: {freq:.3f}")

# 4. Error Analysis
def error_analysis(dataset, num_samples=50):
    samples = dataset.shuffle(seed=SEED).select(range(num_samples))
    texts = samples['text']
    true_labels = [label[0] if label else 0 for label in samples['labels']]
    predicted_labels = [emotion_labels.index(pred) for pred in batch_predict(texts)]
    
    errors = []
    for text, true, pred in zip(texts, true_labels, predicted_labels):
        if true != pred:
            errors.append({
                'text': text,
                'true': emotion_labels[true],
                'predicted': emotion_labels[pred]
            })
    
    return errors

print("\n4. Error Analysis (first 5 errors from 50 samples):")
errors = error_analysis(test_dataset)
for error in errors[:5]:
    print(f"Text: {error['text']}")
    print(f"True Emotion: {error['true']}")
    print(f"Predicted Emotion: {error['predicted']}")
    print()



3. Emotion Distribution Analysis (100 samples):
neutral: 0.430
admiration: 0.130
curiosity: 0.060
gratitude: 0.060
amusement: 0.050
anger: 0.040
joy: 0.030
approval: 0.030
surprise: 0.030
confusion: 0.020
love: 0.020
excitement: 0.010
caring: 0.010
remorse: 0.010
disappointment: 0.010
disgust: 0.010
annoyance: 0.010
desire: 0.010
optimism: 0.010
fear: 0.010
sadness: 0.010

4. Error Analysis (first 5 errors from 50 samples):
Text: It still wouldn't make sense. The problem isn't the looks
True Emotion: disapproval
Predicted Emotion: neutral

Text: So. Fucking. Tone. Deaf.
True Emotion: annoyance
Predicted Emotion: anger

Text: If you think all manufactured foods haven't got those contaminants then you're sadly mistaken.
True Emotion: disapproval
Predicted Emotion: neutral

Text: Have nobody thought of the giant tech corporation selling overpriced, inferior products to sheep customers? That's odd.
True Emotion: disapproval
Predicted Emotion: curiosity

Text: In what ways has [NAME] out-c