In [3]:
from transformers import AutoModelForSequenceClassification

# Load the base model
base_model_name = "microsoft/deberta-v3-small"
base_model = AutoModelForSequenceClassification.from_pretrained(base_model_name, num_labels=2)

# Print out the model architecture to check module names
print(base_model)


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-5): 6 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=T

In [4]:
from transformers import AutoModelForSequenceClassification
from peft import PeftModel, LoraConfig
from torch import nn

# Load the base model
base_model_name = "microsoft/deberta-v3-small"
base_model = AutoModelForSequenceClassification.from_pretrained(base_model_name, num_labels=2)

# Load PEFT (e.g., LoRA or adapters)

peft_config = LoraConfig(
    r=8,  # rank
    lora_alpha=16,  # scaling factor for the rank
    lora_dropout=0.1,  # dropout rate
    target_modules=["attention.self.query_proj", "attention.self.key_proj", "attention.self.value_proj"],  # specify the layers
)
peft_model = PeftModel(base_model,peft_config)


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
peft_model

PeftModel(
  (base_model): LoraModel(
    (model): DebertaV2ForSequenceClassification(
      (deberta): DebertaV2Model(
        (embeddings): DebertaV2Embeddings(
          (word_embeddings): Embedding(128100, 768, padding_idx=0)
          (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
          (dropout): StableDropout()
        )
        (encoder): DebertaV2Encoder(
          (layer): ModuleList(
            (0-5): 6 x DebertaV2Layer(
              (attention): DebertaV2Attention(
                (self): DisentangledSelfAttention(
                  (query_proj): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=8, bias=False)
                    )
                    (lora_B

In [6]:
peft_model.base_model.model.deberta.encoder.layer[0].attention

DebertaV2Attention(
  (self): DisentangledSelfAttention(
    (query_proj): lora.Linear(
      (base_layer): Linear(in_features=768, out_features=768, bias=True)
      (lora_dropout): ModuleDict(
        (default): Dropout(p=0.1, inplace=False)
      )
      (lora_A): ModuleDict(
        (default): Linear(in_features=768, out_features=8, bias=False)
      )
      (lora_B): ModuleDict(
        (default): Linear(in_features=8, out_features=768, bias=False)
      )
      (lora_embedding_A): ParameterDict()
      (lora_embedding_B): ParameterDict()
      (lora_magnitude_vector): ModuleDict()
    )
    (key_proj): lora.Linear(
      (base_layer): Linear(in_features=768, out_features=768, bias=True)
      (lora_dropout): ModuleDict(
        (default): Dropout(p=0.1, inplace=False)
      )
      (lora_A): ModuleDict(
        (default): Linear(in_features=768, out_features=8, bias=False)
      )
      (lora_B): ModuleDict(
        (default): Linear(in_features=8, out_features=768, bias=False)
 

In [7]:
import torch
import torch.nn as nn

class SelfAttentionFilter(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.query = nn.Linear(hidden_size, hidden_size)
        self.key = nn.Linear(hidden_size, hidden_size)
        self.value = nn.Linear(hidden_size, hidden_size)
        self.softmax = nn.Softmax(dim=-1)
    
    def forward(self, hidden_states):
        # Compute Q, K, V
        Q = self.query(hidden_states)
        K = self.key(hidden_states)
        V = self.value(hidden_states)

        # Compute scaled dot-product attention
        scores = torch.matmul(Q, K.transpose(-1, -2)) / (hidden_states.size(-1) ** 0.5)
        attention_weights = self.softmax(scores)

        # Apply the attention weights
        filtered_output = torch.matmul(attention_weights, V)
        return filtered_output


In [8]:
class CustomDebertaV3PeftModel(PeftModel):
    def __init__(self, base_model,peft_config):
        super().__init__(base_model,peft_config)
        self.hidden_size = base_model.config.hidden_size

        # Create self-attention filters for each attention layer
        self.filters = nn.ModuleList([
            SelfAttentionFilter(self.hidden_size)
            for _ in range(self.count_attention_layers())
        ])
    
    def count_attention_layers(self):
        # Count the attention layers in DeBERTaV3-small
        return sum(1 for name, module in self.base_model.named_modules() if "attention" in name.lower())

    def forward(self, input_ids, attention_mask=None, **kwargs):
        hidden_states = self.base_model.model.deberta.embeddings(input_ids)  # Embed input
        attention_layer_index = 0

        for name, module in self.base_model.model.deberta.encoder.named_children():
            # Apply attention layers with filter in between
            if "layer" in name.lower():  # Adjust to match DeBERTa's naming
                for sub_name, sub_module in module.named_children():
                    if "attention" in sub_name.lower():
                        # Apply the attention layer
                        hidden_states = sub_module(hidden_states, attention_mask)

                        # Apply the self-attention filter
                        hidden_states = self.filters[attention_layer_index](hidden_states)
                        attention_layer_index += 1
                    else:
                        # Apply other submodules (e.g., feed-forward layers)
                        hidden_states = sub_module(hidden_states, attention_mask)
        
        # Final classification head
        pooled_output = self.base_model.pooler(hidden_states)
        logits = self.base_model.classifier(pooled_output)
        return logits


In [9]:
# Wrap the PEFT model with custom logic
custom_peft_model = CustomDebertaV3PeftModel(peft_model,peft_config)



In [15]:
custom_peft_model.forward(torch.randint(low=0,high=100,size=(768,10)))

TypeError: 'NoneType' object is not subscriptable