In [2]:
import warnings
warnings.filterwarnings("ignore")
from datasets import load_dataset

raw_datasets = load_dataset("glue", "mrpc",download_mode="reuse_dataset_if_exists")
raw_datasets


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [40]:
from transformers import AutoTokenizer
checkpoint = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/463 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [41]:
def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

In [42]:
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [43]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [44]:
from transformers import TrainingArguments

training_args = TrainingArguments("out_files",
                                  per_device_train_batch_size=32,
                                  num_train_epochs=3,
                                  logging_steps=50,
                                  evaluation_strategy='epoch')

In [45]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_preds):
    predictions, labels = eval_preds
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

Using the latest cached version of the module from C:\Users\admin\.cache\huggingface\modules\datasets_modules\metrics\accuracy\9756d5fa4a0f9da966341741fc3926eafdc604b8276add51d5abbaa8958a25f9 (last modified on Mon May 13 15:29:26 2024) since it couldn't be found locally at accuracy, or remotely on the Hugging Face Hub.


In [50]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
model

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at t5-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


T5ForSequenceClassification(
  (transformer): T5Model(
    (shared): Embedding(32128, 768)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 768)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=768, out_features=768, bias=False)
                (k): Linear(in_features=768, out_features=768, bias=False)
                (v): Linear(in_features=768, out_features=768, bias=False)
                (o): Linear(in_features=768, out_features=768, bias=False)
                (relative_attention_bias): Embedding(32, 12)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseActDense(
                (wi): Linear(in_features=768, out_features=3072, bias=False)
                (wo): Linear(in_feat

In [62]:
from peft import AdaLoraConfig,TaskType,get_peft_model
config = AdaLoraConfig(
peft_type="ADALORA", task_type=TaskType.SEQ_CLS, r=8, lora_alpha=32, target_modules=["q","v"],
lora_dropout=0.01,
)
model = get_peft_model(model, config)
model

PeftModelForSequenceClassification(
  (base_model): AdaLoraModel(
    (model): PeftModelForSequenceClassification(
      (base_model): AdaLoraModel(
        (model): AdaLoraModel(
          (model): PeftModelForSequenceClassification(
            (base_model): LoraModel(
              (model): T5ForSequenceClassification(
                (transformer): T5Model(
                  (shared): Embedding(32128, 768)
                  (encoder): T5Stack(
                    (embed_tokens): Embedding(32128, 768)
                    (block): ModuleList(
                      (0): T5Block(
                        (layer): ModuleList(
                          (0): T5LayerSelfAttention(
                            (SelfAttention): T5Attention(
                              (q): adalora.SVDLinear(
                                (base_layer): lora.Linear(
                                  (base_layer): Linear(in_features=768, out_features=768, bias=False)
                                  (lora_dr

In [53]:
# from peft import LoraConfig, TaskType
# from peft import get_peft_model
# 
# peft_config = LoraConfig(task_type=TaskType.SEQ_CLS, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)
# model = get_peft_model(model, peft_config)
# model.print_trainable_parameters()

trainable params: 884,736 || all params: 224,380,418 || trainable%: 0.3943017879572717


In [60]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [61]:
trainer.train()

ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['label']