In [1]:
%%capture
!pip uninstall -y wandb
!pip install -q accelerate==0.26.1 transformers==4.35.2 datasets==2.16.1 peft==0.7.1 evaluate==0.4.0

In [2]:
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType, PeftModelForSeq2SeqLM
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from evaluate import load as load_metric
from transformers import EarlyStoppingCallback
import os

2024-03-31 09:22:53.183331: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-31 09:22:53.183456: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-31 09:22:53.353890: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Data preparation

In [3]:
data = load_dataset('SetFit/20_newsgroups').remove_columns(['label']).rename_columns({'label_text': 'label'})
idx2label = [
    'alt.atheism', 
    'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x',
    'misc.forsale', 
    'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 
    'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space',
    'soc.religion.christian', 
    'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc'
]
label2idx = {v:k for k, v in enumerate(idx2label)}
data['train'][0]

Downloading readme:   0%|          | 0.00/734 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.91M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

{'text': 'I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.',
 'label': 'rec.autos'}

# Setting Up Tokenizer, Model and PEFT Config

In [4]:
model_id = "google/flan-t5-base"

tokenizer = AutoTokenizer.from_pretrained(model_id)
peft_config = LoraConfig(task_type=TaskType.SEQ_2_SEQ_LM, r=8, lora_alpha=16, lora_dropout=0.1)

model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
model = get_peft_model(model, peft_config)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [5]:
def tokenize_function(sample, i_max_length=256, o_max_length=16, padding="max_length"):
    model_inputs = tokenizer(
        sample["text"], max_length=i_max_length, padding=padding, truncation=True
    )
    labels = tokenizer(
        text_target=sample["label"], max_length=o_max_length, padding=padding, truncation=True
    )
    labels["input_ids"] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label]
        for label in labels["input_ids"]
    ]
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_data = data.map(tokenize_function, batched=True, remove_columns=['text', 'label'])
tokenized_data

Map:   0%|          | 0/11314 [00:00<?, ? examples/s]

Map:   0%|          | 0/7532 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 11314
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 7532
    })
})

# Setting Up Trainer¶

In [6]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=-100,
    pad_to_multiple_of=8
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./logs",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=2,
    predict_with_generate=True,
    learning_rate=5e-4,
    num_train_epochs=25,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    gradient_accumulation_steps=8
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['test'],
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [7]:
finetuned_model_id = "./lora_finetuned_model"

trainer.train()

trainer.model.save_pretrained(finetuned_model_id)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
0,No log,0.382555
2,No log,0.199997
4,No log,0.159613
6,0.613400,0.152085
8,0.613400,0.147469
10,0.613400,0.142963
12,0.163700,0.14095
14,0.163700,0.139873
16,0.138900,0.138958
18,0.138900,0.140224




# Merging the LoRA with the base model

In [8]:
pretrained_model = AutoModelForSeq2SeqLM.from_pretrained(model_id, num_labels=len(idx2label))
peft_model = PeftModelForSeq2SeqLM.from_pretrained(
    pretrained_model,
    finetuned_model_id,
    from_transformers=True,
)

model = peft_model.merge_and_unload()

# Evaluation

In [9]:
from transformers import GenerationConfig

def predict(idx):
    return tokenizer.decode(model.generate(
        input_ids=torch.tensor(tokenized_data['test']['input_ids'][idx:idx+1]), 
        generation_config=GenerationConfig(max_new_tokens=16)
    )[0][1:-1])

In [10]:
from tqdm import trange

predictions = [predict(i) for i in trange(len(tokenized_data['test']))]
references = data['test']['label']

100%|██████████| 7532/7532 [3:57:59<00:00,  1.90s/it]


In [11]:
metric = load_metric("exact_match")
scores = metric.compute(predictions=predictions, references=references)
scores

Downloading builder script:   0%|          | 0.00/5.67k [00:00<?, ?B/s]

{'exact_match': 0.6190918746680828}