In [1]:
%%capture
!pip uninstall -y wandb
!pip install -q accelerate==0.26.1 transformers==4.35.2 datasets==2.16.1 peft==0.7.1 evaluate==0.4.0

In [2]:
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType, PeftModelForSequenceClassification
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, DefaultDataCollator
from evaluate import load as load_metric
from transformers import EarlyStoppingCallback
import os

2024-03-29 12:15:27.396007: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-29 12:15:27.396119: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-29 12:15:27.571997: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Data preparation

In [3]:
data = load_dataset('SetFit/20_newsgroups').remove_columns(['label_text'])
idx2label = [
    'alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x',
    'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space',
    'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc'
]
data['train'][0]

Downloading readme:   0%|          | 0.00/734 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.91M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

{'text': 'I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.',
 'label': 7}

# Setting Up Tokenizer, Model and PEFT Config

In [4]:
model_id = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_id)
peft_config = LoraConfig(task_type=TaskType.SEQ_CLS, r=8, lora_alpha=16, lora_dropout=0.1)

model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=len(idx2label))
model = get_peft_model(model, peft_config)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def tokenize_function(example):
    return tokenizer(example["text"], padding="max_length", truncation=True)

tokenized_data = data.map(tokenize_function, batched=True)
tokenized_data

Map:   0%|          | 0/11314 [00:00<?, ? examples/s]

Map:   0%|          | 0/7532 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 11314
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7532
    })
})

# Setting Up Trainer¶

In [6]:
training_args = TrainingArguments(
    output_dir="./logs", 
    evaluation_strategy="epoch",          
    save_strategy="epoch",
    num_train_epochs=25,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=2,
    load_best_model_at_end=True,
    learning_rate=5e-4
)
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['test'],
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [7]:
finetuned_model_id = "./lora_finetuned_model"

trainer.train()

trainer.model.save_pretrained(finetuned_model_id)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,1.8032,1.23632
2,1.1725,1.165973
3,0.8943,1.106379
4,0.8098,1.116172
5,0.6938,1.146378
6,0.6254,1.209918




# Merging the LoRA with the base model

In [8]:
pretrained_model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=len(idx2label))
peft_model = PeftModelForSequenceClassification.from_pretrained(
    pretrained_model,
    finetuned_model_id,
    from_transformers=True,
)

model = peft_model.merge_and_unload()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Evaluation

In [9]:
from scipy.special import softmax

pred_output = Trainer(model=model).predict(tokenized_data['test'])

f1_metric = load_metric('f1')
scores = f1_metric.compute(
    references=pred_output.label_ids,
    predictions=softmax(pred_output.predictions, axis=1).argmax(axis=1),
    average=None
)
for label, score in zip(idx2label, scores['f1']):
    print(label, score)



Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

alt.atheism 0.4807965860597439
comp.graphics 0.629582806573957
comp.os.ms-windows.misc 0.605
comp.sys.ibm.pc.hardware 0.5201149425287356
comp.sys.mac.hardware 0.6387176325524044
comp.windows.x 0.7714987714987716
misc.forsale 0.7242921013412817
rec.autos 0.6164801627670395
rec.motorcycles 0.7144622991347341
rec.sport.baseball 0.8476821192052981
rec.sport.hockey 0.8560885608856088
sci.crypt 0.6758104738154614
sci.electronics 0.5564516129032259
sci.med 0.8020176544766708
sci.space 0.7922077922077922
soc.religion.christian 0.7080890973036341
talk.politics.guns 0.5956354300385109
talk.politics.mideast 0.7793923381770145
talk.politics.misc 0.4908180300500835
talk.religion.misc 0.12461059190031153


In [10]:
scores = f1_metric.compute(
    references=pred_output.label_ids,
    predictions=softmax(pred_output.predictions, axis=1).argmax(axis=1),
    average='weighted'
)
scores

{'f1': 0.6600370149642942}