In [1]:
%%capture
!pip install accelerate==0.26.1 transformers==4.35.2 datasets==2.16.1 peft==0.7.1 evaluate==0.4.0
!pip uninstall wandb -y

In [2]:
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, DefaultDataCollator
from evaluate import load as load_metric
import os



# Data preparation

In [3]:
data = load_dataset('SetFit/20_newsgroups').remove_columns(['label_text'])
idx2label = [
    'alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x',
    'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space',
    'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc'
]
data['train'][0]

Downloading readme:   0%|          | 0.00/734 [00:00<?, ?B/s]



Downloading data:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.91M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

{'text': 'I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.',
 'label': 7}

# Setting Up Tokenizer, Model and PEFT Config

In [4]:
model_id = "microsoft/deberta-base"

tokenizer = AutoTokenizer.from_pretrained(model_id)
peft_config = LoraConfig(task_type=TaskType.SEQ_CLS, r=8, lora_alpha=16, lora_dropout=0.1)

model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=len(idx2label))
model = get_peft_model(model, peft_config)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.weight', 'pooler.dense.bias', 'classifier.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def tokenize_function(example):
    return tokenizer(example["text"], padding="max_length", truncation=True)

tokenized_data = data.map(tokenize_function, batched=True)
tokenized_data

Map:   0%|          | 0/11314 [00:00<?, ? examples/s]

Map:   0%|          | 0/7532 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 11314
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7532
    })
})

# Setting Up Trainer¶

In [6]:
training_args = TrainingArguments(
    output_dir="./logs", 
    evaluation_strategy="epoch",          
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=2,
)
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['test'],
)

In [7]:
finetuned_model_id = "./deberta-finetuned"

trainer.train()

trainer.model.save_pretrained(finetuned_model_id)

You're using a DebertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,2.9945,2.989354
2,2.9269,2.515925
3,2.3129,2.106838
4,2.0673,1.855309
5,1.7876,1.679237
6,1.6527,1.53723
7,1.5515,1.422956
8,1.3954,1.356065
9,1.3479,1.328373
10,1.3115,1.319608




# Merging the LoRA with the base model

In [8]:
pretrained_model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=len(idx2label))
peft_model = PeftModel.from_pretrained(
    pretrained_model,
    finetuned_model_id,
    from_transformers=True,
)

model = peft_model.merge_and_unload()

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.weight', 'pooler.dense.bias', 'classifier.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Evaluation

In [9]:
from scipy.special import softmax

pred_output = Trainer(model=model).predict(tokenized_data['test'])

f1_metric = load_metric('f1')
scores = f1_metric.compute(
    references=pred_output.label_ids,
    predictions=softmax(pred_output.predictions, axis=1).argmax(axis=1),
    average=None
)
for label, score in zip(idx2label, scores['f1']):
    print(label, score)

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

alt.atheism 0.0
comp.graphics 0.015113350125944584
comp.os.ms-windows.misc 0.00676818950930626
comp.sys.ibm.pc.hardware 0.0
comp.sys.mac.hardware 0.07692307692307693
comp.windows.x 0.0
misc.forsale 0.0
rec.autos 0.00856898029134533
rec.motorcycles 0.0
rec.sport.baseball 0.014166455856311664
rec.sport.hockey 0.017605633802816902
sci.crypt 0.0
sci.electronics 0.0
sci.med 0.0
sci.space 0.0
soc.religion.christian 0.0
talk.politics.guns 0.005449591280653952
talk.politics.mideast 0.009685230024213074
talk.politics.misc 0.0
talk.religion.misc 0.0
