In [1]:
from datasets import load_dataset
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)
from shared.eval import compute_metrics

import torch
import numpy as np

In [2]:
# Name of the pretrained checkpoint that the model and tokenizer are loaded from.
model_checkpoint = 'distilbert-base-uncased'

# Class index -> human-readable label, plus the inverse lookup derived from it
# so the two mappings cannot drift apart.
id2label = {0: "Negative", 1: "Positive"}
label2id = {label_name: class_index for class_index, label_name in id2label.items()}

In [3]:
# Tokenizer belonging to the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    add_prefix_space=True,
)

# Sequence-classification model built from the checkpoint, with a two-class
# head whose outputs are named through id2label / label2id.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

# Make sure a padding token exists; when one has to be added the vocabulary
# grows, so the model's embedding table is resized to the new tokenizer length.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

def tokenize_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the notebook's tokenizer.

    Side effect: sets ``tokenizer.truncation_side`` to ``"left"`` on the shared
    module-level tokenizer before encoding.

    Args:
        examples: mapping with a ``"text"`` entry (passed to the tokenizer as
            ``text=``).

    Returns:
        The tokenizer's output for that text, truncated to at most 512 tokens
        and requested as NumPy tensors. No padding is applied here.
    """
    raw_text = examples["text"]

    # Truncate from the left side when the input exceeds max_length.
    tokenizer.truncation_side = "left"
    return tokenizer(
        text=raw_text,
        truncation=True,
        max_length=512,
        return_tensors="np",
    )



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# define a list of examples (reused by the later prediction cells)
text_list = [
    "It was good.",
    "Not a fan, don't recommend.",
    "Better than the first one.",
    "This is not worth watching even once.",
    "This one is a pass.",
]

print("Untrained model predictions")
print("---------------------------")

# Inference only: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    for text in text_list:
        # tokenize text into a tensor of token ids (one sequence per call)
        inputs = tokenizer.encode(text, return_tensors="pt")
        # compute logits
        logits = model(inputs).logits
        # convert logits to label: reduce over the class axis explicitly.
        # Without `dim`, argmax runs over the flattened tensor, which is only
        # a valid class index when the batch holds exactly one sequence.
        predictions = torch.argmax(logits, dim=1)

        print(text + " - " + id2label[predictions.tolist()[0]])

Untrained model predictions
---------------------------
It was good. - Positive
Not a fan, don't recommend. - Positive
Better than the first one. - Positive
This is not worth watching even once. - Positive
This one is a pass. - Positive


In [5]:

# Print a two-line visual separator.
separator_line = "---------------------------"
print(separator_line, separator_line, sep="\n")

---------------------------
---------------------------


In [6]:
# Load the dataset, then run tokenize_function over it in batches; the printed
# summary shows the columns that tokenization added.
dataset = load_dataset(path="shawhin/imdb-truncated")
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)
print(tokenized_dataset)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})


In [7]:
# hyperparameters
lr = 1e-3
batch_size = 4
num_epochs = 10

# LoRA configuration: rank-4 adapters on the 'q_lin' modules for a
# sequence-classification task.
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    target_modules=['q_lin'],
    r=4,
    lora_alpha=32,
    lora_dropout=0.01,
)

# Wrap the base model with the adapters and report how many parameters train.
# NOTE: `model` is rebound to the wrapped model, so re-running this cell
# without re-creating `model` first would pass the wrapped model back in.
model = get_peft_model(model, peft_config=peft_config)
model.print_trainable_parameters()

# define training arguments: evaluate and checkpoint once per epoch, then
# reload the checkpoint with the best "accuracy" when training finishes.
training_args = TrainingArguments(
    output_dir=f"{model_checkpoint}-lora-text-classification",
    num_train_epochs=num_epochs,
    learning_rate=lr,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


In [8]:
# Collator built from the tokenizer; tokenize_function applies no padding, so
# batches are presumably padded here at collation time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# create trainer object from the wrapped model, the training arguments and the
# tokenized train/validation splits
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
)

# Run training; kept as the last expression so its return value is displayed.
trainer.train()

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.5706003308296204, 'eval_accuracy': 0.831, 'eval_runtime': 56.5136, 'eval_samples_per_second': 17.695, 'eval_steps_per_second': 4.424, 'epoch': 1.0}
{'loss': 0.4457, 'grad_norm': 21.62677001953125, 'learning_rate': 0.0008, 'epoch': 2.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.5944080948829651, 'eval_accuracy': 0.846, 'eval_runtime': 41.3169, 'eval_samples_per_second': 24.203, 'eval_steps_per_second': 6.051, 'epoch': 2.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.5694859623908997, 'eval_accuracy': 0.875, 'eval_runtime': 42.6747, 'eval_samples_per_second': 23.433, 'eval_steps_per_second': 5.858, 'epoch': 3.0}
{'loss': 0.2101, 'grad_norm': 0.09236983209848404, 'learning_rate': 0.0006, 'epoch': 4.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.5656962990760803, 'eval_accuracy': 0.893, 'eval_runtime': 45.7669, 'eval_samples_per_second': 21.85, 'eval_steps_per_second': 5.462, 'epoch': 4.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.7594931721687317, 'eval_accuracy': 0.888, 'eval_runtime': 44.9865, 'eval_samples_per_second': 22.229, 'eval_steps_per_second': 5.557, 'epoch': 5.0}
{'loss': 0.0676, 'grad_norm': 0.006757634691894054, 'learning_rate': 0.0004, 'epoch': 6.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.829696536064148, 'eval_accuracy': 0.887, 'eval_runtime': 46.8909, 'eval_samples_per_second': 21.326, 'eval_steps_per_second': 5.332, 'epoch': 6.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.8393865823745728, 'eval_accuracy': 0.89, 'eval_runtime': 46.5738, 'eval_samples_per_second': 21.471, 'eval_steps_per_second': 5.368, 'epoch': 7.0}
{'loss': 0.0192, 'grad_norm': 0.0001854841539170593, 'learning_rate': 0.0002, 'epoch': 8.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.8974039554595947, 'eval_accuracy': 0.887, 'eval_runtime': 43.7873, 'eval_samples_per_second': 22.838, 'eval_steps_per_second': 5.709, 'epoch': 8.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.9459480047225952, 'eval_accuracy': 0.891, 'eval_runtime': 46.1109, 'eval_samples_per_second': 21.687, 'eval_steps_per_second': 5.422, 'epoch': 9.0}
{'loss': 0.0034, 'grad_norm': 0.0033366032876074314, 'learning_rate': 0.0, 'epoch': 10.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.9698507189750671, 'eval_accuracy': 0.885, 'eval_runtime': 44.067, 'eval_samples_per_second': 22.693, 'eval_steps_per_second': 5.673, 'epoch': 10.0}
{'train_runtime': 1261.3616, 'train_samples_per_second': 7.928, 'train_steps_per_second': 1.982, 'train_loss': 0.1492086606502533, 'epoch': 10.0}


TrainOutput(global_step=2500, training_loss=0.1492086606502533, metrics={'train_runtime': 1261.3616, 'train_samples_per_second': 7.928, 'train_steps_per_second': 1.982, 'total_flos': 1112883852759936.0, 'train_loss': 0.1492086606502533, 'epoch': 10.0})

In [12]:
# Checkpoint path that the trainer's state records as the best one.
trainer_state = trainer.state
best_checkpoint = trainer_state.best_model_checkpoint
print(best_checkpoint)

distilbert-base-uncased-lora-text-classification/checkpoint-1000


In [13]:
# Pick the best available device instead of assuming Apple-silicon "mps", so
# the cell also runs on CUDA or CPU-only machines. "mps" keeps priority, so
# behaviour on the original machine is unchanged.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model.to(device)
# Switch to inference mode so the LoRA dropout layers cannot perturb the
# predictions.
model.eval()

print("Trained model predictions:")
print("---------------------------")

# Inference only: no gradients are needed.
with torch.no_grad():
    for text in text_list:
        # Token ids must live on the same device as the model.
        inputs = tokenizer.encode(text, return_tensors="pt").to(device)

        logits = model(inputs).logits
        # Index of the largest logit along the class axis (dim 1).
        predictions = torch.max(logits, 1).indices

        print(text + " - " + id2label[predictions.tolist()[0]])

Trained model predictions:
---------------------------
It was good. - Positive
Not a fan, don't recommend. - Negative
Better than the first one. - Positive
This is not worth watching even once. - Positive
This one is a pass. - Negative


In [15]:
# save model locally, in a "final" folder under the training output directory
save_directory = f"{model_checkpoint}-lora-text-classification/final"

# Write the trained model and the tokenizer to the same folder.
trainer.model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

print("Model and tokenizer saved to", save_directory)


Model and tokenizer saved to distilbert-base-uncased-lora-text-classification/final


In [17]:
# test: reload the fine-tuned classifier from disk and repeat the predictions.
#
# `save_directory` was written by saving the PEFT-wrapped model, so it holds an
# adapter rather than a standalone model. Load it explicitly: rebuild the base
# model named in the saved adapter config (with the same label mapping used
# for training), then attach the saved adapter weights. This avoids depending
# on implicit adapter handling inside AutoModel.from_pretrained, which builds
# the base model without the label mapping and whose behaviour may vary across
# library versions.
saved_adapter_config = PeftConfig.from_pretrained(save_directory)
base_model = AutoModelForSequenceClassification.from_pretrained(
    saved_adapter_config.base_model_name_or_path,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)
model = PeftModel.from_pretrained(base_model, save_directory)
tokenizer = AutoTokenizer.from_pretrained(save_directory)

# Pick the best available device instead of assuming Apple-silicon "mps".
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model.to(device)
# Inference mode: keeps dropout layers from perturbing the predictions.
model.eval()

print("Trained model predictions:")
print("---------------------------")

# Inference only: no gradients are needed.
with torch.no_grad():
    for text in text_list:
        # Token ids must live on the same device as the model.
        inputs = tokenizer.encode(text, return_tensors="pt").to(device)

        logits = model(inputs).logits
        # Index of the largest logit along the class axis (dim 1).
        predictions = torch.max(logits, 1).indices

        print(text + " - " + id2label[predictions.tolist()[0]])

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trained model predictions:
---------------------------
It was good. - Positive
Not a fan, don't recommend. - Negative
Better than the first one. - Positive
This is not worth watching even once. - Positive
This one is a pass. - Negative
