In [1]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate -U
!pip install peft



In [2]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

In [3]:
imdb_dataset = load_dataset("imdb")

N = 500

rand_idx = np.random.randint(24999, size=N)

x_train = imdb_dataset['train'][rand_idx]['text']
y_train = imdb_dataset['train'][rand_idx]['label']

x_test = imdb_dataset['test'][rand_idx]['text']
y_test = imdb_dataset['test'][rand_idx]['label']

dataset = DatasetDict({'train':Dataset.from_dict({'label':y_train,'text':x_train}),
                             'validation':Dataset.from_dict({'label':y_test,'text':x_test})})

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 500
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 500
    })
})

In [5]:
np.array(dataset['train']['label']).sum()/len(dataset['train']['label'])

0.498

In [6]:
model_checkpoint = 'distilbert-base-uncased'
# model_checkpoint = 'roberta-base'

id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative":0, "Positive":1}

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))



In [9]:
def tokenize_function(examples):
    text = examples["text"]
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )

    return tokenized_inputs

In [10]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 500
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 500
    })
})

In [11]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [12]:
accuracy = evaluate.load("accuracy")

In [13]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

In [14]:
sentiment_examples = [
    "The movie was an absolute delight from start to finish!",
    "I wouldn't recommend this film to anyone; it was quite a letdown.",
    "A truly captivating experience with stunning performances and a gripping plot.",
    "This film is a waste of time; it fails to engage or entertain.",
    "One of the best movies I've seen this year; it exceeded all my expectations.",
    "Unfortunately, this movie did not live up to the hype; it was very disappointing.",
    "A feel-good movie with a perfect balance of humor and drama.",
    "The plot was predictable and the characters were bland. Not worth watching.",
    "This movie is a must-watch; it’s both entertaining and thought-provoking.",
    "I was bored throughout; the story was lackluster and the pacing was off."
]


print("Untrained model predictions:")
print("----------------------------")
for text in sentiment_examples:
    inputs = tokenizer.encode(text, return_tensors="pt")
    logits = model(inputs).logits
    predictions = torch.argmax(logits)
    print(text + " - " + id2label[predictions.tolist()])

Untrained model predictions:
----------------------------
The movie was an absolute delight from start to finish! - Negative
I wouldn't recommend this film to anyone; it was quite a letdown. - Negative
A truly captivating experience with stunning performances and a gripping plot. - Negative
This film is a waste of time; it fails to engage or entertain. - Negative
One of the best movies I've seen this year; it exceeded all my expectations. - Negative
Unfortunately, this movie did not live up to the hype; it was very disappointing. - Negative
A feel-good movie with a perfect balance of humor and drama. - Negative
The plot was predictable and the characters were bland. Not worth watching. - Negative
This movie is a must-watch; it’s both entertaining and thought-provoking. - Negative
I was bored throughout; the story was lackluster and the pacing was off. - Negative


In [15]:
sentiment_examples_with_category = {
    "The movie was an absolute delight from start to finish!": "Positive",
    "I wouldn't recommend this film to anyone; it was quite a letdown.": "Negative",
    "A truly captivating experience with stunning performances and a gripping plot.": "Positive",
    "This film is a waste of time; it fails to engage or entertain.": "Negative",
    "One of the best movies I've seen this year; it exceeded all my expectations.": "Positive",
    "Unfortunately, this movie did not live up to the hype; it was very disappointing.": "Negative",
    "A feel-good movie with a perfect balance of humor and drama.": "Positive",
    "The plot was predictable and the characters were bland. Not worth watching.": "Negative",
    "This movie is a must-watch; it’s both entertaining and thought-provoking.": "Positive",
    "I was bored throughout; the story was lackluster and the pacing was off.": "Negative"
}

In [16]:
peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=4,
                        lora_alpha=32,
                        lora_dropout=0.01,
                        target_modules = ['q_lin'])

In [17]:
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=4, target_modules={'q_lin'}, lora_alpha=32, lora_dropout=0.01, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False))

In [18]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


In [19]:
lr = 1e-3
batch_size = 4
num_epochs = 1

In [20]:
training_args = TrainingArguments(
    output_dir= model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)



In [21]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [22]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.342775,{'accuracy': 0.874}


Trainer is attempting to log a value of "{'accuracy': 0.874}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


TrainOutput(global_step=125, training_loss=0.4798190307617187, metrics={'train_runtime': 1304.4871, 'train_samples_per_second': 0.383, 'train_steps_per_second': 0.096, 'total_flos': 56421610718880.0, 'train_loss': 0.4798190307617187, 'epoch': 1.0})

In [23]:
print("Trained model predictions:")
print("--------------------------")
for text in sentiment_examples:
    inputs = tokenizer.encode(text, return_tensors="pt")
    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices
    print(text + " - " + id2label[predictions.tolist()[0]])

Trained model predictions:
--------------------------
The movie was an absolute delight from start to finish! - Positive
I wouldn't recommend this film to anyone; it was quite a letdown. - Negative
A truly captivating experience with stunning performances and a gripping plot. - Positive
This film is a waste of time; it fails to engage or entertain. - Negative
One of the best movies I've seen this year; it exceeded all my expectations. - Positive
Unfortunately, this movie did not live up to the hype; it was very disappointing. - Negative
A feel-good movie with a perfect balance of humor and drama. - Positive
The plot was predictable and the characters were bland. Not worth watching. - Negative
This movie is a must-watch; it’s both entertaining and thought-provoking. - Positive
I was bored throughout; the story was lackluster and the pacing was off. - Negative


In [24]:
sentiment_examples_with_category = {
    "The movie was an absolute delight from start to finish!": "Positive",
    "I wouldn't recommend this film to anyone; it was quite a letdown.": "Negative",
    "A truly captivating experience with stunning performances and a gripping plot.": "Positive",
    "This film is a waste of time; it fails to engage or entertain.": "Negative",
    "One of the best movies I've seen this year; it exceeded all my expectations.": "Positive",
    "Unfortunately, this movie did not live up to the hype; it was very disappointing.": "Negative",
    "A feel-good movie with a perfect balance of humor and drama.": "Positive",
    "The plot was predictable and the characters were bland. Not worth watching.": "Negative",
    "This movie is a must-watch; it’s both entertaining and thought-provoking.": "Positive",
    "I was bored throughout; the story was lackluster and the pacing was off.": "Negative"
}