In [1]:
from datasets import load_dataset, Dataset,DatasetDict

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

import numpy as np
import torch
from peft import (
    PeftModel,
    PeftConfig,
    get_peft_model,
    LoraConfig

)
import evaluate

In [2]:
model_checkpoint = "distilbert/distilbert-base-uncased"

id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,num_labels=2, label2id=label2id, id2label=id2label
    
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
dataset = load_dataset("shawhin/imdb-truncated")

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, clean_up_tokenization_spaces=True)

In [6]:
def tokenizer_function(examples):
    text = examples["text"]
    tokenizer.truncation_side = "left"

    tokenized_output = tokenizer(
        text, truncation=True, max_length=512, return_tensors="np"
    )

    return tokenized_output


In [7]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    model.resize_token_embeddings(len(tokenizer))
tokenized_dataset = dataset.map(tokenizer_function, batched=True)

tokenized_dataset

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [8]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [9]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

In [11]:
text_list = ["A cinematic masterpiece! 10/10 would watch again.",
    "Underwhelming experience. The plot was predictable.",
    "The special effects were mind-blowing, but the story was lacking.",
    "I loved the cast, but the pacing was slow.",
    "A beautifully shot film with a compelling narrative.",
    "Not my cup of tea, but I can see why others would enjoy it.",
    "The dialogue was witty and the characters were well-developed.",
    "A disappointing sequel that didn't live up to the original.",
    "A thought-provoking film that left me questioning reality.",
    "A fun, lighthearted movie perfect for a night in.",
    "i hate this shit"]

print("Untrained model predictions:")
print("---------------------------")

for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")

    logits = model(inputs).logits
    
    predictions = torch.argmax(logits, dim=-1)
    
    # print(f"Input: {inputs}")
    # print(f"Logits: {logits}")
    # print(f"Predictions: {predictions.item()}")
    print(f"{text} - {id2label[predictions.item()]}")
    


Untrained model predictions:
---------------------------
A cinematic masterpiece! 10/10 would watch again. - NEGATIVE
Underwhelming experience. The plot was predictable. - NEGATIVE
The special effects were mind-blowing, but the story was lacking. - NEGATIVE
I loved the cast, but the pacing was slow. - NEGATIVE
A beautifully shot film with a compelling narrative. - NEGATIVE
Not my cup of tea, but I can see why others would enjoy it. - NEGATIVE
The dialogue was witty and the characters were well-developed. - NEGATIVE
A disappointing sequel that didn't live up to the original. - NEGATIVE
A thought-provoking film that left me questioning reality. - NEGATIVE
A fun, lighthearted movie perfect for a night in. - NEGATIVE
i hate this shit - NEGATIVE


In [12]:
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    r=4,
    lora_alpha=32,
    lora_dropout=0.01,
    target_modules=["q_lin"]
)

In [13]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


In [14]:
lr = 1e-3
batch_size = 8
epochs = 10

training_args = TrainingArguments(  
    output_dir="./lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    weight_decay=0.01
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.2768518626689911, 'eval_accuracy': {'accuracy': 0.884}, 'eval_runtime': 21.929, 'eval_samples_per_second': 45.602, 'eval_steps_per_second': 5.7, 'epoch': 1.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.39668869972229004, 'eval_accuracy': {'accuracy': 0.877}, 'eval_runtime': 21.9114, 'eval_samples_per_second': 45.638, 'eval_steps_per_second': 5.705, 'epoch': 2.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.42255666851997375, 'eval_accuracy': {'accuracy': 0.883}, 'eval_runtime': 22.009, 'eval_samples_per_second': 45.436, 'eval_steps_per_second': 5.679, 'epoch': 3.0}
{'loss': 0.2501, 'grad_norm': 5.104808330535889, 'learning_rate': 0.0006, 'epoch': 4.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.47960373759269714, 'eval_accuracy': {'accuracy': 0.883}, 'eval_runtime': 22.0767, 'eval_samples_per_second': 45.297, 'eval_steps_per_second': 5.662, 'epoch': 4.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.5775741934776306, 'eval_accuracy': {'accuracy': 0.89}, 'eval_runtime': 22.1028, 'eval_samples_per_second': 45.243, 'eval_steps_per_second': 5.655, 'epoch': 5.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.5605072975158691, 'eval_accuracy': {'accuracy': 0.898}, 'eval_runtime': 22.2015, 'eval_samples_per_second': 45.042, 'eval_steps_per_second': 5.63, 'epoch': 6.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.6357624530792236, 'eval_accuracy': {'accuracy': 0.899}, 'eval_runtime': 22.1213, 'eval_samples_per_second': 45.205, 'eval_steps_per_second': 5.651, 'epoch': 7.0}
{'loss': 0.0437, 'grad_norm': 0.12736117839813232, 'learning_rate': 0.0002, 'epoch': 8.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.724678635597229, 'eval_accuracy': {'accuracy': 0.884}, 'eval_runtime': 22.1466, 'eval_samples_per_second': 45.154, 'eval_steps_per_second': 5.644, 'epoch': 8.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.690979540348053, 'eval_accuracy': {'accuracy': 0.901}, 'eval_runtime': 22.1523, 'eval_samples_per_second': 45.142, 'eval_steps_per_second': 5.643, 'epoch': 9.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.7199811935424805, 'eval_accuracy': {'accuracy': 0.894}, 'eval_runtime': 22.2845, 'eval_samples_per_second': 44.874, 'eval_steps_per_second': 5.609, 'epoch': 10.0}
{'train_runtime': 805.1337, 'train_samples_per_second': 12.42, 'train_steps_per_second': 1.553, 'train_loss': 0.11814543166160583, 'epoch': 10.0}


TrainOutput(global_step=1250, training_loss=0.11814543166160583, metrics={'train_runtime': 805.1337, 'train_samples_per_second': 12.42, 'train_steps_per_second': 1.553, 'total_flos': 1253694805157184.0, 'train_loss': 0.11814543166160583, 'epoch': 10.0})

In [17]:
model.to("cuda")

print("Trained model predictions:")
print("---------------------------")

for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt").to("cuda")

    logits = model(inputs).logits
    
    
    predictions = torch.argmax(logits, dim=-1)
    
    # print(f"Input: {inputs}")
    # print(f"Logits: {logits}")
    # print(f"Predictions: {predictions.item()}")
    print(f"{text} - {id2label[predictions.item()]}")


Trained model predictions:
---------------------------
A cinematic masterpiece! 10/10 would watch again. - POSITIVE
Underwhelming experience. The plot was predictable. - NEGATIVE
The special effects were mind-blowing, but the story was lacking. - NEGATIVE
I loved the cast, but the pacing was slow. - NEGATIVE
A beautifully shot film with a compelling narrative. - POSITIVE
Not my cup of tea, but I can see why others would enjoy it. - POSITIVE
The dialogue was witty and the characters were well-developed. - POSITIVE
A disappointing sequel that didn't live up to the original. - NEGATIVE
A thought-provoking film that left me questioning reality. - NEGATIVE
A fun, lighthearted movie perfect for a night in. - POSITIVE
i hate this shit - NEGATIVE
