In [None]:
!pip install peft
!pip install --upgrade ipywidgets jupyter notebook

In [2]:
import math
import os

import evaluate
import numpy as np
import torch
from datasets import load_dataset
from huggingface_hub import notebook_login
from peft import LoraConfig, PeftConfig, PeftModel, get_peft_model
from torch.optim import AdamW
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    get_scheduler,
)

In [20]:
# Silence the Hugging Face Hub symlinks warning (the flag only needs setting once).
# NOTE(review): huggingface_hub appears to read this flag when it is first
# imported — confirm; if so, this must run before the transformers/peft imports
# to have any effect.
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

In [67]:
# Load the "imdb" dataset and display the splits it provides.
imdb_dataset = load_dataset("imdb")
imdb_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

## Dataset

In [68]:
# Carve a validation set (15%, fixed seed) out of the original training split.
# The "test" split is passed through unchanged; any other split is dropped.
#
# The guard keeps this cell idempotent: the cell overwrites `imdb_dataset` with
# its own result, so without it a re-run would split the already-reduced
# training set a second time.
if "validation" not in imdb_dataset:
    train_val_split = imdb_dataset["train"].train_test_split(test_size=0.15, seed=42)

    imdb_dataset = {
        "train": train_val_split["train"],
        "validation": train_val_split["test"],
        "test": imdb_dataset["test"],
    }
imdb_dataset

{'train': Dataset({
     features: ['text', 'label'],
     num_rows: 21250
 }),
 'validation': Dataset({
     features: ['text', 'label'],
     num_rows: 3750
 }),
 'test': Dataset({
     features: ['text', 'label'],
     num_rows: 25000
 })}

In [81]:
# Shrink every split to a small subset so the demo trains quickly.
# The shuffle is seeded so the same rows are selected on every run.
SUBSAMPLE_SEED = 42
SUBSAMPLE_SIZES = {"train": 300, "test": 100, "validation": 100}

for split_name, n_rows in SUBSAMPLE_SIZES.items():
    split = imdb_dataset[split_name]
    # min() keeps the selection valid if a split is smaller than requested.
    n_keep = min(n_rows, len(split))
    imdb_dataset[split_name] = split.shuffle(seed=SUBSAMPLE_SEED).select(range(n_keep))
imdb_dataset["train"]


Dataset({
    features: ['text', 'label'],
    num_rows: 300
})

In [10]:
# Inspect the test split. (`imdb_test` is never defined in this notebook and
# would raise NameError on a fresh kernel.)
imdb_dataset["test"]

Dataset({
    features: ['text', 'label'],
    num_rows: 100
})

In [70]:
# Display the current contents of every split.
imdb_dataset

{'train': Dataset({
     features: ['text', 'label'],
     num_rows: 1000
 }),
 'validation': Dataset({
     features: ['text', 'label'],
     num_rows: 3750
 }),
 'test': Dataset({
     features: ['text', 'label'],
     num_rows: 100
 })}

In [82]:
# Base checkpoint shared by the tokenizer and the classification model.
checkpoint = "distilbert-base-uncased"

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# The collator pads each batch using the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Class index <-> human-readable label; label2id is the inverse of id2label.
id2label = {0: "Negative", 1: "Positive"}
label2id = {label_name: label_id for label_id, label_name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)
model = model.to(device)

# Truncate over-long inputs from the left, i.e. keep the END of the text.
# Configured once here instead of being re-assigned on every batch inside the
# function. Note this is a setting on the shared `tokenizer` object, so it also
# applies to any later call that truncates.
tokenizer.truncation_side = "left"


def tokenize_function(examples, max_length=512):
    """Tokenize the "text" field of a batch of examples.

    Args:
        examples: mapping with a "text" entry (a batch, as passed by
            ``Dataset.map(..., batched=True)``).
        max_length: maximum number of tokens kept per example; longer inputs
            are truncated.

    Returns:
        The tokenizer output for the batch. No padding is applied here and no
        tensor type is requested: sequences keep their individual lengths, so
        a batch is generally ragged and is left as plain lists.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=max_length,
    )

# Run the tokenizer over the train, validation and test splits,
# processing 32 examples per call.
tokenized_dataset = {
    split_name: imdb_dataset[split_name].map(tokenize_function, batched=True, batch_size=32)
    for split_name in ("train", "validation", "test")
}

tokenized_dataset

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 300/300 [00:00<00:00, 2072.97 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 1464.68 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 1294.83 examples/s]


{'train': Dataset({
     features: ['text', 'label', 'input_ids', 'attention_mask'],
     num_rows: 300
 }),
 'validation': Dataset({
     features: ['text', 'label', 'input_ids', 'attention_mask'],
     num_rows: 100
 }),
 'test': Dataset({
     features: ['text', 'label', 'input_ids', 'attention_mask'],
     num_rows: 100
 })}

In [83]:
# Dataset.remove_columns returns a NEW dataset rather than modifying in place,
# so the result has to be stored back; otherwise "text" silently stays.
# The membership check keeps the cell safe to re-run after the column is gone.
tokenized_dataset = {
    split_name: split.remove_columns(["text"]) if "text" in split.column_names else split
    for split_name, split in tokenized_dataset.items()
}
tokenized_dataset["test"]


Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 100
})

### Evaluation Metrics

In [84]:
# Load the accuracy and F1 metrics once; compute_metrics reuses them.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")


def compute_metrics(eval_preds):
    """Compute accuracy and binary F1 for one evaluation pass.

    Args:
        eval_preds: a pair ``(predictions, labels)``; the predictions are
            reduced to class ids with an argmax over axis 1.

    Returns:
        dict with the keys "accuracy" and "f1".
    """
    scores, references = eval_preds
    predicted_ids = np.argmax(scores, axis=1)

    accuracy_result = accuracy.compute(predictions=predicted_ids, references=references)
    f1_result = f1.compute(predictions=predicted_ids, references=references, average="binary")
    return {"accuracy": accuracy_result["accuracy"], "f1": f1_result["f1"]}

### Testing the untrained model

In [85]:
# define list of examples
text_list = ["It was good.", "Not a fan, don't recommed.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass."]

# Inputs must live on the same device as the model; the tokenizer returns CPU
# tensors, which fail against a model that was moved to the GPU.
model_device = next(model.parameters()).device
model.eval()  # inference mode (dropout off)
with torch.no_grad():  # forward passes only, no gradients needed
    for text in text_list:
        tokens = tokenizer(text, return_tensors="pt").to(model_device)
        logit = model(**tokens).logits
        print("logit: ", logit)
        predictions = torch.argmax(logit, dim=1)
        predicted_label = id2label[predictions.item()]

        print(f"{text} - {predicted_label}")

logit:  tensor([[0.0724, 0.0737]], grad_fn=<AddmmBackward0>)
It was good. - Positive
logit:  tensor([[0.0656, 0.0812]], grad_fn=<AddmmBackward0>)
Not a fan, don't recommed. - Positive
logit:  tensor([[0.0946, 0.0833]], grad_fn=<AddmmBackward0>)
Better than the first one. - Negative
logit:  tensor([[0.0766, 0.0968]], grad_fn=<AddmmBackward0>)
This is not worth watching even once. - Positive
logit:  tensor([[0.0917, 0.0947]], grad_fn=<AddmmBackward0>)
This one is a pass. - Positive


### Train Model

In [86]:
# LoRA settings for the sequence-classification task. Only modules named
# "q_lin" receive adapters.
lora_settings = {
    "task_type": "SEQ_CLS",
    "r": 4,
    "lora_alpha": 32,
    "lora_dropout": 0.01,
    "target_modules": ["q_lin"],
}
peft_config = LoraConfig(**lora_settings)
peft_config

LoraConfig(task_type='SEQ_CLS', peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=4, target_modules={'q_lin'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.01, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, eva_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)

In [87]:
# Wrap the base model with the LoRA adapters exactly once. The isinstance guard
# makes the cell safe to re-run: without it, `model = get_peft_model(model, ...)`
# would wrap the already-wrapped model again.
if not isinstance(model, PeftModel):
    model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


In [88]:
training_args = TrainingArguments(
    output_dir=checkpoint + "-lora-text-classification",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # NOTE(review): the optimizer is built manually below, so this value is
    # presumably not what reaches it — confirm.
    weight_decay=0.01,
    warmup_ratio=0,  # Keep warmup steps at 0
    logging_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Number of optimizer updates the linear schedule must span. The last, partial
# batch of each epoch is still a step, hence ceil rather than floor division
# (300 examples / batch 32 -> 10 steps per epoch, not 9); with floor division
# the learning rate reaches zero before training ends.
# Assumes gradient accumulation is left at 1 (it is not set above).
steps_per_epoch = math.ceil(len(tokenized_dataset["train"]) / training_args.train_batch_size)
num_training_steps = math.ceil(steps_per_epoch * training_args.num_train_epochs)

optimizer = AdamW(model.parameters(), lr=5e-5)

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# create trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    # Pass the tokenizer that actually produced the inputs, so it is the one
    # associated with this model — not a processor from another checkpoint.
    processing_class=tokenizer,
    optimizers=(optimizer, lr_scheduler),  # Pass manually defined optimizer & scheduler
    data_collator=data_collator,  # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics,
)

# train model
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# define list of examples
text_list = ["It was good.", "Not a fan, don't recommed.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass."]

# Inputs must live on the same device as the model; the tokenizer returns CPU
# tensors, which fail against a model sitting on an accelerator.
model_device = next(model.parameters()).device
# Training leaves the model in train mode; switch to eval so dropout does not
# randomise these predictions.
model.eval()
with torch.no_grad():  # forward passes only, no gradients needed
    for text in text_list:
        tokens = tokenizer(text, return_tensors="pt").to(model_device)
        logit = model(**tokens).logits
        print("logit: ", logit)
        predictions = torch.argmax(logit, dim=1)
        predicted_label = id2label[predictions.item()]

        print(f"{text} - {predicted_label}")

### OPTIONAL: Push Model to Hub

In [None]:
# option 1: notebook login
notebook_login()
hf_name = 'talibdaryabi' # your hf username or org name
model_id = hf_name + "/" + checkpoint + "-lora-text-classification" # you can name the model whatever you want
model.push_to_hub(model_id) # save model

# Trainer.push_to_hub takes a commit message as its first argument, not a
# repository id, so the target repo is set on the training arguments and the
# commit message is passed by keyword.
trainer.args.hub_model_id = model_id
trainer.push_to_hub(commit_message="Upload LoRA text-classification model")