In [1]:
pip install transformers datasets peft accelerate bitsandbytes

Note: you may need to restart the kernel to use updated packages.


In [2]:
#loading the dataset
from datasets import load_dataset
dataset = load_dataset('sms_spam')
print(dataset['train'][0])

{'sms': 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...\n', 'label': 0}


In [6]:
#tokenization
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(example):
    return tokenizer(example["sms"], truncation=True, padding="max_length", max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

In [7]:
print(tokenized_dataset["train"][0])

{'sms': 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...\n', 'label': 0, 'input_ids': [101, 2175, 2127, 18414, 17583, 2391, 1010, 4689, 1012, 1012, 2800, 2069, 1999, 11829, 2483, 1050, 2307, 2088, 2474, 1041, 28305, 1012, 1012, 1012, 25022, 2638, 2045, 2288, 26297, 28194, 1012, 1012, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [13]:
#formating the dataset for finetuning/training

# Step 1: Rename 'label' column to 'labels' for Trainer compatibility
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

# Step 2: Format the dataset for PyTorch
tokenized_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "token_type_ids", "labels"]
)


ValueError: Original column name label not in the dataset. Current columns in the dataset: ['sms', 'labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [15]:
#loading the model and peft = lora
from transformers import AutoModelForSequenceClassification
from peft import LoraConfig, get_peft_model, TaskType

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2  # 2 classes for ham and spam
                                                          )

#LoRA Configs
peft_config = LoraConfig(
    r=8,                        
    lora_alpha=16,               
    lora_dropout=0.1,
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,   
)

#wrap the model with LoRA 
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 296,450 || all params: 109,780,228 || trainable%: 0.2700


In [16]:
#training configs
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="./lora_spam_model",  
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_strategy="epoch",
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=200,
    report_to="none",
)

In [18]:
#data collator for padding batches
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [20]:
#trainer engine
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [22]:
#train/finetune the lora model to get the desired result/model
trainer.train()



Step,Training Loss
200,0.2604
400,0.0753
600,0.05
800,0.0369
1000,0.052




TrainOutput(global_step=1047, training_loss=0.09399825772902116, metrics={'train_runtime': 86566.8254, 'train_samples_per_second': 0.193, 'train_steps_per_second': 0.012, 'total_flos': 1103742924871680.0, 'train_loss': 0.09399825772902116, 'epoch': 3.0})

In [23]:
#saving the model
trainer.save_model("lora-spam-detector")

In [24]:
#saving the tokenizer
tokenizer.save_pretrained("lora-spam-detector")

('lora-spam-detector\\tokenizer_config.json',
 'lora-spam-detector\\special_tokens_map.json',
 'lora-spam-detector\\vocab.txt',
 'lora-spam-detector\\added_tokens.json',
 'lora-spam-detector\\tokenizer.json')

In [25]:
#loading the model
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel, PeftConfig


peft_config = PeftConfig.from_pretrained("lora-spam-detector")
base_model = AutoModelForSequenceClassification.from_pretrained(peft_config.base_model_name_or_path)
model = PeftModel.from_pretrained(base_model, "lora-spam-detector")
tokenizer = AutoTokenizer.from_pretrained("lora-spam-detector")

model.eval()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): BertForSequenceClassification(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default

In [26]:
#gradio demo
import gradio as gr
import torch

#prediction function
def classify_sms(message):
    inputs = tokenizer(
        message,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    )
    with torch.no_grad():
        outputs = model(**inputs)
        predicted_class = torch.argmax(outputs.logits, dim=1).item()
    return "🚫 Spam" if predicted_class == 1 else "✅ Ham"

#building the UI
demo = gr.Interface(
    fn=classify_sms,
    inputs=gr.Textbox(label="Enter SMS Message"),
    outputs=gr.Label(label="Prediction"),
    title="📱 SMS Spam Classifier",
    description="Enter an SMS message below and see if it's spam or not, powered by LoRA + BERT."
)

# Launch the app
demo.launch()

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


