# DistilRoBERTa + LoRA — IMDB Sentiment Classifier
**Goal:** Fine-tune a DistilRoBERTa model with LoRA adapters (PEFT) for binary sentiment classification on the IMDB dataset.  


In [6]:
!pip install -q --upgrade pip
!pip install -q transformers datasets evaluate peft accelerate bitsandbytes wandb sentencepiece
!pip install -q gradio

import torch
print("torch:", torch.__version__, "cuda available", torch.cuda.is_available())
!nvidia-smi -L || true

torch: 2.8.0+cu126 cuda available False
/bin/bash: line 1: nvidia-smi: command not found


In [7]:
import os
import random
import json
from pathlib import Path
from datetime import datetime

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import load_dataset, DatasetDict
import evaluate


def set_seed(seed: int =42):
  """Seed the random number generators used in this notebook.

  Seeds Python's `random`, NumPy's global RNG (when NumPy is installed) and
  torch (CPU, plus every CUDA device when CUDA is available).

  Args:
    seed: integer seed applied to every generator.
  """
  random.seed(seed)

  # NumPy is not imported at file level, so import it locally and tolerate its
  # absence instead of making this helper a hard dependency on it.
  try:
    import numpy as np
  except ImportError:
    np = None
  if np is not None:
    # np.random.seed only accepts values in [0, 2**32 - 1]; fold larger seeds
    # into range so any seed accepted by `random`/`torch` keeps working.
    np.random.seed(seed % (2 ** 32))

  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

set_seed(42)


In [8]:
# --- Model and output location -------------------------------------------
MODEL_NAME = "distilroberta-base"
# Every run writes to its own timestamped directory.
OUTPUT_DIR = "/content/results/distilroberta-lora-{}".format(
    datetime.now().strftime("%Y%m%d-%H%M%S")
)
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

# --- Experiment settings -------------------------------------------------
MAX_LEN = 256                # token limit per review; longer inputs are truncated
BATCH_SIZE = 16              # per-device training batch size
EPOCHS = 2                   # number of training epochs
LR = 2e-4                    # learning rate handed to TrainingArguments
SEED = 42                    # seed for dataset shuffling and set_seed
USE_SMALL_SUBSET = True      # True: train/evaluate on a random subset only
SMALL_TRAIN_SAMPLES = 2000   # subset size taken from the train split
SMALL_EVAL_SAMPLES = 1000    # subset size taken from the test split




In [9]:
# Load the IMDB dataset through `datasets.load_dataset`; the result is a
# DatasetDict keyed by split name.
raw = load_dataset("imdb")
# Bare expression: show the split summary as the cell output.
raw

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [10]:
# Peek at the first three training rows to inspect the raw `text` / `label` fields.
raw['train'][:3]

{'text': ['I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far b

## Tokenizer & preprocessing


In [11]:
# Tokenizer that matches the checkpoint being fine-tuned.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess_function(examples):
  """Tokenize a batch of reviews, truncating each one to MAX_LEN tokens.

  `examples` is a batch mapping with a "text" entry. No padding is applied
  here, so sequences keep their own (truncated) lengths.
  """
  review_texts = examples["text"]
  return tokenizer(review_texts, truncation=True, max_length=MAX_LEN)


if USE_SMALL_SUBSET:
  # Shuffle with a fixed seed before selecting, so the subset is a reproducible
  # random sample rather than simply the first N rows. Sizes are clamped to the
  # split length so an oversized setting cannot raise an index error.
  small_train = raw["train"].shuffle(seed=SEED).select(
      range(min(SMALL_TRAIN_SAMPLES, len(raw["train"]))))
  small_test = raw["test"].shuffle(seed=SEED).select(
      range(min(SMALL_EVAL_SAMPLES, len(raw["test"]))))
  datasets = DatasetDict({"train": small_train, "test": small_test})
else:
  # Keep only the labelled splits. The `unsupervised` split is never used for
  # training or evaluation here, so tokenizing its 50k rows is wasted work.
  datasets = DatasetDict({"train": raw["train"], "test": raw["test"]})

# Tokenize in batches and drop the raw text, which the model does not consume.
tokenized = datasets.map(preprocess_function, batched=True, remove_columns=["text"])
# Rename the target column to `labels`, the name the Trainer cells use downstream.
tokenized = tokenized.rename_column("label", "labels")
# Return torch tensors when rows are indexed.
tokenized.set_format(type="torch")
tokenized

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [12]:
# Report how many examples each tokenized split contains.
print(f"train samples: {len(tokenized['train'])}")
print(f"test samples: {len(tokenized['test'])}")

# Decode the first training example back to text (special tokens dropped) and
# show its first 500 characters as a check on the tokenization.
first_example_ids = tokenized["train"][0]["input_ids"]
example = tokenizer.decode(first_example_ids, skip_special_tokens=True)
print("example text(truncated):", example[:500])

train samples: 2000
test samples: 1000
example text(truncated): There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes. Profiler looks crispy, Fortier looks classic. Profiler plots are quite simple. Fortier's plot are far more complicated... Fortier looks more like Prime Suspect, if we have to spot similarities... The main character is weak and weirdo, but have "clairvoyance". People like to compare, to judge, to evaluate. How about just enjoying? Funny thing too, people writing Fortier looks Am


In [13]:
# Load the checkpoint with a sequence-classification head for two labels.
# Nothing is frozen in this cell; the LoRA adapters are attached in a later cell.
num_labels = 2
base_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)
# Bare expression: display the module tree. The attention projection names
# printed here (query / key / value) are what LoRA `target_modules` must match.
base_model

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
           

## Set up LoRA

In [14]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
# For 8-bit (bitsandbytes) training, prepare the base model first with
#   base_model = prepare_model_for_kbit_training(base_model)
# This demo keeps full precision (Colab CPU / float32), so that step is skipped.

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    # This model names its attention projections `query` / `key` / `value`
    # (see the printed module tree). The short names "q", "v", "k", "o" that
    # used to be listed match no module here, so they were dead entries; only
    # `query` and `value` ever received adapters.
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS"
)
# Backward-compatible alias for the original, misspelled variable name.
lora_cofig = lora_config

# Wrap the base model with the LoRA adapters described by the config.
model = get_peft_model(base_model, lora_config)

# Count trainable parameters against the total.
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())

print(f"trainable params: {trainable_params} / {total_params} {100 * trainable_params / total_params:.3f}%")


trainable params: 739586 / 82859524 0.893%


In [15]:
# Collator handed to the Trainer, built from the same tokenizer used for
# preprocessing. Tokenization applied no padding, so batching relies on it.
# NOTE(review): presumably pads each batch to its longest sequence — confirm in the transformers docs.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Metric objects from the `evaluate` library, consumed by compute_metrics.
metric_acc = evaluate.load("accuracy")
metric_f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
  """Turn an evaluation prediction pair into accuracy and binary F1.

  `eval_pred` unpacks into `(logits, labels)`. The predicted class is the
  argmax over the last logits axis. Returns a dict with the keys "accuracy"
  and "f1".
  """
  logits, labels = eval_pred
  predicted = logits.argmax(-1)

  accuracy = metric_acc.compute(predictions=predicted, references=labels)["accuracy"]
  f1_score = metric_f1.compute(
      predictions=predicted, references=labels, average="binary"
  )["f1"]

  return {"accuracy": accuracy, "f1": f1_score}

In [19]:
import os
# Disable Weights & Biases through the environment as well as via `report_to`.
os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE*2,
    num_train_epochs=EPOCHS,
    # Evaluate and checkpoint once per epoch. The two strategies must match for
    # `load_best_model_at_end` to work.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_total_limit=2,
    # Pass the notebook-level seed through, so that changing SEED in the config
    # cell also changes the seed the Trainer uses.
    seed=SEED,
    # Mixed precision (fp16) is intentionally left off: this run has no CUDA device.
    report_to="none", # Explicitly disable wandb logging
)

# Create the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    # `tokenizer=` is deprecated in Trainer.__init__; `processing_class` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [20]:
# Training: re-seed immediately before the run, train, then save the model.
set_seed(SEED)

train_result = trainer.train()
trainer.save_model(OUTPUT_DIR)

# Record the training-set size next to the metrics reported by the run.
metrics = train_result.metrics
metrics.update(train_samples=len(tokenized["train"]))
print("Training metrics:", metrics)

# Write the metrics as pretty-printed JSON inside the run directory.
metrics_path = os.path.join(OUTPUT_DIR, "train_metrics.json")
with open(metrics_path, "w") as metrics_file:
  metrics_file.write(json.dumps(metrics, indent=2))



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5098,0.30145,0.867,0.864975
2,0.3142,0.2883,0.874,0.874502




Training metrics: {'train_runtime': 4577.0317, 'train_samples_per_second': 0.874, 'train_steps_per_second': 0.055, 'total_flos': 269478813696000.0, 'train_loss': 0.4090772476196289, 'epoch': 2.0, 'train_samples': 2000}


In [21]:
# Save the PEFT-wrapped model into its own sub-folder of the run directory.
# NOTE(review): expected to hold only the adapter weights, not the full base model — confirm.
adapter_dir = os.path.join(OUTPUT_DIR, "lora_adapter")
model.save_pretrained(adapter_dir)
print(f"Saved LoRA adapter to {adapter_dir}")

Saved LoRA adapter to /content/results/distilroberta-lora-20251018-084737/lora_adapter


In [22]:
# ---------- Eval ----------
# Run the Trainer's evaluation on the eval_dataset it was constructed with
# and print the returned metrics dict.
eval_metrics = trainer.evaluate()
print("Eval metrics:", eval_metrics)

# Show some sample predictions
def predict_texts(texts):
    """Predict a sentiment class id for each piece of raw text.

    Args:
        texts: a single string or an iterable of strings.

    Returns:
        A numpy array of integer class ids (argmax over the logits), one per
        input text. The gradio helper in this notebook treats 1 as "positive".
        An empty input yields an empty int64 array.
    """
    # Accept a bare string as a batch of one.
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        # Nothing to tokenize; return the same array type/dtype as the normal path.
        return torch.empty(0, dtype=torch.long).numpy()

    net = trainer.model
    # Switch to eval mode so dropout is disabled. Without this, predictions made
    # straight after training (before any evaluate() call) are stochastic.
    net.eval()

    enc = tokenizer(texts, truncation=True, max_length=MAX_LEN, padding=True, return_tensors="pt")
    # Move every input tensor onto the model's device.
    enc = {k: v.to(net.device) for k, v in enc.items()}
    with torch.no_grad():
        logits = net(**enc).logits
    return torch.argmax(logits, dim=-1).cpu().numpy()

samples = [
    "This movie was fantastic — I loved every minute of it!",
    "A boring and long film with nothing to offer.",
    "It had some good moments but overall it felt flat."
]
# Run inference once and reuse the result for both the summary line and the
# per-sample listing, instead of running the model twice on the same inputs.
sample_preds = predict_texts(samples)
print("Preds:", sample_preds)
for t, p in zip(samples, sample_preds):
    print(p, "->", t)




Eval metrics: {'eval_loss': 0.28830036520957947, 'eval_accuracy': 0.874, 'eval_f1': 0.8745019920318725, 'eval_runtime': 361.551, 'eval_samples_per_second': 2.766, 'eval_steps_per_second': 0.089, 'epoch': 2.0}
Preds: [1 0 1]
1 -> This movie was fantastic — I loved every minute of it!
0 -> A boring and long film with nothing to offer.
1 -> It had some good moments but overall it felt flat.


In [25]:
import gradio as gr

def gradio_predict(text):
    """Classify one piece of text and return "positive" or "negative".

    The text is sent to `predict_texts` as a batch of one. Class id 1 maps to
    "positive"; any other id maps to "negative".
    """
    predictions = predict_texts([text])
    class_id = predictions[0]
    if class_id == 1:
        return "positive"
    return "negative"

# Minimal text-in / text-out demo UI around the classifier.
iface = gr.Interface(
    fn=gradio_predict,
    inputs="text",
    outputs="text",
    title="IMDB Sentiment — DistilRoBERTa+LoRA",
)
# Launch inside the notebook; with share=True, Colab also prints a public link.
iface.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://9701894b7044832d20.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


