In [1]:
pip install transformers datasets scikit-learn peft torch

Note: you may need to restart the kernel to use updated packages.


## Load Datasets

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Load IMDb dataset
dataset = load_dataset("imdb")

# Load tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=256)

# Tokenize dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Remove raw text columns
tokenized_datasets = tokenized_datasets.remove_columns(["text"])

# Set format for PyTorch
tokenized_datasets.set_format(type='torch')

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

## Create Data Loaders and PEFT Implementation

In [3]:
from torch.utils.data import DataLoader

batch_size = 16
train_dataset = tokenized_datasets['train']
test_dataset = tokenized_datasets['test']

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [7]:
from transformers import AutoModelForSequenceClassification
from peft import get_peft_model, LoraConfig

# Load base model
model_name = "distilbert-base-uncased"  # Added model_name definition
base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Print model structure to identify correct module names
# Uncomment this to debug the model structure
# for name, module in base_model.named_modules():
#     print(name)

# Configure LoRA with corrected target modules
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    # Corrected target modules to match DistilBERT's actual architecture
    target_modules=["q_lin", "k_lin", "v_lin"],  # Simplified pattern that matches DistilBERT's attention layers
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS"
)

# Inject LoRA
model = get_peft_model(base_model, lora_config)
model.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): DistilBertForSequenceClassification(
      (distilbert): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0-5): 6 x TransformerBlock(
              (attention): DistilBertSdpaAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=76

In [5]:
for name, module in base_model.named_modules():
    print(name)


distilbert
distilbert.embeddings
distilbert.embeddings.word_embeddings
distilbert.embeddings.position_embeddings
distilbert.embeddings.LayerNorm
distilbert.embeddings.dropout
distilbert.transformer
distilbert.transformer.layer
distilbert.transformer.layer.0
distilbert.transformer.layer.0.attention
distilbert.transformer.layer.0.attention.dropout
distilbert.transformer.layer.0.attention.q_lin
distilbert.transformer.layer.0.attention.k_lin
distilbert.transformer.layer.0.attention.v_lin
distilbert.transformer.layer.0.attention.out_lin
distilbert.transformer.layer.0.sa_layer_norm
distilbert.transformer.layer.0.ffn
distilbert.transformer.layer.0.ffn.dropout
distilbert.transformer.layer.0.ffn.lin1
distilbert.transformer.layer.0.ffn.lin2
distilbert.transformer.layer.0.ffn.activation
distilbert.transformer.layer.0.output_layer_norm
distilbert.transformer.layer.1
distilbert.transformer.layer.1.attention
distilbert.transformer.layer.1.attention.dropout
distilbert.transformer.layer.1.attention.q

In [9]:
import torch
from torch.utils.data import Dataset

class IMDbDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [14]:
# Prepare encodings for train and test datasets
train_texts = list(dataset['train']['text'])
train_labels = list(dataset['train']['label'])

test_texts = list(dataset['test']['text'])
test_labels = list(dataset['test']['label'])

# Tokenize
train_encodings = tokenizer(train_texts, padding=True, truncation=True, max_length=256)
test_encodings = tokenizer(test_texts, padding=True, truncation=True, max_length=256)

In [38]:
import torch
from torch.utils.data import Dataset

class IMDbDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert each item explicitly to numpy array or list
        item = {key: torch.tensor(self.encodings[key][idx]) for key in self.encodings}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

## Tokenization 

In [39]:
# Convert to lists
train_texts = list(dataset['train']['text'])[:10]
train_labels = list(dataset['train']['label'])[:10]

test_texts = list(dataset['test']['text'])[:10]
test_labels = list(dataset['test']['label'])[:10]

# Tokenize
train_encodings = tokenizer(train_texts, padding=True, truncation=True, max_length=256)
test_encodings = tokenizer(test_texts, padding=True, truncation=True, max_length=256)

In [40]:
train_dataset = IMDbDataset(train_encodings, train_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)

In [41]:
from torch.utils.data import DataLoader

batch_size = 1
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

## Model Training

In [42]:
import torch
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

epochs = 1

print("Starting training...")
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs} starting")
    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []

    for batch in train_loader:
        optimizer.zero_grad()
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        logits = outputs.logits.detach().cpu().numpy()
        preds = np.argmax(logits, axis=1)
        labels = batch['labels'].cpu().numpy()

        all_preds.extend(preds.tolist())
        all_labels.extend(labels.tolist())

    epoch_acc = accuracy_score(all_labels, all_preds)
    epoch_f1 = f1_score(all_labels, all_preds, average='weighted')

    print(f"Epoch {epoch+1} finished. Loss: {total_loss:.4f}, Accuracy: {epoch_acc:.4f}, F1 Score: {epoch_f1:.4f}")

Starting training...
Epoch 1/1 starting
Epoch 1 finished. Loss: 1.7837, Accuracy: 0.9000, F1 Score: 0.9474


In [43]:
# Assuming you've already tokenized using tokenizer
encodings = tokenizer(list(dataset['train']['text']), padding=True, truncation=True, max_length=256)
labels = list(dataset['train']['label'])

train_dataset = IMDbDataset(encodings, labels)

# For test
encodings_test = tokenizer(list(dataset['test']['text']), padding=True, truncation=True, max_length=256)
labels_test = list(dataset['test']['label'])

test_dataset = IMDbDataset(encodings_test, labels_test)

In [44]:
# Save the model with LoRA adapters
model.save_pretrained("lora_distilbert_sentiment")
tokenizer.save_pretrained("lora_distilbert_sentiment")
print("Model saved to 'lora_distilbert_sentiment/'")

Model saved to 'lora_distilbert_sentiment/'


## Testing

In [45]:
def predict_text(text):
    inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=256).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=-1)
        pred_label = torch.argmax(probs, dim=-1).cpu().item()
        confidence = probs.max().cpu().item()
    label_str = "Positive" if pred_label == 1 else "Negative"
    return label_str, confidence

# Sample texts
texts = [
    "This movie was fantastic! I loved it.",
    "The film was boring and too long.",
    "An average movie, nothing special.",
    "I really enjoyed the story and acting."
]

for t in texts:
    label, conf = predict_text(t)
    print(f"Text: '{t}'\nPrediction: {label} (Confidence: {conf:.2f})\n")

Text: 'This movie was fantastic! I loved it.'
Prediction: Positive (Confidence: 0.83)

Text: 'The film was boring and too long.'
Prediction: Negative (Confidence: 0.94)

Text: 'An average movie, nothing special.'
Prediction: Negative (Confidence: 0.93)

Text: 'I really enjoyed the story and acting.'
Prediction: Positive (Confidence: 0.89)



In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# Load your model for inference
from transformers import AutoTokenizer
from peft import PeftModel
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("lora_distilbert_sentiment")
base_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
model = PeftModel.from_pretrained(base_model, "lora_distilbert_sentiment")
model.to(device)
model.eval()

# Prepare data
labels_true = list(dataset['test']['label'])
predicted_labels = []

for text in dataset['test']['text']:
    inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=256).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=-1)
        pred_label = torch.argmax(probs, dim=-1).cpu().item()
        predicted_labels.append(pred_label)

# Calculate metrics
accuracy = accuracy_score(labels_true, predicted_labels)
f1 = f1_score(labels_true, predicted_labels, average='weighted')

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test F1-Score: {f1:.4f}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Comparison Model without LoRA

In [56]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Load the pre-trained model (without fine-tuning)
baseline_model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
).to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [57]:
sample_texts = [
    "This movie was fantastic! I loved it.",
    "The film was boring and too long.",
    "An average movie, nothing special.",
    "I really enjoyed the story and acting."
]

In [58]:
for text in sample_texts:
    inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=256).to(device)
    with torch.no_grad():
        outputs = baseline_model(**inputs)
        probs = torch.softmax(outputs.logits, dim=-1)
        confidence, predicted_label_idx = torch.max(probs, dim=-1)
        predicted_label = "Positive" if predicted_label_idx.item() == 1 else "Negative"
        print(f"Text: '{text}'")
        print(f"Prediction: {predicted_label} (Confidence: {confidence.item():.2f})\n")

Text: 'This movie was fantastic! I loved it.'
Prediction: Negative (Confidence: 0.51)

Text: 'The film was boring and too long.'
Prediction: Negative (Confidence: 0.51)

Text: 'An average movie, nothing special.'
Prediction: Negative (Confidence: 0.52)

Text: 'I really enjoyed the story and acting.'
Prediction: Negative (Confidence: 0.53)

