In [None]:
import random

import numpy as np
import torch
from datasets import load_dataset
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer

# Single source of truth for every seeded component in this notebook.
SEED = 42

# Seed Python's `random`, NumPy's legacy global RNG and PyTorch in one pass.
# Looping (instead of ending the cell on a bare `torch.manual_seed(SEED)`
# expression) keeps the cell from echoing the returned Generator object,
# whose repr contains a memory address that changes on every run.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)

In [2]:
# Checkpoint used for both the tokenizer and the encoder.
model_name = "bert-base-uncased"

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the encoder, move it to the chosen device and switch it to eval mode
# (both `.to()` and `.eval()` return the module itself, so they chain).
model = AutoModel.from_pretrained(model_name).to(device).eval()

print(f"Model: {model_name}, Device: {device}")

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Model: bert-base-uncased, Device: cpu


In [3]:
dataset = load_dataset("ag_news")

# Subsample sizes: the first N rows of each split are used, in stored order.
N_TRAIN = 5000
N_TEST = 1000

# Slice rows first and pick columns afterwards: `split[:n]` yields a dict of
# column name -> list holding only the first n rows, so the full split's
# columns are never selected just to keep a small prefix.
train_subset = dataset["train"][:N_TRAIN]
test_subset = dataset["test"][:N_TEST]

train_texts, train_labels = train_subset["text"], train_subset["label"]
test_texts, test_labels = test_subset["text"], test_subset["label"]

# NOTE(review): the order is assumed to match the dataset's integer label
# encoding (0..3) -- confirm against dataset["train"].features["label"].names.
label_names = ["World", "Sports", "Business", "Sci/Tech"]
print(f"Train: {len(train_texts)}, Test: {len(test_texts)}")
print(f"Classes: {label_names}")

Train: 5000, Test: 1000
Classes: ['World', 'Sports', 'Business', 'Sci/Tech']


In [None]:
def get_cls_embeddings(texts, model, tokenizer, device, batch_size=32, max_length=128):
    """Extract first-token (CLS) embeddings from an encoder for a list of texts.

    The texts are tokenized and encoded in mini-batches under
    ``torch.no_grad()``; for every text the hidden state at sequence
    position 0 of ``last_hidden_state`` is kept (the ``[CLS]`` token for
    BERT-style tokenizers).

    This function does not change the model's train/eval mode; call
    ``model.eval()`` beforehand if deterministic (dropout-free) features are
    required.

    Args:
        texts: Sequence of strings to embed. Must support ``len()`` and
            slicing.
        model: Callable encoder whose output exposes ``last_hidden_state``.
        tokenizer: Callable accepting a list of strings plus the
            ``return_tensors``/``padding``/``truncation``/``max_length``
            keywords and returning a mapping of tensors.
        device: Device the tokenized tensors are moved to before the forward
            pass. The model is not moved here, so it must already be there.
        batch_size: Number of texts per forward pass. Must be positive.
        max_length: Truncation length, in tokens, passed to the tokenizer.

    Returns:
        ``np.ndarray`` with one row per input text. For empty ``texts`` an
        array of shape ``(0, hidden_size)`` is returned, where ``hidden_size``
        is read from ``model.config.hidden_size`` (0 if unavailable).

    Raises:
        ValueError: If ``batch_size`` is not a positive number.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # np.vstack([]) raises, so give empty input a well-formed empty result.
    if len(texts) == 0:
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        # float32 is an assumption about the model's usual output dtype.
        return np.empty((0, hidden_size), dtype=np.float32)

    embeddings = []

    for i in tqdm(range(0, len(texts), batch_size)):
        # Coerce to a plain list so tuple/array-like inputs tokenize as a batch.
        batch_texts = list(texts[i : i + batch_size])
        inputs = tokenizer(
            batch_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            outputs = model(**inputs)

        # Keep only sequence position 0 and move it to host memory.
        cls_emb = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(cls_emb)

    return np.vstack(embeddings)

In [5]:
# Embed the training subset; one row of X_train per training text.
print("Extracting train embeddings...")
X_train = get_cls_embeddings(
    texts=train_texts,
    model=model,
    tokenizer=tokenizer,
    device=device,
)
print(f"X_train shape: {X_train.shape}")

# Embed the held-out subset with the same (frozen) encoder.
print("\nExtracting test embeddings...")
X_test = get_cls_embeddings(
    texts=test_texts,
    model=model,
    tokenizer=tokenizer,
    device=device,
)
print(f"X_test shape: {X_test.shape}")

Extracting train embeddings...


100%|██████████| 157/157 [01:39<00:00,  1.57it/s]


X_train shape: (5000, 768)

Extracting test embeddings...


100%|██████████| 32/32 [00:20<00:00,  1.59it/s]

X_test shape: (1000, 768)





In [6]:
# Linear probe: fit a logistic-regression classifier on the frozen embeddings.
# `random_state` reuses the notebook-wide SEED constant so that changing the
# seed in one place also covers the classifier.
clf = LogisticRegression(max_iter=1000, random_state=SEED)
clf.fit(X_train, train_labels)

# Evaluate on the held-out embeddings.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(test_labels, y_pred)

print(f"Accuracy: {accuracy:.4f}\n")
# Per-class precision/recall/F1, with rows labelled by `label_names`.
print(classification_report(test_labels, y_pred, target_names=label_names))

Accuracy: 0.8580

              precision    recall  f1-score   support

       World       0.86      0.87      0.87       268
      Sports       0.94      0.93      0.94       274
    Business       0.73      0.80      0.76       205
    Sci/Tech       0.88      0.81      0.85       253

    accuracy                           0.86      1000
   macro avg       0.85      0.85      0.85      1000
weighted avg       0.86      0.86      0.86      1000

