In [1]:
import torch
import transformers
from transformers import pipeline
from transformers import DistilBertModel, DistilBertForMaskedLM, DistilBertTokenizer
from optimum.quanto import freeze, quantize, qint8, WeightQBytesTensor
import datasets
from transformers import TrainingArguments
import numpy as np
import evaluate

  from .autonotebook import tqdm as notebook_tqdm
2025-02-25 22:01:09.110226: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740517269.163631   38436 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740517269.178172   38436 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-25 22:01:09.294712: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
model_id = "distilbert-base-uncased"
#model = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased", torch_dtype=torch.float16, attn_implementation="sdpa")
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

In [3]:
dataset = datasets.load_dataset("imdb")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [4]:
def preprocess(data):
    tokens = tokenizer(data["text"], truncation=True, padding = 'max_length',  max_length=512)
    tokens["label"] = data["label"]
    return tokens

In [5]:
tokens = dataset.map(preprocess, batched = True)

In [6]:
labels = tokens['train'].features['label'].names
num_labels = len(labels)
label2id, id2label = {}, {}

for idx, lbl in enumerate(labels):
    label2id[lbl] = idx
    id2label[idx] = lbl

In [7]:
small_train_dataset = tokens["train"].shuffle(seed=11).select(range(2000))
small_eval_dataset = tokens["train"].shuffle(seed=11).select(range(2000))

In [8]:
from transformers import DistilBertForSequenceClassification, AutoModelForSequenceClassification, DistilBertConfig, DataCollatorWithPadding

model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels = num_labels,
    id2label = id2label,
    label2id = label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")

def preprocess_logits_for_metrics(logits, labels):
    """
    Preprocess the logits to ensure they are in the correct format for metric computation.
    This function will be called during the evaluation process.
    """
    if isinstance(logits, tuple):  
        logits = logits[0]  # get logit tensors

    pred_ids = torch.argmax(logits, dim=-1)
    
    return pred_ids, labels
    
def compute_metrics(eval_pred):
    
    predictions, labels = eval_pred

    return accuracy.compute(predictions=predictions[0], references=labels)


In [10]:
from optimum.quanto import freeze, quantize, qint8
quantize(model, weights=qint8, activations=None)

In [11]:
tokens = tokens.remove_columns(["text"])
tokens = tokens.remove_columns(["attention_mask"])
tokens = tokens.rename_column("label", "labels")
tokens.set_format("torch")

In [12]:
small_train_dataset = tokens["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokens["test"].shuffle(seed=42).select(range(1000))

In [13]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8)
#train_dataloader = DataLoader(small_train_dataset, shuffle=True)

eval_dataloader = DataLoader(small_eval_dataset, batch_size=8)

In [14]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [15]:
from transformers import get_scheduler

num_epochs = 3

num_training_steps = num_epochs * len(train_dataloader)

lr_scheduler = get_scheduler(

    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps

)

In [16]:
import torch

from accelerate.test_utils.testing import get_backend

device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)

model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): QLinear(in_features=768, out_features=768, bias=True)
            (k_lin): QLinear(in_features=768, out_features=768, bias=True)
            (v_lin): QLinear(in_features=768, out_features=768, bias=True)
            (out_lin): QLinear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=Fal

In [17]:
from tqdm.auto import tqdm
from torch import nn

progress_bar = tqdm(range(num_training_steps))
model.train()
for batch in train_dataloader:
   #print(batch)
   target, data = batch.items()
   optimizer.zero_grad()
   output = model(**batch)
   #print(output)
   #loss = torch.nn.functional.nll_loss(output.logits, target[1])
   loss = output.loss
   #loss = nn.CrossEntropyLoss(output.logits.view(-1, num_labels), labels.view(-1))
   torch.autograd.backward(loss)
   #loss.backward()
   optimizer.step()
   progress_bar.update(1)

  0%|          | 0/375 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.

In [None]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        outputs = model(**batch)
        print("output: ", outputs)
        #loss = outputs.loss
        loss = torch.nn.functional.nll_loss(outputs.logits, batch.values()[0])
        print("loss: ", type(loss))
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        progress_bar.update(1)

  0%|          | 0/375 [01:34<?, ?it/s]


output:  SequenceClassifierOutput(loss=tensor(0.6621, grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1399,  0.0282],
        [ 0.1384,  0.0040],
        [ 0.1384, -0.0198],
        [ 0.1291,  0.0515],
        [ 0.0633,  0.1325],
        [ 0.1970,  0.0188],
        [ 0.2034,  0.0884],
        [ 0.1175,  0.0742]], grad_fn=<WeightQBytesLinearFunctionBackward>), hidden_states=None, attentions=None)


TypeError: 'dict_values' object is not subscriptable

In [None]:
print(model)


In [None]:
print(model.distilbert.transformer.layer[0].attention.q_lin.state_dict())

In [None]:
print(tokens["train"]["input_ids"][0].dtype)

In [None]:
for batch in train_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    print(model(**batch))
    break

In [None]:
for batch in train_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    print(qmodel(**batch))
    break

In [None]:
freeze(model)

In [None]:
print("acc after freeze:", trainer.evaluate())

quantize(model)
trainer.train()
freeze(model)
trainer.evaluate()