Install Libraries

In [None]:
# Make sure you've run this and are on a GPU runtime
!pip install transformers datasets bitsandbytes accelerate evaluate



Load the FP32 (Baseline) and INT4 Models, Tokenizer, and Dataset

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, BitsAndBytesConfig
from datasets import load_dataset

# Checkpoint used throughout the notebook: a DistilBERT sequence classifier
# whose config reports finetuning_task "imdb", so the accuracy measured later
# is a real score that can be compared across precisions.
model_name = "textattack/distilbert-base-uncased-imdb"

# (1) Full-precision baseline, loaded with the library defaults.
model_fp32 = AutoModelForSequenceClassification.from_pretrained(model_name)

print("--- Original FP32 Model (Fine-tuned on IMDB) ---")
# The config printout lists the architecture, dtype and fine-tuning task.
print(model_fp32.config)

# (2) bitsandbytes settings: weights stored in 4-bit NF4, compute in float16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# (3) The same checkpoint again, this time quantized while loading.
# device_map="auto" delegates device placement to the library
# (presumably the GPU on a GPU runtime).
model_int4 = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)

print("\n--- Quantized INT4 Model ---")
# Prints the module tree; the quantized linear layers are expected to appear
# as 'Linear4bit'.
print(model_int4)

# (4) A single tokenizer serves both models (same checkpoint).
tokenizer = AutoTokenizer.from_pretrained(model_name)

# (5) IMDB reviews. Evaluation uses the 'test' split only: the checkpoint was
# presumably fine-tuned on 'train', so scoring it there would not be a fair
# measurement. A fixed-seed shuffle followed by the first 1000 rows keeps the
# run quick and repeatable.
imdb = load_dataset("imdb")
test_dataset = (
    imdb["test"]
    .shuffle(seed=42)
    .select(range(1000))
)

print(f"\nLoaded {len(test_dataset)} testing examples.")

--- Original FP32 Model (Fine-tuned on IMDB) ---
DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "dtype": "float32",
  "finetuning_task": "imdb",
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.57.1",
  "vocab_size": 30522
}


--- Quantized INT4 Model ---
DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Tran

Reload the Tokenizer, Dataset, and INT4 Model (Recovery Cell After a Runtime Restart)


In [None]:
# --- Step 1: Install everything ---
!pip install transformers datasets bitsandbytes accelerate evaluate

# --- Step 2: Import everything ---
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, BitsAndBytesConfig
from datasets import load_dataset
import numpy as np
import evaluate

# --- Step 3: Checkpoint name, tokenizer and evaluation subset ---
model_name = "textattack/distilbert-base-uncased-imdb"

print("Reloading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name)

print("Reloading dataset...")
imdb = load_dataset("imdb")
# Fixed-seed shuffle of the test split, then the first 1000 rows.
test_dataset = (
    imdb["test"]
    .shuffle(seed=42)
    .select(range(1000))
)

# --- Step 4: 4-bit NF4 weights with float16 compute ---
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# --- Step 5: Load the checkpoint again, quantized at load time ---
print("Reloading the INT4 model (model_int4)...")
model_int4 = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)

# Only model_int4, tokenizer and test_dataset are restored by this cell;
# the FP32 model is not reloaded here.
print("\n--- FIX COMPLETE! ---")
print("All variables (model_int4, tokenizer, test_dataset) are back in memory.")

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes, evaluate
Successfully installed bitsandbytes-0.48.2 evaluate-0.4.6
Reloading tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/485 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Reloading dataset...


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Reloading the INT4 model (model_int4)...


pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]


--- FIX COMPLETE! ---
All variables (model_int4, tokenizer, test_dataset) are back in memory.


Preprocess the Dataset

In [None]:
# Turns a batch of raw review text into tokenizer output (e.g. "input_ids").
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of a batch with the notebook-level ``tokenizer``.

    Args:
        examples: Mapping with a ``"text"`` entry holding the text(s) to encode.

    Returns:
        Whatever ``tokenizer(...)`` returns for those texts.
    """
    texts = examples["text"]
    encoding_options = {
        # pad every sequence to the same (maximum) length
        "padding": "max_length",
        # cut off sequences that are too long
        "truncation": True,
    }
    return tokenizer(texts, **encoding_options)

# Run tokenize_function over the whole test subset; batched=True hands it
# groups of examples at a time instead of one example per call.
print("Tokenizing the test dataset...")
tokenized_test_dataset = test_dataset.map(
    function=tokenize_function,
    batched=True,
)
print("Tokenizing complete!")

Tokenizing the test dataset...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing complete!


Set up for Manual Evaluation

In [None]:
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm  # This adds a nice progress bar
import evaluate      # To calculate accuracy

# Accuracy metric from the `evaluate` library; the evaluation loop feeds it
# one batch of predictions/references at a time.
accuracy_metric = evaluate.load("accuracy")

# Have the dataset return PyTorch tensors, restricted to the three columns
# the evaluation loop reads. Note this reconfigures the dataset in place.
tokenized_test_dataset.set_format(
    "torch",
    columns=[
        "input_ids",
        "attention_mask",
        "label",
    ],
)

# Serve the examples in batches of 16 (default order, no shuffling requested).
test_dataloader = DataLoader(dataset=tokenized_test_dataset, batch_size=16)

print("DataLoader is ready!")

Downloading builder script: 0.00B [00:00, ?B/s]

DataLoader is ready!


Run the Evaluation!

In [None]:
print("--- 📊 Starting manual evaluation for INT4 (bitsandbytes) model... ---")

# Evaluation mode switches off dropout, so predictions do not vary run to run.
model_int4.eval()

# The model was loaded with device_map="auto", so ask it where its weights
# live instead of assuming "cuda"; inputs must be on that same device.
eval_device = model_int4.device

# Walk through the test set one batch at a time.
for batch in tqdm(test_dataloader):

    # Only the model inputs need to be on the model's device; the labels are
    # only compared on the CPU, so they are never moved.
    input_ids = batch["input_ids"].to(eval_device)
    attention_mask = batch["attention_mask"].to(eval_device)

    # No gradients are needed for evaluation; no_grad saves memory and time.
    with torch.no_grad():
        outputs = model_int4(input_ids=input_ids, attention_mask=attention_mask)

    # Raw per-class scores for the batch.
    logits = outputs.logits

    # Predicted class = index of the highest score along the last dimension.
    predictions = torch.argmax(logits, dim=-1)

    # Accumulate this batch; the metric is computed once after the loop.
    accuracy_metric.add_batch(
        predictions=predictions.cpu(),
        references=batch["label"].cpu(),
    )

# Combine everything that was added above into the final score.
final_accuracy_results = accuracy_metric.compute()

print("\n--- Evaluation Complete! ---")
print(f"Final Accuracy: {final_accuracy_results['accuracy']}")

--- 📊 Starting manual evaluation for INT4 (bitsandbytes) model... ---


100%|██████████| 63/63 [00:03<00:00, 17.91it/s]


--- Evaluation Complete! ---
Final Accuracy: 0.924





In [None]:
import time
import numpy as np
import torch
from tqdm import tqdm  # For our progress bar

# --- 1. Get a Single Sample for Testing ---
# Relies on 'tokenized_test_dataset' from the preprocessing cell, already
# switched to torch tensors by the evaluation set-up cell. The sample was
# tokenized with padding="max_length", so the latency measured below is for a
# fully padded sequence.
try:
    sample = tokenized_test_dataset[0]
except NameError:
    print("Error: 'tokenized_test_dataset' not found.")
    print("Please re-run the 'Fix' block and the Day 2 'Preprocess' cell, then try this again.")
    # Re-raise so the cell stops here instead of failing later with a less clear error.
    raise

# The model was loaded with device_map="auto": follow the device it actually
# ended up on instead of assuming "cuda".
latency_device = model_int4.device
on_cuda = latency_device.type == "cuda"

# The model expects a batch, so add a leading batch dimension of size 1.
inputs = {
    "input_ids": sample["input_ids"].unsqueeze(0).to(latency_device),
    "attention_mask": sample["attention_mask"].unsqueeze(0).to(latency_device),
}
print("Test input sample is ready and on the GPU.")


# --- 2. Warm-up Runs ---
# Untimed passes so one-off start-up costs do not land in the measurements.
print("Running 2 warm-up predictions...")
model_int4.eval()  # evaluation mode (dropout off)
with torch.no_grad():
    for _ in range(2):
        _ = model_int4(**inputs)
print("Warm-up complete.")


# --- 3. The Latency Test Loop ---
num_predictions = 100
timings = []  # one duration in milliseconds per prediction

print(f"--- Running latency test ({num_predictions} predictions) ---")

for _ in tqdm(range(num_predictions)):

    # CUDA kernels run asynchronously: drain pending work before starting the
    # clock so earlier work is not billed to this prediction.
    if on_cuda:
        torch.cuda.synchronize(latency_device)

    start_time = time.perf_counter()

    with torch.no_grad():
        _ = model_int4(**inputs)

    # Wait for the forward pass to actually finish before stopping the clock.
    if on_cuda:
        torch.cuda.synchronize(latency_device)

    end_time = time.perf_counter()

    # Seconds -> milliseconds.
    duration_ms = (end_time - start_time) * 1000
    timings.append(duration_ms)

# --- 4. Calculate and Print Results ---
avg_latency = np.mean(timings)
# The mean is sensitive to occasional slow iterations; the median is reported
# alongside it as a more robust summary.
median_latency = np.median(timings)

print("\n--- Latency Test Complete ---")
print(f"Average Latency: {avg_latency:.2f} ms per prediction")
print(f"Median Latency: {median_latency:.2f} ms per prediction")

Test input sample is ready and on the GPU.
Running 2 warm-up predictions...
Warm-up complete.
--- Running latency test (100 predictions) ---


100%|██████████| 100/100 [00:01<00:00, 93.56it/s]


--- Latency Test Complete ---
Average Latency: 10.57 ms per prediction





### BitsAndBytes (BnB) Quantization Results

| Model Configuration | Accuracy | Avg. Latency (ms) |
| :--- | :---: | :---: |
| FP32 (Baseline) | [Get from Thalha] | [Get from Thalha] |
| INT8 (BnB) | [Get from Thalha] | [Get from Thalha] |
| **INT4 (BnB)** | **0.924** | **10.57** |