# Libraries

In [1]:
pip install transformers datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import default_data_collator
import time

# Load the Teacher Model and the Student Model

In [3]:
# Hugging Face Hub checkpoint identifiers for the teacher and the student
teacher_model_name = "textattack/bert-base-uncased-imdb"
student_model_name = "nreimers/MiniLM-L6-H384-uncased"

# Teacher: loaded once and switched to evaluation mode straight away, since it is
# never trained in this notebook (eval() returns the module itself, so it can be chained)
teacher_model = AutoModelForSequenceClassification.from_pretrained(teacher_model_name).eval()

# Student: the model that will be trained by distillation
student_model = AutoModelForSequenceClassification.from_pretrained(student_model_name)

# Tokenizer belonging to the student checkpoint.
# NOTE(review): the same token ids are later fed to BOTH teacher and student, which
# assumes the two checkpoints share a vocabulary — confirm.
tokenizer = AutoTokenizer.from_pretrained(student_model_name)

# IMDB dataset from the Hugging Face Hub
dataset = load_dataset('imdb')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/511 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nreimers/MiniLM-L6-H384-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

# Subsample and Shuffle the IMDB Test Dataset

In [4]:
# Reduce the test set size to 2000 samples for faster experimentation.
# The split is shuffled with a fixed seed (42) before the first 2000 rows are taken.
# NOTE: dataset["test"] is overwritten in place, so re-running this cell shuffles the
# already-reduced split instead of the full one.
dataset["test"] = dataset["test"].shuffle(seed=42).select(range(2000))

# Tokenize the input text using the pre-trained tokenizer.
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Uses the module-level ``tokenizer`` with a fixed length of 128 tokens:
    longer sequences are truncated and shorter ones are padded up to that length.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw text to tokenize.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    tokenizer_options = {
        "truncation": True,       # Truncate sequences longer than max_length
        "padding": "max_length",  # Pad shorter sequences to max_length
        "max_length": 128,        # Maximum token length
    }
    return tokenizer(examples["text"], **tokenizer_options)

# Tokenize dataset.
# map() is applied to every split held in `dataset` (the progress output below shows
# three passes: 25000, 2000 and 50000 examples); batched=True hands preprocess_function
# a batch of examples per call instead of a single example.
encoded_dataset = dataset.map(preprocess_function, batched=True)
# Specify that the dataset should be formatted as PyTorch tensors.
# Only the three listed columns are exposed in that format.
encoded_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

# Define DataLoaders for Training and Testing

In [5]:
# Training loader: batches of 8 pre-processed examples, reshuffled every epoch
train_dataloader = DataLoader(dataset=encoded_dataset["train"], batch_size=8, shuffle=True)

# Test loader: same batch size, no shuffling so the evaluation order stays fixed
test_dataloader = DataLoader(dataset=encoded_dataset["test"], batch_size=8)

# Define Knowledge Distillation Loss Function

In [6]:
# Define Knowledge Distillation Loss
class DistillationLoss(nn.Module):
    """
    Knowledge-distillation objective for training a small student from a large teacher.

    The returned loss is a weighted sum of two terms:
    1. Soft term: KL divergence between the temperature-softened teacher
       distribution and the temperature-softened student distribution.
    2. Hard term: ordinary cross-entropy between the student logits and the
       ground-truth labels.

    Args:
        temperature (float): Divisor applied to both sets of logits before the
                            softmax; larger values give softer distributions.
        alpha (float): Weight of the soft term; the hard term is weighted by
                      ``1 - alpha``.
                      - alpha near 1 → mostly distillation.
                      - alpha near 0 → mostly true-label supervision.
    """
    def __init__(self, temperature=2.0, alpha=0.5):
        super().__init__()
        self.temperature = temperature
        self.alpha = alpha
        # KLDivLoss takes log-probabilities as input and probabilities as target
        self.kl_loss = nn.KLDivLoss(reduction="batchmean")
        self.ce_loss = nn.CrossEntropyLoss()

    def forward(self, student_logits, teacher_logits, true_labels):
        """
        Compute the combined distillation loss.

        Args:
            student_logits (torch.Tensor): Logits produced by the student model.
            teacher_logits (torch.Tensor): Logits produced by the teacher model.
            true_labels (torch.Tensor): Ground truth labels for classification.

        Returns:
            torch.Tensor: ``alpha * soft_term + (1 - alpha) * hard_term``.
        """
        temp = self.temperature

        # Temperature-softened teacher probabilities and student log-probabilities
        teacher_probs = (teacher_logits / temp).softmax(dim=-1)
        student_log_probs = (student_logits / temp).log_softmax(dim=-1)

        # Soft term, multiplied by temperature squared
        distill_term = self.kl_loss(student_log_probs, teacher_probs) * (temp ** 2)

        # Hard term uses the raw (unscaled) student logits
        label_term = self.ce_loss(student_logits, true_labels)

        return self.alpha * distill_term + (1 - self.alpha) * label_term

# Define the Training Function for the Student Model

In [7]:
# Training function
def train_student(student_model, teacher_model, train_dataloader, epochs=3, lr=5e-5,
                  temperature=2.0, alpha=0.5):
    """
    Trains a student model using Knowledge Distillation.

    The student model learns from both:
    1. Soft labels provided by the teacher model (logits).
    2. Hard labels from the true dataset labels.

    Both models are moved to CUDA when it is available, otherwise to CPU.
    The student is left in training mode and the teacher in evaluation mode.

    Args:
        student_model (torch.nn.Module): The smaller, more efficient student model.
        teacher_model (torch.nn.Module): The larger, pre-trained teacher model.
        train_dataloader: Iterable of batches; each batch must provide
            "input_ids", "attention_mask" and "label" tensors.
        epochs (int): Number of training epochs (default: 3).
        lr (float): Learning rate for the optimizer (default: 5e-5).
        temperature (float): Temperature passed to DistillationLoss (default: 2.0).
        alpha (float): Soft/hard loss weighting passed to DistillationLoss (default: 0.5).

    Returns:
        None (trains the student model in-place and prints the mean loss per epoch).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    student_model.to(device)  # Move student model to the selected device
    teacher_model.to(device)  # Move teacher model to the selected device

    # Only the student's parameters are handed to the optimizer
    optimizer = optim.AdamW(student_model.parameters(), lr=lr)

    # Distillation objective; temperature and alpha are caller-configurable
    loss_fn = DistillationLoss(temperature=temperature, alpha=alpha)

    student_model.train()   # Enable training mode for the student model
    teacher_model.eval()    # Teacher stays in evaluation mode (it is never updated)

    for epoch in range(epochs):
        total_loss = 0.0    # Sum of per-batch losses in this epoch
        num_batches = 0     # Counted explicitly so the average never divides by zero
        for batch in train_dataloader:
            # Move batch data (inputs & labels) to the selected device
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            # Teacher forward pass without gradient tracking
            with torch.no_grad():
                teacher_outputs = teacher_model(input_ids, attention_mask=attention_mask).logits

            # Forward pass through the student model
            student_outputs = student_model(input_ids, attention_mask=attention_mask).logits

            # Compute the knowledge distillation loss
            loss = loss_fn(student_outputs, teacher_outputs, labels)

            optimizer.zero_grad()  # Reset gradients to avoid accumulation
            loss.backward()        # Compute gradients for student model parameters
            optimizer.step()       # Update student model weights

            total_loss += loss.item()
            num_batches += 1

        # Average loss per batch; an epoch with no batches reports nan instead of crashing
        avg_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch+1}: Loss = {avg_loss:.4f}")

# Function to Measure Model Performance

    Measures key performance metrics of a given model, including:
    - Model storage size in megabytes (MB).
    - Inference speed (average latency per sample in seconds).
    - Model accuracy on a given dataset.

    Args:
        model (torch.nn.Module): The neural network model to evaluate.
        dataset (torch.utils.data.Dataset): The dataset for evaluation.

    Returns:
        dict: A dictionary containing:
            - "model_size_mb" (float): Model size in MB.
            - "avg_latency_seconds" (float): Average inference time per sample.
            - "accuracy" (float): Model classification accuracy.

In [8]:
# Measure model storage size, inference speed, and accuracy
def measure_performance(model, dataset, batch_size=8, device=None):
    """
    Measure storage size, inference latency and accuracy of a model.

    Args:
        model (torch.nn.Module): The model to evaluate. It is moved to ``device``
            and put in evaluation mode.
        dataset (torch.utils.data.Dataset): Evaluation data; after collation each
            batch must provide "input_ids", "attention_mask" and "labels".
        batch_size (int): Number of samples per evaluation batch (default: 8).
        device (torch.device | str | None): Device to evaluate on. When None,
            CUDA is used if available, otherwise CPU.

    Returns:
        dict: A dictionary containing:
            - "model_size_mb" (float): Size of parameters plus buffers in MB.
            - "avg_latency_seconds" (float): Forward-pass time per sample.
            - "accuracy" (float): Fraction of correctly classified samples.
          For an empty dataset, latency and accuracy are reported as nan.
    """
    # Resolve the device locally rather than relying on a notebook-level global
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    else:
        device = torch.device(device)

    model.to(device)
    model.eval()

    # Compute model storage size (MB): bytes of all parameters and buffers
    param_size = sum(p.numel() * p.element_size() for p in model.parameters())
    buffer_size = sum(b.numel() * b.element_size() for b in model.buffers())
    model_size_mb = (param_size + buffer_size) / (1024 ** 2)

    # The collator exposes the dataset's label column under the key "labels"
    dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=default_data_collator)

    # CUDA kernels are launched asynchronously, so the GPU must be synchronized
    # around the timed region for the wall-clock measurement to cover the real work
    sync_cuda = device.type == "cuda"

    total_time, correct, total = 0.0, 0, 0

    with torch.no_grad():
        for batch in dataloader:
            inputs = {
                "input_ids": batch["input_ids"].to(device),
                "attention_mask": batch["attention_mask"].to(device),
            }
            labels = batch["labels"].to(device)

            # Time only the forward pass (host-to-device copies are excluded)
            if sync_cuda:
                torch.cuda.synchronize(device)
            start_time = time.perf_counter()
            outputs = model(**inputs)
            if sync_cuda:
                torch.cuda.synchronize(device)
            total_time += time.perf_counter() - start_time

            # Obtain predicted class labels
            predictions = outputs.logits.argmax(dim=-1)

            # Count correct predictions
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    if total:
        avg_latency = total_time / total   # Average inference time per sample
        accuracy = correct / total         # Fraction of correct predictions
    else:
        # Nothing was evaluated; avoid a ZeroDivisionError
        avg_latency = float("nan")
        accuracy = float("nan")

    # Return performance metrics as a dictionary
    return {"model_size_mb": model_size_mb, "avg_latency_seconds": avg_latency, "accuracy": accuracy}

# Function to Compare the Performance of Different Models

    Compares the original model with a set of optimized models by evaluating:
    - Model size reduction (percentage decrease in storage size).
    - Inference speed improvement (speedup factor).
    - Accuracy retention (accuracy ratio compared to the original model).

    Args:
        original_model (torch.nn.Module): The baseline (original) model.
        optimized_models (dict): A dictionary of optimized models, where:
            - Key (str): The name of the optimized model (e.g., "quantized", "pruned").
            - Value (torch.nn.Module): The optimized model instance.
        dataset (torch.utils.data.Dataset): The dataset for model evaluation.

    Returns:
        dict: A dictionary containing:
            - "original": Performance metrics of the original model.
            - Each optimized model's name as a key with its corresponding metrics:
                - Model size in MB
                - Average inference latency per sample
                - Accuracy
                - Size reduction percentage
                - Speedup factor
                - Accuracy retention ratio

In [9]:
def compare_models(original_model, optimized_models, dataset):
    """
    Evaluate a baseline model and a set of optimized models on the same dataset.

    Args:
        original_model (torch.nn.Module): The baseline (original) model.
        optimized_models (dict): Maps a display name (str) to an optimized model.
        dataset (torch.utils.data.Dataset): The dataset for model evaluation.

    Returns:
        dict: ``"original"`` maps to the baseline metrics returned by
        measure_performance. Each optimized model's name maps to its own metrics plus:
            - "size_reduction": 1 - (optimized size / baseline size)
            - "speedup": baseline latency / optimized latency
            - "accuracy_retention": optimized accuracy / baseline accuracy
    """
    print("Evaluating original model...")
    baseline = measure_performance(original_model, dataset)

    # The baseline entry carries no relative metrics
    results = {"original": baseline}

    for name, model in optimized_models.items():
        print(f"Evaluating {name}...")
        candidate = measure_performance(model, dataset)

        # Absolute metrics first, then the ratios relative to the baseline
        results[name] = {
            **candidate,
            "size_reduction": 1 - (candidate["model_size_mb"] / baseline["model_size_mb"]),
            "speedup": baseline["avg_latency_seconds"] / candidate["avg_latency_seconds"],
            "accuracy_retention": candidate["accuracy"] / baseline["accuracy"],
        }

    return results

# Train the student model

In [10]:
# Train the student model using Knowledge Distillation.
# Uses the function defaults (3 epochs, learning rate 5e-5); the student is updated
# in place and nothing is returned — only the per-epoch average loss is printed.
train_student(student_model, teacher_model, train_dataloader)

Epoch 1: Loss = 0.8665
Epoch 2: Loss = 0.5597
Epoch 3: Loss = 0.3390


# Model Performance Evaluation

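**The comparison below is run 10 times so that the latency measurements, which vary from run to run, can be averaged.**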

## 1.

In [11]:
# Check if a CUDA-compatible GPU is available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the original model for comparison.
# This is a fresh copy loaded from the same checkpoint name as the teacher
# (`teacher_model_name`), separate from the `teacher_model` object used in training.
original_model = AutoModelForSequenceClassification.from_pretrained(teacher_model_name).to(device)

# Ensure optimized_model is a dictionary ({display name: model}), as compare_models
# iterates over its items
optimized_model = {"student_model": student_model}

# Run model comparison (run 1 of 10) on the reduced test split
results1 = compare_models(original_model, optimized_model, encoded_dataset["test"])

# Print results: one section per model, each metric formatted to 4 decimal places
for model_name, metrics in results1.items():
    print(f"\nModel: {model_name}")
    for metric_name, value in metrics.items():
        print(f"  {metric_name}: {value:.4f}")

Using device: cuda
Evaluating original model...
Evaluating student_model...

Model: original
  model_size_mb: 417.6553
  avg_latency_seconds: 0.0010
  accuracy: 0.8920

Model: student_model
  model_size_mb: 86.6548
  avg_latency_seconds: 0.0005
  accuracy: 0.8535
  size_reduction: 0.7925
  speedup: 2.0163
  accuracy_retention: 0.9568


## 2.

In [12]:
# Run model comparison (run 2 of 10): same models and test split as run 1,
# kept in its own variable so the runs can be averaged later
results2 = compare_models(original_model, optimized_model, encoded_dataset["test"])

# Print results: one section per model, each metric formatted to 4 decimal places
for model_name, metrics in results2.items():
    print(f"\nModel: {model_name}")
    for metric_name, value in metrics.items():
        print(f"  {metric_name}: {value:.4f}")

Evaluating original model...
Evaluating student_model...

Model: original
  model_size_mb: 417.6553
  avg_latency_seconds: 0.0011
  accuracy: 0.8920

Model: student_model
  model_size_mb: 86.6548
  avg_latency_seconds: 0.0005
  accuracy: 0.8535
  size_reduction: 0.7925
  speedup: 2.0580
  accuracy_retention: 0.9568


## 3.

In [13]:
# Run model comparison (run 3 of 10): same models and test split as run 1,
# kept in its own variable so the runs can be averaged later
results3 = compare_models(original_model, optimized_model, encoded_dataset["test"])

# Print results: one section per model, each metric formatted to 4 decimal places
for model_name, metrics in results3.items():
    print(f"\nModel: {model_name}")
    for metric_name, value in metrics.items():
        print(f"  {metric_name}: {value:.4f}")

Evaluating original model...
Evaluating student_model...

Model: original
  model_size_mb: 417.6553
  avg_latency_seconds: 0.0010
  accuracy: 0.8920

Model: student_model
  model_size_mb: 86.6548
  avg_latency_seconds: 0.0005
  accuracy: 0.8535
  size_reduction: 0.7925
  speedup: 2.0358
  accuracy_retention: 0.9568


## 4.

In [14]:
# Run model comparison (run 4 of 10): same models and test split as run 1,
# kept in its own variable so the runs can be averaged later
results4 = compare_models(original_model, optimized_model, encoded_dataset["test"])

# Print results: one section per model, each metric formatted to 4 decimal places
for model_name, metrics in results4.items():
    print(f"\nModel: {model_name}")
    for metric_name, value in metrics.items():
        print(f"  {metric_name}: {value:.4f}")

Evaluating original model...
Evaluating student_model...

Model: original
  model_size_mb: 417.6553
  avg_latency_seconds: 0.0011
  accuracy: 0.8920

Model: student_model
  model_size_mb: 86.6548
  avg_latency_seconds: 0.0006
  accuracy: 0.8535
  size_reduction: 0.7925
  speedup: 1.7501
  accuracy_retention: 0.9568


## 5.

In [15]:
# Run model comparison (run 5 of 10): same models and test split as run 1,
# kept in its own variable so the runs can be averaged later
results5 = compare_models(original_model, optimized_model, encoded_dataset["test"])

# Print results: one section per model, each metric formatted to 4 decimal places
for model_name, metrics in results5.items():
    print(f"\nModel: {model_name}")
    for metric_name, value in metrics.items():
        print(f"  {metric_name}: {value:.4f}")

Evaluating original model...
Evaluating student_model...

Model: original
  model_size_mb: 417.6553
  avg_latency_seconds: 0.0017
  accuracy: 0.8920

Model: student_model
  model_size_mb: 86.6548
  avg_latency_seconds: 0.0011
  accuracy: 0.8535
  size_reduction: 0.7925
  speedup: 1.6277
  accuracy_retention: 0.9568


## 6.

In [16]:
# Run model comparison (run 6 of 10): same models and test split as run 1,
# kept in its own variable so the runs can be averaged later
results6 = compare_models(original_model, optimized_model, encoded_dataset["test"])

# Print results: one section per model, each metric formatted to 4 decimal places
for model_name, metrics in results6.items():
    print(f"\nModel: {model_name}")
    for metric_name, value in metrics.items():
        print(f"  {metric_name}: {value:.4f}")

Evaluating original model...
Evaluating student_model...

Model: original
  model_size_mb: 417.6553
  avg_latency_seconds: 0.0015
  accuracy: 0.8920

Model: student_model
  model_size_mb: 86.6548
  avg_latency_seconds: 0.0005
  accuracy: 0.8535
  size_reduction: 0.7925
  speedup: 2.7577
  accuracy_retention: 0.9568


## 7.

In [17]:
# Run model comparison (run 7 of 10): same models and test split as run 1,
# kept in its own variable so the runs can be averaged later
results7 = compare_models(original_model, optimized_model, encoded_dataset["test"])

# Print results: one section per model, each metric formatted to 4 decimal places
for model_name, metrics in results7.items():
    print(f"\nModel: {model_name}")
    for metric_name, value in metrics.items():
        print(f"  {metric_name}: {value:.4f}")

Evaluating original model...
Evaluating student_model...

Model: original
  model_size_mb: 417.6553
  avg_latency_seconds: 0.0010
  accuracy: 0.8920

Model: student_model
  model_size_mb: 86.6548
  avg_latency_seconds: 0.0006
  accuracy: 0.8535
  size_reduction: 0.7925
  speedup: 1.5949
  accuracy_retention: 0.9568


## 8.

In [18]:
# Run model comparison (run 8 of 10): same models and test split as run 1,
# kept in its own variable so the runs can be averaged later
results8 = compare_models(original_model, optimized_model, encoded_dataset["test"])

# Print results: one section per model, each metric formatted to 4 decimal places
for model_name, metrics in results8.items():
    print(f"\nModel: {model_name}")
    for metric_name, value in metrics.items():
        print(f"  {metric_name}: {value:.4f}")

Evaluating original model...
Evaluating student_model...

Model: original
  model_size_mb: 417.6553
  avg_latency_seconds: 0.0010
  accuracy: 0.8920

Model: student_model
  model_size_mb: 86.6548
  avg_latency_seconds: 0.0005
  accuracy: 0.8535
  size_reduction: 0.7925
  speedup: 2.0015
  accuracy_retention: 0.9568


## 9.

In [19]:
# Run model comparison (run 9 of 10): same models and test split as run 1,
# kept in its own variable so the runs can be averaged later
results9 = compare_models(original_model, optimized_model, encoded_dataset["test"])

# Print results: one section per model, each metric formatted to 4 decimal places
for model_name, metrics in results9.items():
    print(f"\nModel: {model_name}")
    for metric_name, value in metrics.items():
        print(f"  {metric_name}: {value:.4f}")

Evaluating original model...
Evaluating student_model...

Model: original
  model_size_mb: 417.6553
  avg_latency_seconds: 0.0015
  accuracy: 0.8920

Model: student_model
  model_size_mb: 86.6548
  avg_latency_seconds: 0.0005
  accuracy: 0.8535
  size_reduction: 0.7925
  speedup: 2.8201
  accuracy_retention: 0.9568


## 10.

In [20]:
# Run model comparison (run 10 of 10): same models and test split as run 1,
# kept in its own variable so the runs can be averaged later
results10 = compare_models(original_model, optimized_model, encoded_dataset["test"])

# Print results: one section per model, each metric formatted to 4 decimal places
for model_name, metrics in results10.items():
    print(f"\nModel: {model_name}")
    for metric_name, value in metrics.items():
        print(f"  {metric_name}: {value:.4f}")

Evaluating original model...
Evaluating student_model...

Model: original
  model_size_mb: 417.6553
  avg_latency_seconds: 0.0011
  accuracy: 0.8920

Model: student_model
  model_size_mb: 86.6548
  avg_latency_seconds: 0.0005
  accuracy: 0.8535
  size_reduction: 0.7925
  speedup: 2.1219
  accuracy_retention: 0.9568


## Get Average

In [25]:
import pandas as pd
from IPython.display import display

# The ten comparison runs collected above (results1 ... results10 must already exist)
results_list = [
    results1, results2, results3, results4, results5,
    results6, results7, results8, results9, results10,
]

# Flatten into one record per (run, model): the model name plus all of its metrics
data = []
for result in results_list:
    for model_name, metrics in result.items():
        row = {'Model': model_name, **metrics}
        data.append(row)

# One row per record; metrics missing for a model (the baseline has no relative
# metrics) become NaN in the frame
df = pd.DataFrame(data)

# Average every numeric metric across the runs, separately for each model
mean_results = (
    df.groupby('Model')
    .mean(numeric_only=True)
    .reset_index()
)

# Display the results
display(mean_results)

Unnamed: 0,Model,model_size_mb,avg_latency_seconds,accuracy,size_reduction,speedup,accuracy_retention
0,original,417.655281,0.0012,0.892,,,
1,student_model,86.654793,0.000593,0.8535,0.792521,2.078401,0.956839
