In [None]:
!pip install transformers torch datasets onnx optimum


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Checkpoint shared by the tokenizer and the sequence-classification model.
model_name = "distilbert-base-uncased"

# Tokenizer first, then the classification model, both from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)


In [6]:
import torch
import torch.nn.utils.prune as prune

# Gather every nn.Linear submodule first, then prune each one.
linear_layers = [
    submodule
    for _, submodule in model.named_modules()
    if isinstance(submodule, torch.nn.Linear)
]

# L1 unstructured pruning with amount=0.5 on each layer's "weight" parameter.
for layer in linear_layers:
    prune.l1_unstructured(layer, name="weight", amount=0.5)


In [None]:
# Quantization starts from a freshly loaded checkpoint so that the pruning
# applied to the earlier `model` object does not interfere.
from torch.quantization import quantize_dynamic
from transformers import AutoModelForSequenceClassification

model_name = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Dynamic quantization restricted to nn.Linear modules, with dtype torch.qint8.
quantized_model = quantize_dynamic(
    model,
    qconfig_spec={torch.nn.Linear},
    dtype=torch.qint8,
)



In [None]:
!pip install datasets


In [None]:
import os
import torch
import torch.nn.functional as F
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset

# Switch off Weights & Biases logging.
os.environ["WANDB_DISABLED"] = "true"

# Use the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Teacher and student are loaded from the same checkpoint and moved to `device`.
teacher_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
teacher_model = teacher_model.to(device)
student_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
student_model = student_model.to(device)

# Tokenizer for the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Trainer configuration; a single epoch keeps test runs short.
training_args = TrainingArguments(
    output_dir="./distilled_model",
    logging_dir="./logs",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    evaluation_strategy="epoch",
    save_total_limit=2,
)

# GLUE / SST-2 dataset.
dataset = load_dataset("glue", "sst2")

def preprocess_function(examples):
    """Tokenize the "sentence" field of a batch, truncated and padded to 128 tokens."""
    sentences = examples["sentence"]
    return tokenizer(
        sentences,
        max_length=128,
        padding="max_length",
        truncation=True,
    )


# Run the tokenizer over the dataset in batches.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# ✅ Custom Trainer for Distillation
class DistillationTrainer(Trainer):
    """Trainer that fits the student to the teacher's softened predictions.

    The loss is the KL divergence between the temperature-softened softmax of
    the module-level `teacher_model` and that of the model being trained.
    Ground-truth labels present in the batch are not used by the loss.
    """

    # Temperature dividing both teacher and student logits before the softmax.
    # Override on a subclass or an instance to distil at another temperature.
    temperature = 2.0

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the distillation loss for one batch.

        Args:
        - model: The student model being trained.
        - inputs: Mapping of batch tensors; a "labels" entry, if present, is
          left out of both forward passes and otherwise ignored.
        - return_outputs: When True, the student's outputs are returned too.
        - **kwargs: Accepted for signature compatibility and ignored.

        Returns:
        - The loss tensor, or `(loss, student_outputs)` when `return_outputs`
          is True.
        """
        temperature = self.temperature

        # Leave the caller's dict untouched; labels are excluded so neither
        # model computes its own supervised loss.
        features = {k: v for k, v in inputs.items() if k != "labels"}

        # Take each device from the model itself rather than from a notebook
        # global that later cells may reassign.
        student_device = next(model.parameters()).device
        teacher_device = next(teacher_model.parameters()).device

        # ✅ Get teacher's predictions (soft labels); no gradients are needed.
        with torch.no_grad():
            teacher_inputs = {k: v.to(teacher_device) for k, v in features.items()}
            teacher_outputs = teacher_model(**teacher_inputs)
            soft_labels = F.softmax(teacher_outputs.logits / temperature, dim=-1)
            soft_labels = soft_labels.to(student_device)

        # ✅ Student model's predictions
        student_inputs = {k: v.to(student_device) for k, v in features.items()}
        student_outputs = model(**student_inputs)
        student_logits = student_outputs.logits

        # ✅ Compute Knowledge Distillation loss
        # KLDivLoss expects log-probabilities as input and probabilities as target.
        # NOTE(review): the conventional temperature**2 factor is not applied;
        # consider adding it if a hard-label loss term is ever mixed in.
        loss_fn = torch.nn.KLDivLoss(reduction="batchmean")
        loss = loss_fn(F.log_softmax(student_logits / temperature, dim=-1), soft_labels)

        return (loss, student_outputs) if return_outputs else loss

# Build the distillation trainer around the student model and the tokenized
# "train" / "validation" splits.
trainer = DistillationTrainer(
    args=training_args,
    model=student_model,
    eval_dataset=tokenized_datasets["validation"],
    train_dataset=tokenized_datasets["train"],
)

# Run training for the student model.
trainer.train()

In [None]:
# Write the student model first, then the tokenizer, into the same directory.
for artifact in (student_model, tokenizer):
    artifact.save_pretrained("./distilled_model")
print("✅ Model saved successfully!")


In [None]:
!nvidia-smi


In [None]:
import torch
import torch.onnx
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# ✅ Load the trained student model
student_model = AutoModelForSequenceClassification.from_pretrained("./distilled_model")
student_model.eval()  # Put model in inference mode

# ✅ Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# ✅ Create a dummy input using the tokenizer
dummy_text = "This movie was absolutely fantastic!"  # Example input text
dummy_inputs = tokenizer(dummy_text, return_tensors="pt")  # Convert to tokenized format

# ✅ Export to ONNX
# Axis 0 (batch) and axis 1 (tokens) of both inputs are declared dynamic.
# Without the token axis the exported graph is fixed to the dummy sentence's
# length and rejects inputs with any other number of tokens.
torch.onnx.export(
    student_model,
    (dummy_inputs["input_ids"], dummy_inputs["attention_mask"]),  # Model expects tokenized input
    "optimized_model.onnx",
    input_names=["input_ids", "attention_mask"],
    output_names=["logits"],
    dynamic_axes={
        "input_ids": {0: "batch_size", 1: "sequence_length"},
        "attention_mask": {0: "batch_size", 1: "sequence_length"},
        "logits": {0: "batch_size"},
    },
)

print("✅ Model successfully converted to ONNX!")



In [None]:
!pip install onnxruntime-gpu


In [None]:
import torch
import torch.nn.utils.prune as prune

import copy

# ✅ Make an independent copy of the student model to apply pruning.
# A plain assignment would only alias `student_model`, so pruning would also
# modify the model that is later benchmarked as the unpruned baseline.
pruned_model = copy.deepcopy(student_model)

# ✅ Apply L1 unstructured pruning to all Linear layers
for name, module in pruned_model.named_modules():
    if isinstance(module, torch.nn.Linear):
        prune.l1_unstructured(module, name="weight", amount=0.5)  # Prunes 50% of weights

print("✅ Pruning applied successfully!")


In [None]:
import time
import torch
import onnxruntime as ort
import numpy as np

# ✅ Ensure correct device for non-quantized models
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cpu_device = torch.device("cpu")  # ✅ Force CPU for quantized model

print(f"Using device: {device}")

def measure_inference_time(model, input_tensor, model_type="pytorch", use_cpu=False,
                           warmup_runs=3, timed_runs=10):
    """
    Measures the average inference time for a given model (PyTorch or ONNX).

    Only the forward passes are timed: moving the model and inputs to their
    device and converting tensors to NumPy happen before the clock starts.

    Args:
    - model: The model to run inference on (PyTorch model or ONNX session).
    - input_tensor: Mapping of input names to PyTorch tensors (tokenized text).
    - model_type: "pytorch" for PyTorch models, "onnx" for ONNX models.
    - use_cpu: Whether to force CPU inference (for quantized models).
    - warmup_runs: Untimed forward passes executed before measuring.
    - timed_runs: Number of timed forward passes to average over.

    Returns:
    - Mean inference time per forward pass, in seconds.

    Raises:
    - ValueError: If `model_type` is not "pytorch" or "onnx", if
      `warmup_runs` is negative, or if `timed_runs` is smaller than 1.
    """
    if model_type not in ("pytorch", "onnx"):
        raise ValueError(f"Unsupported model_type: {model_type!r} (expected 'pytorch' or 'onnx')")
    if warmup_runs < 0:
        raise ValueError("warmup_runs must be >= 0")
    if timed_runs < 1:
        raise ValueError("timed_runs must be >= 1")

    target_device = cpu_device if use_cpu else device

    # ✅ Prepare model and inputs outside the timed region
    if model_type == "pytorch":
        model.to(target_device)
        model_inputs = {k: v.to(target_device) for k, v in input_tensor.items()}

        def run_once():
            with torch.no_grad():  # Disable gradient computation for faster inference
                model(**model_inputs)
    else:
        # ONNX Runtime is fed NumPy arrays, so convert once up front.
        onnx_inputs = {k: v.cpu().numpy() for k, v in input_tensor.items()}

        def run_once():
            model.run(None, onnx_inputs)

    # CUDA work is queued asynchronously; wait for it to finish before reading
    # the clock, otherwise only the launch overhead would be measured.
    needs_sync = model_type == "pytorch" and target_device.type == "cuda"

    # ✅ Warm-up passes (not timed)
    for _ in range(warmup_runs):
        run_once()
    if needs_sync:
        torch.cuda.synchronize()

    # ✅ Timed passes
    start = time.perf_counter()
    for _ in range(timed_runs):
        run_once()
    if needs_sync:
        torch.cuda.synchronize()
    elapsed = time.perf_counter() - start

    return elapsed / timed_runs  # ✅ Mean time per forward pass


# ✅ Load ONNX model for inference
# GPU builds of ONNX Runtime expect the execution providers to be named
# explicitly; request CUDA first when this build offers it, then CPU.
available_providers = ort.get_available_providers()
onnx_providers = [
    provider
    for provider in ("CUDAExecutionProvider", "CPUExecutionProvider")
    if provider in available_providers
]
session = ort.InferenceSession("optimized_model.onnx", providers=onnx_providers)

# ✅ Create a dummy input for testing inference time
dummy_text = "This movie was absolutely fantastic!"  # Example sentence
dummy_inputs = tokenizer(dummy_text, return_tensors="pt")  # Tokenize text

# ✅ Move dummy input to correct device
dummy_inputs = {k: v.to(device) for k, v in dummy_inputs.items()}

# ✅ Ensure all PyTorch models are on the same device (except quantized)
student_model.to(device)  # ✅ On GPU
pruned_model.to(device)   # ✅ On GPU
quantized_model.to(cpu_device)  # ✅ Force CPU for quantized model

# ✅ Measure and print inference times for each model
print("Original Model Inference Time:", measure_inference_time(student_model, dummy_inputs, "pytorch"))
print("Quantized Model Inference Time (CPU):", measure_inference_time(quantized_model, dummy_inputs, "pytorch", use_cpu=True))
print("Pruned Model Inference Time:", measure_inference_time(pruned_model, dummy_inputs, "pytorch"))
print("ONNX Model Inference Time:", measure_inference_time(session, dummy_inputs, "onnx"))

