In [None]:
# Install necessary libraries from Hugging Face and PyTorch
!pip install transformers datasets accelerate bitsandbytes torch



In [None]:
# Log in to your Hugging Face account
# notebook_login() renders an interactive login widget in the cell output
# (the VBox shown below this cell); the login is done by hand in that widget.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig
from datasets import load_dataset

In [None]:
# --- 1. Load Tokenizer and Dataset ---
student_model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(student_model_name)

# Load the IMDB dataset
imdb = load_dataset("imdb")


def preprocess_function(examples):
    """Tokenize a batch of reviews with the student's tokenizer.

    Args:
        examples: A batch handed over by `Dataset.map(..., batched=True)`;
            only its "text" column is read.

    Returns:
        The tokenizer output for the batch. Truncation is enabled; no padding
        is requested here.
    """
    return tokenizer(examples["text"], truncation=True)


# Draw the small seeded subsets FIRST and tokenize only those rows.
# Tokenization is applied per example, so the resulting rows are the same as
# tokenizing every split of `imdb` and selecting afterwards -- but only the
# 11,000 rows that are actually used get tokenized.
train_dataset = (
    imdb["train"]
    .shuffle(seed=42)
    .select(range(10000))
    .map(preprocess_function, batched=True)
)
eval_dataset = (
    imdb["test"]
    .shuffle(seed=42)
    .select(range(1000))
    .map(preprocess_function, batched=True)
)

print("Dataset loaded and tokenized.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Dataset loaded and tokenized.


In [None]:
# --- 2. Load Student Model ---
# Sequence-classification model built from the student checkpoint, with two
# output labels (positive or negative sentiment).
student_model = AutoModelForSequenceClassification.from_pretrained(student_model_name, num_labels=2)
print("Student model loaded.")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Student model loaded.


In [None]:
# --- 3. Load Teacher Model ---
teacher_model_name = "google/gemma-3-4b-it"

# Compute dtype for the de-quantized matmuls and for the non-quantized weights.
# float16 has a much narrower range than bfloat16; running this teacher in
# float16 is the likely source of the NaN losses recorded further down, so use
# bfloat16 when the GPU supports it and fall back to float32 otherwise
# (slower, but numerically safe).
teacher_compute_dtype = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    else torch.float32
)

# Load the large teacher model in 4-bit so it fits in Colab's memory.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=teacher_compute_dtype,
)

# NOTE(review): the load log reports `score.weight` as newly initialized, i.e.
# the teacher's classification head is untrained. Until the teacher is
# fine-tuned on the sentiment task (or replaced by a checkpoint that already
# has a trained classification head) its logits are not meaningful soft targets.
teacher_model = AutoModelForSequenceClassification.from_pretrained(
    teacher_model_name,
    num_labels=2,
    quantization_config=quantization_config,
    torch_dtype=teacher_compute_dtype,  # keep non-quantized layers in the same dtype
    device_map="auto",  # Automatically map model layers to available devices (GPU)
)
print("Teacher model loaded in 4-bit precision.")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of Gemma3ForSequenceClassification were not initialized from the model checkpoint at google/gemma-3-4b-it and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Teacher model loaded in 4-bit precision.


In [None]:
from transformers import Trainer, TrainingArguments
import torch.nn.functional as F
import torch

# This is the core of your project's logic, where you customize the training process.
class DistillationTrainer(Trainer):
    """Trainer whose loss blends the student's own loss with a teacher-matching term.

    The loss returned by `compute_loss` is

        alpha * student_loss + (1 - alpha) * T**2 * KL(teacher_soft || student_soft)

    where both distributions are softmaxes of the logits divided by the
    temperature T.

    Args:
        *args, **kwargs: Forwarded unchanged to `Trainer`.
        teacher_model: Model whose logits provide the soft targets. It is only
            called under `torch.no_grad()`. When `None`, the trainer falls back
            to the student's own loss.
        alpha: Weight of the student's own loss, in [0, 1]; the distillation
            term gets `1 - alpha`.
        temperature: Positive softening temperature applied to both sets of
            logits.

    Raises:
        ValueError: If `alpha` is outside [0, 1] or `temperature` is not positive.

    NOTE(review): the teacher receives the very same `input_ids` as the
    student. That is only meaningful when both models share a tokenizer and
    vocabulary; if they do not, the teacher must be fed text encoded with its
    own tokenizer -- TODO confirm for the models used with this class.
    """

    def __init__(self, *args, teacher_model=None, alpha=0.5, temperature=2.0, **kwargs):
        if not 0.0 <= alpha <= 1.0:
            raise ValueError(f"alpha must be in [0, 1], got {alpha}")
        if temperature <= 0:
            raise ValueError(f"temperature must be positive, got {temperature}")

        super().__init__(*args, **kwargs)
        self.teacher = teacher_model
        self.alpha = alpha
        self.temperature = temperature

        if self.teacher is not None:
            # The teacher is inference-only: make sure dropout etc. are disabled.
            self.teacher.eval()
            # Quantized models and models dispatched with a device_map are
            # already placed on their devices; calling .to() on them can raise.
            already_placed = (
                getattr(self.teacher, "is_quantized", False)
                or getattr(self.teacher, "hf_device_map", None) is not None
            )
            if not already_placed:
                # Move teacher to the same device as the student model
                self.teacher.to(self.model.device)

    # **kwargs absorbs any extra arguments the Trainer passes to compute_loss.
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the combined hard-label + distillation loss for one batch.

        Args:
            model: The student model being trained.
            inputs: Batch dict passed to the student as keyword arguments.
            return_outputs: When true, also return the student's outputs.
            **kwargs: Ignored; accepted for compatibility with the caller.

        Returns:
            `loss`, or `(loss, student_outputs)` when `return_outputs` is true.

        Raises:
            FloatingPointError: If the teacher produces NaN/inf logits.
        """
        # --- Standard Student Loss ---
        outputs_student = model(**inputs)
        student_loss = outputs_student.loss

        # Without a teacher there is nothing to distil from.
        if self.teacher is None:
            return (student_loss, outputs_student) if return_outputs else student_loss

        # --- Distillation Loss ---
        # The teacher only needs the model inputs; dropping the labels avoids
        # computing a teacher loss that is never used.
        teacher_inputs = {
            key: value for key, value in inputs.items() if key not in ("labels", "label")
        }
        with torch.no_grad():
            outputs_teacher = self.teacher(**teacher_inputs)

        # Do the softmax / KL arithmetic in float32 regardless of the dtypes
        # the two models run in.
        student_logits = outputs_student.logits.float()
        teacher_logits = outputs_teacher.logits.float().to(student_logits.device)

        # Fail loudly instead of silently training on NaN soft targets.
        if not torch.isfinite(teacher_logits).all():
            raise FloatingPointError(
                "Teacher produced non-finite logits; check the teacher's dtype "
                "and the inputs it receives."
            )

        temperature = self.temperature
        distillation_loss = F.kl_div(
            input=F.log_softmax(student_logits / temperature, dim=-1),
            target=F.softmax(teacher_logits / temperature, dim=-1),
            reduction="batchmean",
        ) * (temperature ** 2)

        # Weighted sum of both terms; with no hard-label loss available
        # (no labels in the batch) only the distillation term remains.
        if student_loss is None:
            loss = distillation_loss
        else:
            loss = self.alpha * student_loss + (1.0 - self.alpha) * distillation_loss
        return (loss, outputs_student) if return_outputs else loss

# --- Quick proof-of-concept run ---
# Small batches keep memory use low; mixed precision speeds things up.
# `max_steps=200` bounds the run: the recorded log for this cell ends at
# step 200, well before a full epoch over the 10,000 training rows.
training_args = TrainingArguments(
    output_dir="distilled_model_checkpoint",
    max_steps=200,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    fp16=True,
    logging_steps=50,
    save_strategy="epoch",
)

# --- Build the trainer from the objects created in the earlier cells ---
trainer = DistillationTrainer(
    model=student_model,
    teacher_model=teacher_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

print("Starting distillation training...")
trainer.train()
print("Proof-of-concept training complete.")

  super().__init__(*args, **kwargs)


Starting distillation training...


[34m[1mwandb[0m: Currently logged in as: [33mkarthikeyanrv[0m ([33mgovernment-college-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
50,0.0
100,0.0
150,0.0


Step,Training Loss
50,0.0
100,0.0
150,0.0
200,0.0


Proof-of-concept training complete.


In [None]:
!pip install evaluate



In [None]:
import numpy as np
import evaluate  # The new Hugging Face library for metrics

# Accuracy metric, loaded once and reused by every call to compute_metrics.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass for the Trainer.

    Args:
        eval_pred: A pair that unpacks into (logits, labels).

    Returns:
        The result of `metric.compute` for the arg-max class of each row of
        `logits`, compared against `labels`.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predicted_classes, references=labels)

# --- Rebuild the trainer, this time with a metrics callback ---
# Same arguments as before plus `compute_metrics`, so that the evaluation
# result carries an 'eval_accuracy' entry next to 'eval_loss'.
trainer = DistillationTrainer(
    model=student_model,
    teacher_model=teacher_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# --- Evaluate on eval_dataset ---
print("Evaluating the distilled student model...")
evaluation_results = trainer.evaluate()

# Report the two headline numbers.
print("\n--- Evaluation Results ---")
print(f"Accuracy: {evaluation_results['eval_accuracy']:.4f}")
print(f"Loss: {evaluation_results['eval_loss']:.4f}")

Downloading builder script: 0.00B [00:00, ?B/s]

  super().__init__(*args, **kwargs)


Evaluating the distilled student model...



--- Evaluation Results ---
Accuracy: 0.5150
Loss: nan


In [None]:
# Force an upgrade of all necessary libraries to their latest versions
!pip install -U transformers datasets accelerate bitsandbytes torch



In [None]:
from transformers import TrainingArguments

# --- Full run: three epochs with per-epoch logging, evaluation and checkpoints ---
training_args = TrainingArguments(
    output_dir="distilled_model_final_checkpoint",
    # Optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=True,
    # Batch sizes
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Logging / evaluation / checkpointing all happen once per epoch
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="wandb",
)

# --- Fresh trainer wired to the new arguments ---
trainer = DistillationTrainer(
    model=student_model,
    teacher_model=teacher_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# --- Run the training ---
print("Starting full distillation training...")
trainer.train()
print("Full training complete.")

  super().__init__(*args, **kwargs)


Starting full distillation training...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0,,0.515


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0,,0.515
2,0.0,,0.515
3,0.0,,0.515


Full training complete.


In [None]:
from transformers import TrainingArguments

# --- Full run with an explicit gradient-clipping threshold ---
# NOTE(review): this cell relies on DistillationTrainer, student_model,
# teacher_model, the datasets, tokenizer and compute_metrics from earlier
# cells being defined in the current session; the NameError recorded below
# this cell means they were not -- re-run the cells above first.
# NOTE(review): the TrainingArguments documentation lists 1.0 as the default
# for max_grad_norm, so spelling it out presumably does not change the run
# compared to the previous configuration -- TODO confirm.
training_args = TrainingArguments(
    output_dir="distilled_model_final_checkpoint",
    # Optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    max_grad_norm=1.0,  # gradient clipping threshold
    fp16=True,
    # Batch sizes
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Logging / evaluation / checkpointing once per epoch
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="wandb",
)

# --- Fresh trainer wired to the new arguments ---
trainer = DistillationTrainer(
    model=student_model,
    teacher_model=teacher_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# --- Run the training ---
print("Starting full distillation training with gradient clipping...")
trainer.train()
print("Full training complete.")

NameError: name 'DistillationTrainer' is not defined