In [1]:
!pip install --upgrade transformers datasets accelerate
!pip install "datasets<3.0.0"

Collecting transformers
  Downloading transformers-4.57.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting accelerate
  Downloading accelerate-1.12.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading transformers-4.57.2-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-4.4.1-py3-none-any.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading accelerate-1.12.0-py3-none-any.whl (380 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.9/3

In [2]:
import os

from transformers import (
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
)

from datasets import load_dataset, load_metric

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

In [3]:
# =====================================
# Cell 4 - Prepare tokenizer and data
# =====================================

# Create model directory to save to
model_dir = "./models/KDGPT/"
os.makedirs(model_dir, exist_ok=True)

# Define Teacher and Student models
student_id = "gpt2"
teacher_id = "gpt2-medium"

In [4]:
# Check tokenizers produce the same output
teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_id)
student_tokenizer = AutoTokenizer.from_pretrained(student_id)

sample = "Here is our sanity check."

assert teacher_tokenizer(sample) == student_tokenizer(sample), (
    "Tokenizers need to have the same output! "
    f"{teacher_tokenizer(sample)} != {student_tokenizer(sample)}"
)

# We do not need them separately anymore
del teacher_tokenizer
del student_tokenizer

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [5]:
tokenizer = AutoTokenizer.from_pretrained(teacher_id)

# GPT2 models do not have a pad token by default
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})


In [6]:
# Load dataset
dataset_id = "glue"
dataset_config = "sst2"

dataset = load_dataset(dataset_id, dataset_config)

Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [8]:
# ============================
# Cell 3 - Helper functions
# ============================

def process(examples):
    tokenized_inputs = tokenizer(
        examples["sentence"], truncation=True, max_length=256
    )
    return tokenized_inputs


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    acc = accuracy_metric.compute(
        predictions=predictions, references=labels
    )
    return {
        "accuracy": acc["accuracy"],
    }

In [9]:
# Tokenize dataset
tokenized_dataset = dataset.map(process, batched=True)
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

print(tokenized_dataset["test"].features)


Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

{'sentence': Value(dtype='string', id=None), 'labels': ClassLabel(names=['negative', 'positive'], id=None), 'idx': Value(dtype='int32', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}


In [10]:
# create label2id, id2label dicts for nice outputs for the model
labels = tokenized_dataset["train"].select(range(20000)).features["labels"].names

num_labels = len(labels)
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

In [11]:
class DistillationTrainingArguments(TrainingArguments):
    def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
        super().__init__(*args, **kwargs)

        self.alpha = alpha
        self.temperature = temperature

class DistillationTrainer(Trainer):
    def __init__(self, *args, teacher_model=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher = teacher_model
        # place teacher on same device as student
        self._move_model_to_device(self.teacher, self.model.device)
        self.teacher.eval()

    def compute_loss(self, model, inputs, return_outputs=False,num_items_in_batch=None, **kwargs):
        # compute student output
        outputs_student = model(**inputs)
        student_loss = outputs_student.loss

        # compute teacher output
        with torch.no_grad():
            outputs_teacher = self.teacher(**inputs)

        # assert size
        assert (
            outputs_student.logits.size() == outputs_teacher.logits.size()
        ), "Teacher and student logits must have the same shape"


        loss_function = nn.KLDivLoss(reduction="batchmean")

        loss_logits = loss_function(
            F.log_softmax(
                outputs_student.logits / self.args.temperature, dim=-1
            ),
            F.softmax(
                outputs_teacher.logits / self.args.temperature, dim=-1
            ),
        ) * (self.args.temperature ** 2)

        loss = (
            self.args.alpha * student_loss
            + (1.0 - self.args.alpha) * loss_logits
        )
        return (loss, outputs_student) if return_outputs else loss







In [12]:
# ====================================
# Cell 5 - Training arguments and models
# ====================================

# Use fp16 only if CUDA is available
use_fp16 = torch.cuda.is_available()

training_args = DistillationTrainingArguments(
    output_dir=model_dir,
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    fp16=use_fp16,
    learning_rate=6e-5,
    seed=8855,

    eval_steps=500,
    save_steps=500,
    logging_steps=100,

    report_to="none",

    alpha=0.5,
    temperature=4.0,
)



In [13]:
# define data_collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# define models
teacher_model = AutoModelForSequenceClassification.from_pretrained(
    teacher_id,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2-medium and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
student_model = AutoModelForSequenceClassification.from_pretrained(
    student_id,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:

# Resize token embeddings to account for the added pad token
teacher_model.resize_token_embeddings(len(tokenizer))
student_model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(50258, 768)

In [16]:
# Set pad token id in configs
teacher_model.config.pad_token_id = tokenizer.pad_token_id
student_model.config.pad_token_id = tokenizer.pad_token_id

# define metrics and metrics function
accuracy_metric = load_metric("accuracy")

  accuracy_metric = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

The repository for accuracy contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/accuracy.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


In [17]:
# ==========================
# Cell 6 - Train and save
# ==========================

trainer = DistillationTrainer(
    model=student_model,
    args=training_args,
    teacher_model=teacher_model,
    train_dataset=tokenized_dataset["train"].select(range(20000)),
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)



  super().__init__(*args, **kwargs)


In [18]:
trainer.train()

trainer.save_model(model_dir)
tokenizer.save_pretrained(model_dir)

print(f"Model and tokenizer saved to: {model_dir}")

Step,Training Loss
100,2.3429
200,1.3197
300,1.4036
400,1.2089
500,1.5385
600,1.3217
700,1.3921
800,1.2285
900,1.1301
1000,1.3375


Model and tokenizer saved to: ./models/KDGPT/


In [19]:
# ====================================
# Cell - Helper for side by side tests
# ====================================

import torch
import numpy as np

# Reuse the label names from earlier
label_names = labels

def compare_on_sentence(sentence):
    # Tokenize
    inputs = tokenizer(
        sentence,
        return_tensors="pt",
        truncation=True,
        max_length=256,
        padding=True,
    )

    # Move to same device as models
    device = next(student_model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        student_outputs = student_model(**inputs)
        teacher_outputs = teacher_model(**inputs)

    student_probs = torch.softmax(student_outputs.logits, dim=-1)[0].cpu().numpy()
    teacher_probs = torch.softmax(teacher_outputs.logits, dim=-1)[0].cpu().numpy()

    student_pred = int(np.argmax(student_probs))
    teacher_pred = int(np.argmax(teacher_probs))

    print("=" * 80)
    print(f"Sentence: {sentence}\n")

    print("Teacher model:")
    print(f"  Predicted label: {label_names[teacher_pred]}")
    print(f"  Probabilities: {[round(p, 3) for p in teacher_probs]}")

    print("\nStudent model:")
    print(f"  Predicted label: {label_names[student_pred]}")
    print(f"  Probabilities: {[round(p, 3) for p in student_probs]}")


In [20]:
# ====================================
# Cell - Try a few custom examples AFTER
# ====================================

compare_on_sentence("This movie was fantastic, I loved every minute of it.")
compare_on_sentence("The plot was boring and the acting was terrible.")
compare_on_sentence("It was okay, not great but not awful either.")
compare_on_sentence("One of the worst films I have ever seen.")
compare_on_sentence("Absolutely brilliant and very emotional.")


Sentence: This movie was fantastic, I loved every minute of it.

Teacher model:
  Predicted label: positive
  Probabilities: [np.float32(0.001), np.float32(0.999)]

Student model:
  Predicted label: positive
  Probabilities: [np.float32(0.005), np.float32(0.995)]
Sentence: The plot was boring and the acting was terrible.

Teacher model:
  Predicted label: positive
  Probabilities: [np.float32(0.0), np.float32(1.0)]

Student model:
  Predicted label: positive
  Probabilities: [np.float32(0.158), np.float32(0.842)]
Sentence: It was okay, not great but not awful either.

Teacher model:
  Predicted label: positive
  Probabilities: [np.float32(0.0), np.float32(1.0)]

Student model:
  Predicted label: positive
  Probabilities: [np.float32(0.006), np.float32(0.994)]
Sentence: One of the worst films I have ever seen.

Teacher model:
  Predicted label: positive
  Probabilities: [np.float32(0.0), np.float32(1.0)]

Student model:
  Predicted label: positive
  Probabilities: [np.float32(0.147), np

In [None]:
# ====================================
# Cell - Try a few custom examples Before
# ====================================

compare_on_sentence("This movie was fantastic, I loved every minute of it.")
compare_on_sentence("The plot was boring and the acting was terrible.")
compare_on_sentence("It was okay, not great but not awful either.")
compare_on_sentence("One of the worst films I have ever seen.")
compare_on_sentence("Absolutely brilliant and very emotional.")


Sentence: This movie was fantastic, I loved every minute of it.

Teacher model:
  Predicted label: positive
  Probabilities: [np.float32(0.419), np.float32(0.581)]

Student model:
  Predicted label: negative
  Probabilities: [np.float32(0.994), np.float32(0.006)]
Sentence: The plot was boring and the acting was terrible.

Teacher model:
  Predicted label: positive
  Probabilities: [np.float32(0.253), np.float32(0.747)]

Student model:
  Predicted label: negative
  Probabilities: [np.float32(0.991), np.float32(0.009)]
Sentence: It was okay, not great but not awful either.

Teacher model:
  Predicted label: positive
  Probabilities: [np.float32(0.298), np.float32(0.702)]

Student model:
  Predicted label: negative
  Probabilities: [np.float32(0.977), np.float32(0.023)]
Sentence: One of the worst films I have ever seen.

Teacher model:
  Predicted label: positive
  Probabilities: [np.float32(0.199), np.float32(0.801)]

Student model:
  Predicted label: negative
  Probabilities: [np.float3