In [None]:
!pip install transformers datasets evaluate box

In [None]:
import random
import numpy as np
import torch
from datasets import load_dataset
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    set_seed,
)
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader
import time

In [None]:
# Draw a new 32-bit seed on every run and print it.
seed = random.randrange(2**32)
print(f"🔢 Using random seed: {seed}")

# Seed all RNGs: Python's `random`, NumPy, then PyTorch, in that order.
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(seed)

# Seed every CUDA device as well when a GPU is available.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

# also seeds Hugging Face's Trainer internals
set_seed(seed)

In [None]:
# GLUE tasks the training loop iterates over; only STS-B is listed.
glue_tasks = ["stsb"]

# Default run configuration. The training loop copies this dict per task and
# overrides "model_name_or_path" and "output_dir" as it goes.
base_args = dict(
    model_name_or_path="SolomonSLee/TINYdistillBert",
    max_seq_length=128,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    learning_rate=2e-5,
    num_train_epochs=3,
    logging_steps=50,
    weight_decay=0.01,
    save_steps=500,
    output_dir="./glue-results",  # subfolders per task
)

In [None]:
# Evaluation results, keyed first by model display name and then by GLUE task:
#   all_results[model_name][task] -> dict returned by trainer.evaluate()
all_results = {}

# Value passed as `metric_for_best_model` for each GLUE task.
# NOTE(review): CoLA selects on "loss" while every other task selects on a
# GLUE metric name — confirm this is intended.
best_metrics = {
    "cola": "loss",
    "sst2": "accuracy",
    "mrpc": "f1",
    "qqp": "f1",
    "mnli": "accuracy",
    "qnli": "accuracy",
    "rte": "accuracy",
    "wnli": "accuracy",
    "stsb": "pearson",
}

# Display name -> model identifier for every model that gets fine-tuned.
model_variants = {
    "TinyDistilBERT":       "SolomonSLee/TINYdistillBert",
    "DistilBERT-base":      "distilbert-base-uncased",
    "BERT-base":            "bert-base-uncased",
}

# Dataset column(s) handed to the tokenizer for each task. One entry means a
# single-sentence task; two entries mean a sentence-pair task.
task_text_columns = {
    "cola": ("sentence",),
    "sst2": ("sentence",),
    "stsb": ("sentence1", "sentence2"),
    "mnli": ("premise", "hypothesis"),
    "qnli": ("question", "sentence"),
    "mrpc": ("sentence1", "sentence2"),
    "qqp":  ("question1", "question2"),
    "rte":  ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}


for model_name, model_path in model_variants.items():
    print(f"\n===== MODEL: {model_name} ({model_path}) =====")
    # Point the shared config at this model (in-place update of base_args).
    base_args["model_name_or_path"] = model_path

    # Storage for this model's per-task results.
    all_results[model_name] = {}

    for task in glue_tasks:
        print(f"\n===== TASK: {task.upper()} =====")
        args = base_args.copy()
        args["task_name"]  = task
        args["output_dir"] = f"{base_args['output_dir']}/{model_name}/{task}"

        # 1) Load data & metric
        ds     = load_dataset("glue", args["task_name"])
        metric = evaluate.load("glue", args["task_name"])

        # 2) Tokenizer & collator
        tokenizer     = AutoTokenizer.from_pretrained(args["model_name_or_path"])
        data_collator = DataCollatorWithPadding(tokenizer)

        # 3) Preprocess
        def preprocess_fn(ex):
            """Tokenize one batch of examples for the current `task`.

            Args:
                ex: batch passed in by `ds.map(..., batched=True)`; indexed by
                    the column names listed in `task_text_columns[task]`.

            Returns:
                The tokenizer output, truncated and padded to
                `args["max_seq_length"]`.

            Raises:
                ValueError: if `task` has no entry in `task_text_columns`.
            """
            if task not in task_text_columns:
                raise ValueError(f"Unrecognized GLUE task: {task}")
            texts = [ex[column] for column in task_text_columns[task]]
            return tokenizer(
                *texts,
                truncation=True,
                padding="max_length",
                max_length=args["max_seq_length"]
            )

        encoded = ds.map(preprocess_fn, batched=True)

        # 4) Model: STS-B is regression (one output); other tasks take the
        #    class count from the dataset's label feature.
        num_labels = 1 if args["task_name"] == "stsb" else ds["train"].features["label"].num_classes
        model      = AutoModelForSequenceClassification.from_pretrained(
                        args["model_name_or_path"],
                        num_labels=num_labels
                    )

        # 5) TrainingArguments
        metric_name = best_metrics[task]

        training_args = TrainingArguments(
            output_dir=args["output_dir"],
            seed=seed,
            per_device_train_batch_size=args["per_device_train_batch_size"],
            per_device_eval_batch_size=args["per_device_eval_batch_size"],
            learning_rate=args["learning_rate"],
            num_train_epochs=args["num_train_epochs"],
            logging_steps=args["logging_steps"],
            save_steps=args["save_steps"],
            eval_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model=metric_name,
            overwrite_output_dir=True,
        )

        # 6) Metrics function
        def compute_metrics(p):
            """Turn the (logits, labels) pair from the Trainer into GLUE metrics.

            For STS-B the trailing axis of the logits is dropped to get one
            score per example; for every other task the arg-max over the last
            axis is used as the predicted class.
            """
            logits, labels = p
            if task == "stsb":
                # Squeeze only the last axis so a one-example evaluation set
                # still yields a 1-D array instead of a scalar.
                preds = np.squeeze(logits, axis=-1)
            else:
                preds = np.argmax(logits, axis=-1)
            return metric.compute(predictions=preds, references=labels)

        # 7) Trainer setup (MNLI's validation split is named "validation_matched")
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=encoded["train"],
            eval_dataset=(
                encoded["validation_matched"] if task == "mnli"
                else encoded["validation"]
            ),
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics
        )

        # 8) Train & evaluate
        trainer.train()
        result = trainer.evaluate()
        # Store under the model's own dict; keying by task alone would let each
        # model overwrite the previous model's result.
        all_results[model_name][task] = result


In [None]:
# 1) Define a dict of "display name" -> model identifier or path
# NOTE(review): the "checkpoint-540" step number is hard-coded; presumably it
# is a checkpoint written by the training cell — confirm these directories exist.
model_variants = {
    "TinyDistilBERT (fine-tuned)": f"{base_args['output_dir']}/TinyDistilBERT/stsb/checkpoint-540",
    "BERT-base-uncased (fine-tuned)":          f"{base_args['output_dir']}/BERT-base/stsb/checkpoint-540",
    "DistilBERT-base (fine-tuned)":                f"{base_args['output_dir']}/DistilBERT-base/stsb/checkpoint-540",
}

# Limit PyTorch to a single CPU thread for the timing runs.
torch.set_num_threads(1)

# Number of un-timed forward passes run before the measurement starts.
n_warmup = 10

# The raw validation split does not depend on the model, so load it once.
raw_val = load_dataset("glue", "stsb")["validation"]

# 2) For each model: load tokenizer + model, build a DataLoader, warm up & time
for name, model_id in model_variants.items():
    print(f"\n=== Timing {name} ===")

    # ---- load tokenizer & model ----
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model     = AutoModelForSequenceClassification.from_pretrained(
                    model_id,
                    num_labels=1
                ).to("cpu").eval()

    # ---- prepare the dataset ----
    # (re-tokenize with this model's tokenizer so the IDs match its vocab)
    def tok_fn(ex):
        """Tokenize a batch of STS-B sentence pairs, padded/truncated to 128 tokens."""
        return tokenizer(
            ex["sentence1"], ex["sentence2"],
            truncation=True, padding="max_length", max_length=128
        )
    encoded = raw_val.map(tok_fn, batched=True)

    # drop every column that is not a model input, keep the rest as torch tensors
    cols = tokenizer.model_input_names
    val_ds = encoded.remove_columns(
        [c for c in encoded.column_names if c not in cols]
    )
    val_ds.set_format(type="torch", columns=cols)

    # batch_size=1, so len(loader) equals the number of examples
    loader = DataLoader(val_ds, batch_size=1, shuffle=False)

    # ---- warm-up (exactly n_warmup examples) ----
    # zip() stops once range() is exhausted, so no extra batch is run.
    with torch.no_grad():
        for i, batch in zip(range(n_warmup), loader):
            _ = model(**batch)

    # ---- timed inference ----
    start = time.perf_counter()
    with torch.no_grad():
        for batch in loader:
            _ = model(**batch)
    end = time.perf_counter()

    total = end - start
    per_ex = total / len(loader)
    print(f"Full-pass time: {total:.2f} s")
    print(f"Average / example: {per_ex:.4f} s")