User Manual:

1. Make sure to restart the session first.
2. All the glue Tasks are in the glue_tasks array but since we don't have enough compute time we have to split up the glue_tasks.
3. Put the tasks that you are finetuning on in test_tasks array
4. Run all the cells and report your eval score in the shared google sheet

In [None]:
# Cell 1: Install dependencies (don’t upgrade CUDA‑linked packages)
!pip install transformers datasets evaluate box

In [1]:
import random
import numpy as np
import torch
from datasets import load_dataset
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    set_seed,
)
import numpy as np
import pandas as pd

In [2]:
seed = random.randrange(2**32)
print(f"🔢 Using random seed: {seed}")

# Seed all RNGs
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
set_seed(seed)  # also seeds Hugging Face’s Trainer internals

🔢 Using random seed: 1399827640


In [13]:
glue_tasks = [
    "cola", "sst2", "mrpc", "qqp", "mnli",
    "qnli", "rte", "stsb", "wnli"
]

base_args = {
    "model_name_or_path":          "SolomonSLee/TINYdistillBert",
    "max_seq_length":              128,
    "per_device_train_batch_size": 32,
    "per_device_eval_batch_size":  64,
    "learning_rate":               2e-5,
    "num_train_epochs":            3,
    "logging_steps":               50,
    "weight_decay":                0.01,
    "save_steps":                  500,
    "output_dir":                  "./glue-results",  # subfolders per task
}

In [14]:
all_results = {}
best_metrics = {
    "cola": "loss",
    "sst2": "accuracy",
    "mrpc": "f1",
    "qqp": "f1",
    "mnli": "accuracy",
    "qnli": "accuracy",
    "rte": "accuracy",
    "wnli": "accuracy",
    "stsb": "pearson",
}


for task in glue_tasks:
    print(f"\n===== TASK: {task.upper()} =====")
    args = base_args.copy()
    args["task_name"]  = task
    args["output_dir"] = f"{base_args['output_dir']}/{task}"

    # 1) Load data & metric
    ds     = load_dataset("glue", args["task_name"])
    metric = evaluate.load("glue", args["task_name"])

    # 2) Tokenizer & collator
    tokenizer     = AutoTokenizer.from_pretrained(args["model_name_or_path"])
    data_collator = DataCollatorWithPadding(tokenizer)

    # 3) Preprocess
    def preprocess_fn(ex):
      # Single‐sentence tasks
      if task in ("sst2", "cola"):
          return tokenizer(
              ex["sentence"],
              truncation=True,
              padding="max_length",
              max_length=args["max_seq_length"]
          )

      # STS‑B: sentence pair regression
      if task == "stsb":
          return tokenizer(
              ex["sentence1"], ex["sentence2"],
              truncation=True,
              padding="max_length",
              max_length=args["max_seq_length"]
          )

      # MNLI: "premise" + "hypothesis"
      if task == "mnli":
          return tokenizer(
              ex["premise"], ex["hypothesis"],
              truncation=True,
              padding="max_length",
              max_length=args["max_seq_length"]
          )

      # QNLI: "question" + "sentence"
      if task == "qnli":
          return tokenizer(
              ex["question"], ex["sentence"],
              truncation=True,
              padding="max_length",
              max_length=args["max_seq_length"]
          )

      # MRPC: "sentence1" + "sentence2"
      if task == "mrpc":
          return tokenizer(
              ex["sentence1"], ex["sentence2"],
              truncation=True,
              padding="max_length",
              max_length=args["max_seq_length"]
          )

      # QQP: "question1" + "question2"
      if task == "qqp":
          return tokenizer(
              ex["question1"], ex["question2"],
              truncation=True,
              padding="max_length",
              max_length=args["max_seq_length"]
          )

      # RTE & WNLI: also "sentence1" + "sentence2"
      if task in ("rte", "wnli"):
          return tokenizer(
              ex["sentence1"], ex["sentence2"],
              truncation=True,
              padding="max_length",
              max_length=args["max_seq_length"]
          )

      # Fallback (shouldn't hit if all tasks are covered)
      raise ValueError(f"Unrecognized GLUE task: {task}")


    encoded = ds.map(preprocess_fn, batched=True)

    # 4) Model
    num_labels = 1 if args["task_name"] == "stsb" else ds["train"].features["label"].num_classes
    model      = AutoModelForSequenceClassification.from_pretrained(
                     args["model_name_or_path"],
                     num_labels=num_labels
                 )

    # 5) TrainingArguments
    metric_name = best_metrics[task]

    training_args = TrainingArguments(
        output_dir=args["output_dir"],
        seed=seed,
        per_device_train_batch_size=args["per_device_train_batch_size"],
        per_device_eval_batch_size=args["per_device_eval_batch_size"],
        learning_rate=args["learning_rate"],
        num_train_epochs=args["num_train_epochs"],
        logging_steps=args["logging_steps"],
        save_steps=args["save_steps"],
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model = metric_name,
        overwrite_output_dir=True,
    )

    # 6) Metrics function
    def compute_metrics(p):
        logits, labels = p
        if task == "stsb":
            preds = np.squeeze(logits)
        else:
            preds = np.argmax(logits, axis=-1)
        return metric.compute(predictions=preds, references=labels)

    # 7) Trainer setup
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=encoded["train"],
        eval_dataset=(
            encoded["validation_matched"] if task == "mnli"
            else encoded["validation"]
        ),
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    # 8) Train & evaluate
    trainer.train()
    result = trainer.evaluate()
    all_results[task] = result



===== TASK: COLA =====


Map:   0%|          | 0/8551 [00:00<?, ? examples/s]

Map:   0%|          | 0/1043 [00:00<?, ? examples/s]

Map:   0%|          | 0/1063 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at SolomonSLee/TINYdistillBert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.6073,0.632908,0.0
2,0.6169,0.619562,0.0
3,0.5973,0.624405,0.0
4,0.5565,0.625896,0.125226
5,0.495,0.682273,0.139826
6,0.4545,0.734165,0.146654
7,0.4519,0.733712,0.118035
8,0.4172,0.813882,0.110487
9,0.3871,0.840645,0.09594
10,0.3533,0.891803,0.093079


In [10]:
# Cell 5: Summarize all task results
df = pd.DataFrame(all_results).T
display(df)

Unnamed: 0,eval_loss,eval_accuracy,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
mnli,0.85218,0.613449,33.6903,291.33,4.571,3.0
