In [1]:
# Cell 1: Install dependencies (don’t upgrade CUDA‑linked packages)
!pip install --upgrade transformers datasets evaluate box



In [6]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    default_data_collator,
    set_seed,
)
from transformers.data.metrics.squad_metrics import compute_predictions_logits
import evaluate
import random
import torch
import numpy as np

In [7]:
# Draw a fresh 32-bit seed for this run and echo it so the run can be replayed.
seed = random.randrange(2**32)
print(f"🔢 Using random seed: {seed}")

# Seed each RNG explicitly: Python, NumPy, then PyTorch (CPU and, if present, all GPUs).
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

# Finally let Hugging Face seed whatever it manages itself.
set_seed(seed)

🔢 Using random seed: 3097302186


In [8]:
# Cell 3: Config / Args
# Single dictionary holding every tunable for the run.
args = dict(
    # Checkpoint to fine-tune and where to write results
    model_name_or_path="SolomonSLee/TINYdistillBert",
    output_dir="./output/squad",
    # Tokenization window: maximum length and overlap between windows
    max_seq_length=384,
    doc_stride=128,
    # Optimisation
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=3e-5,
    num_train_epochs=3,
    # Logging / checkpoint cadence
    logging_steps=500,
    save_steps=500,
    # Reproducibility
    seed=seed,
    # set True if you switch to SQuAD v2
    version_2_with_negative=False,
)



In [9]:
# Cell 4: Prepare the SQuAD dataset
# SQuAD v1.1; the raw validation rows are kept around for post-processing.
raw_datasets = load_dataset("squad")
valid_examples = raw_datasets["validation"]
# EM & F1 scorer
metric = evaluate.load("squad")

tokenizer = AutoTokenizer.from_pretrained(args["model_name_or_path"], use_fast=True)

# Column layout of the raw dataset
column_names = raw_datasets["train"].column_names
question_col, context_col, answer_col = "question", "context", "answers"

# When the tokenizer pads on the right, the question goes first and the context second.
pad_on_right = tokenizer.padding_side == "right"

def prepare_train_features(examples):
    """Tokenize a batch of QA examples into fixed-length training features.

    Each question/context pair is split into overlapping windows of
    ``args["max_seq_length"]`` tokens (overlap ``args["doc_stride"]``), so one
    example may yield several features. For every feature the token indices of
    the first listed answer are stored in ``start_positions`` /
    ``end_positions``; when the example has no answer, or the answer does not
    lie entirely inside the window, both point at the CLS token.

    Args:
        examples: batch dict with the question, context and answers columns.

    Returns:
        The tokenizer output (without ``overflow_to_sample_mapping`` and
        ``offset_mapping``) extended with ``start_positions`` and
        ``end_positions``.
    """
    if pad_on_right:
        first_col, second_col = question_col, context_col
    else:
        first_col, second_col = context_col, question_col

    encoded = tokenizer(
        examples[first_col],
        examples[second_col],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=args["max_seq_length"],
        stride=args["doc_stride"],
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # feature index -> index of the example (within this batch) it came from
    feature_to_example = encoded.pop("overflow_to_sample_mapping")
    all_offsets = encoded.pop("offset_mapping")

    # sequence id carried by context tokens
    context_id = 1 if pad_on_right else 0

    start_positions, end_positions = [], []
    for feat_idx, offsets in enumerate(all_offsets):
        token_ids = encoded["input_ids"][feat_idx]
        cls_pos = token_ids.index(tokenizer.cls_token_id)
        seq_ids = encoded.sequence_ids(feat_idx)
        gold = examples[answer_col][feature_to_example[feat_idx]]

        # No annotated answer: label the feature with the CLS position.
        if len(gold["answer_start"]) == 0:
            start_positions.append(cls_pos)
            end_positions.append(cls_pos)
            continue

        # Character span of the first answer inside the context.
        answer_begin = gold["answer_start"][0]
        answer_stop = answer_begin + len(gold["text"][0])

        # First and last token of the context part of this window.
        ctx_first = 0
        while seq_ids[ctx_first] != context_id:
            ctx_first += 1
        ctx_last = len(token_ids) - 1
        while seq_ids[ctx_last] != context_id:
            ctx_last -= 1

        answer_in_window = (
            offsets[ctx_first][0] <= answer_begin
            and offsets[ctx_last][1] >= answer_stop
        )
        if not answer_in_window:
            start_positions.append(cls_pos)
            end_positions.append(cls_pos)
            continue

        # Walk inwards from both ends of the context until the answer is bracketed.
        while ctx_first < len(offsets) and offsets[ctx_first][0] <= answer_begin:
            ctx_first += 1
        start_positions.append(ctx_first - 1)
        while offsets[ctx_last][1] >= answer_stop:
            ctx_last -= 1
        end_positions.append(ctx_last + 1)

    encoded["start_positions"] = start_positions
    encoded["end_positions"] = end_positions
    return encoded

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [10]:
# Cell 5: Tokenize train & validation into features
# Both splits go through the same feature builder; the raw columns are dropped
# so only model inputs and labels remain.
column_names = raw_datasets["train"].column_names
train_dataset, valid_features = (
    raw_datasets[split].map(
        prepare_train_features,
        batched=True,
        remove_columns=column_names,
    )
    for split in ("train", "validation")
)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [11]:
# Cell 6: Model, Data Collator, and Trainer Setup
# Load the checkpoint named in the config with a question-answering head.
model = AutoModelForQuestionAnswering.from_pretrained(args["model_name_or_path"])

def _validation_feature_index():
    """Recover, for every validation feature, its source example and offsets.

    ``valid_features`` no longer carries ``offset_mapping`` or the
    feature-to-example mapping, so the validation split is re-tokenized with
    the same settings used to build the features. Features come out in the
    same order (examples in order, windows in order), so index ``i`` here is
    assumed to line up with row ``i`` of ``valid_features``; the caller checks
    the lengths agree.

    Returns:
        (sample_mapping, offsets) where ``sample_mapping[i]`` is the index of
        the validation example behind feature ``i`` and ``offsets[i][t]`` is
        the ``(start_char, end_char)`` span of token ``t`` in the context, or
        ``None`` for tokens that are not part of the context.
    """
    questions = list(valid_examples[question_col])
    contexts = list(valid_examples[context_col])
    encoded = tokenizer(
        questions if pad_on_right else contexts,
        contexts if pad_on_right else questions,
        truncation="only_second" if pad_on_right else "only_first",
        max_length=args["max_seq_length"],
        stride=args["doc_stride"],
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    context_id = 1 if pad_on_right else 0
    offsets = [
        [
            span if seq_id == context_id else None
            for span, seq_id in zip(spans, encoded.sequence_ids(feat_idx))
        ]
        for feat_idx, spans in enumerate(encoded["offset_mapping"])
    ]
    return encoded["overflow_to_sample_mapping"], offsets


def compute_metrics(p):
    """Turn start/end logits into answer strings and score them with ``metric``.

    The previous implementation called ``compute_predictions_logits`` with
    keyword names it does not accept and with ``datasets`` rows instead of the
    ``SquadExample`` / ``SquadFeatures`` / ``SquadResult`` objects it expects.
    This version does the span selection directly: for each feature the top
    ``n_best_size`` start and end positions are combined, spans outside the
    context or longer than ``max_answer_length`` tokens are dropped, and the
    highest-scoring span across all features of an example becomes its answer.
    Only answerable (SQuAD v1.1-style) data is handled: no null answer is
    ever predicted.

    Args:
        p: evaluation output whose ``predictions`` starts with
            ``(start_logits, end_logits)``, one row per validation feature.

    Returns:
        ``{"em": exact_match, "f1": f1}`` as computed by ``metric``.

    Raises:
        ValueError: if the number of prediction rows does not match the number
            of re-tokenized validation features.
    """
    n_best_size = 20
    max_answer_length = 30

    start_logits, end_logits = p.predictions[:2]
    sample_mapping, offset_mapping = _validation_feature_index()
    if len(sample_mapping) != len(start_logits):
        raise ValueError(
            f"Got {len(start_logits)} prediction rows for "
            f"{len(sample_mapping)} validation features."
        )

    contexts = list(valid_examples[context_col])
    example_ids = list(valid_examples["id"])

    # example index -> (best score so far, answer text)
    best = {}
    for feat_idx, example_idx in enumerate(sample_mapping):
        offsets = offset_mapping[feat_idx]
        feat_start = start_logits[feat_idx]
        feat_end = end_logits[feat_idx]
        # Indices of the n-best logits, highest first.
        start_candidates = np.argsort(feat_start)[-1 : -n_best_size - 1 : -1].tolist()
        end_candidates = np.argsort(feat_end)[-1 : -n_best_size - 1 : -1].tolist()
        for start_tok in start_candidates:
            for end_tok in end_candidates:
                # Skip positions outside the tokenized window or outside the context.
                if start_tok >= len(offsets) or end_tok >= len(offsets):
                    continue
                if offsets[start_tok] is None or offsets[end_tok] is None:
                    continue
                # Skip reversed or over-long spans.
                if end_tok < start_tok or end_tok - start_tok + 1 > max_answer_length:
                    continue
                score = feat_start[start_tok] + feat_end[end_tok]
                if example_idx not in best or score > best[example_idx][0]:
                    char_start = offsets[start_tok][0]
                    char_end = offsets[end_tok][1]
                    best[example_idx] = (score, contexts[example_idx][char_start:char_end])

    # format them for the metric; examples with no valid span get an empty answer
    formatted_preds = [
        {"id": example_id, "prediction_text": best[idx][1] if idx in best else ""}
        for idx, example_id in enumerate(example_ids)
    ]
    references = [
        {"id": ex["id"], "answers": ex["answers"]}
        for ex in valid_examples
    ]
    results = metric.compute(
        predictions=formatted_preds,
        references=references
    )
    return {"em": results["exact_match"], "f1": results["f1"]}

# Batches are assembled with the tokenizer's padding collator.
data_collator = DataCollatorWithPadding(tokenizer)

# Evaluation and checkpointing both happen once per epoch, so the best
# checkpoint (by F1) can be restored when training ends.
# Note: args["save_steps"] is not passed here; with save_strategy="epoch"
# checkpoints are written per epoch.
training_args = TrainingArguments(
    output_dir=args["output_dir"],
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=args["learning_rate"],
    per_device_train_batch_size=args["per_device_train_batch_size"],
    per_device_eval_batch_size=args["per_device_eval_batch_size"],
    num_train_epochs=args["num_train_epochs"],
    logging_steps=args["logging_steps"],
    seed=args["seed"],
    load_best_model_at_end=True,
    metric_for_best_model="f1",       # or "em"
    overwrite_output_dir=True,
)

# `processing_class` replaces the deprecated `tokenizer` keyword of Trainer,
# which emitted a FutureWarning when this cell ran.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_features,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at SolomonSLee/TINYdistillBert and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [None]:
# Fine-tune the model, then run one more evaluation pass on the validation
# features and print the resulting metrics dictionary.
trainer.train()
metrics = trainer.evaluate()
print(metrics)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33msunnysolomon8880[0m ([33msunnysolomon8880-cornell-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
