In [1]:
import random
import numpy as np
import torch
from transformers import set_seed

In [2]:
# Draw a fresh seed in [0, 2**32) for this run and print it, so the value can
# be copied out of the cell output if the run needs to be repeated.
seed = random.randrange(2**32)
print(f"🔢 Using random seed: {seed}")

# Seed all RNGs: Python's `random`, NumPy's global RNG, and PyTorch.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
# Only touch the CUDA generators when a GPU is actually available.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
set_seed(seed)  # also seeds Hugging Face’s Trainer internals

🔢 Using random seed: 370370239


In [3]:
# Cell 1: Install dependencies (don’t upgrade CUDA‑linked packages)
!pip install transformers datasets evaluate box

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting box
  Downloading box-0.1.5-py3-none-any.whl.metadata (1.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting columnar==1.3.1 (from box)
  Downloading Columnar-1.3.1-py3-none-any.whl.metadata (11 kB)
Collecting executing==0.8.2 (from box)
  Downloading executing-0.8.2-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting loguru (from box)
  

In [8]:
# Cell 2: Imports
from datasets import load_dataset
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)
import numpy as np

In [5]:
# Cell 3: Config / Args
# All run settings live in this single dict; later cells look them up by key.
args = dict(
    # Checkpoint to fine-tune and the directory handed to TrainingArguments.
    model_name_or_path="distilbert-base-uncased",
    output_dir="./output/squad",
    # Tokenizer window: `max_length` and `stride` used when building features.
    max_seq_length=384,
    doc_stride=128,
    # Optimisation settings.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=3e-5,
    num_train_epochs=3,
    # Logging / checkpoint cadence, in steps.
    logging_steps=500,
    save_steps=500,
    # Reuse the seed drawn in the seeding cell.
    seed=seed,
    # set True if you switch to SQuAD v2
    version_2_with_negative=False,
)


In [6]:
# Cell 4: Prepare the SQuAD dataset
# Load the "squad" dataset (v1.1) and the metric registered under the same name.
raw_datasets = load_dataset("squad")
metric = evaluate.load("squad")

# Fast tokenizer for the configured checkpoint; the feature preparation below
# relies on its offset mapping and overflow handling.
tokenizer = AutoTokenizer.from_pretrained(args["model_name_or_path"], use_fast=True)

# Original columns (removed again when the splits are mapped to features).
column_names = raw_datasets["train"].column_names
# Names of the columns read during feature preparation.
question_col, context_col, answer_col = "question", "context", "answers"
# Whether the tokenizer pads on the right; decides the question/context order.
pad_on_right = tokenizer.padding_side == "right"

def prepare_train_features(examples):
    """Tokenize a batch of QA examples and label every feature with answer token positions.

    Question/context pairs are tokenized with truncation applied to the context
    side only and ``return_overflowing_tokens=True``, so one example can yield
    several features. Each feature receives a start/end label for the first
    listed answer. The label is the CLS token index when the example has no
    answer or when the answer is not fully contained in the feature's context
    window.

    Args:
        examples: Batch (dict of lists) containing the question, context and
            answers columns. Each answers entry provides ``answer_start`` and
            ``text`` lists; only their first element is used.

    Returns:
        The tokenizer output with ``overflow_to_sample_mapping`` and
        ``offset_mapping`` removed and ``start_positions`` / ``end_positions``
        added, one entry per feature.
    """
    tokenized = tokenizer(
        examples[question_col if pad_on_right else context_col],
        examples[context_col if pad_on_right else question_col],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=args["max_seq_length"],
        stride=args["doc_stride"],
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    # Map each tokenized feature back to the example it was cut from.
    sample_mapping = tokenized.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized.pop("offset_mapping")
    # Value of sequence_ids that marks context tokens (depends on pair order).
    context_id = 1 if pad_on_right else 0

    starts, ends = [], []
    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sequence_ids = tokenized.sequence_ids(i)
        answers = examples[answer_col][sample_mapping[i]]

        # No answer recorded for this example: point both labels at CLS.
        if len(answers["answer_start"]) == 0:
            starts.append(cls_index)
            ends.append(cls_index)
            continue

        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])

        # First and last token of the context inside this feature.
        context_start = 0
        while sequence_ids[context_start] != context_id:
            context_start += 1
        context_end = len(input_ids) - 1
        while sequence_ids[context_end] != context_id:
            context_end -= 1

        # Answer not fully inside this window: label with CLS.
        if not (offsets[context_start][0] <= start_char and offsets[context_end][1] >= end_char):
            starts.append(cls_index)
            ends.append(cls_index)
            continue

        # Both scans stay inside [context_start, context_end]. Tokens outside
        # the context (separator, padding) can carry a (0, 0) offset, which
        # satisfies `offset[0] <= start_char`; an unbounded scan would then run
        # past the context whenever the answer begins in its last token.
        token_start = context_start
        while token_start <= context_end and offsets[token_start][0] <= start_char:
            token_start += 1
        starts.append(token_start - 1)

        token_end = context_end
        while token_end >= context_start and offsets[token_end][1] >= end_char:
            token_end -= 1
        ends.append(token_end + 1)

    tokenized["start_positions"] = starts
    tokenized["end_positions"] = ends
    return tokenized

# Tokenize train & validation
# Both splits go through the same feature builder, in batches, and the raw
# columns are dropped so only the tokenized features remain.
train_dataset, eval_dataset = (
    raw_datasets[split_name].map(
        prepare_train_features,
        batched=True,
        remove_columns=column_names,
    )
    for split_name in ("train", "validation")
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [10]:
# Cell 5: Model, Data Collator, and Trainer Setup
# Question-answering model loaded from the configured checkpoint.
model = AutoModelForQuestionAnswering.from_pretrained(args["model_name_or_path"])
data_collator = DataCollatorWithPadding(tokenizer)

training_args = TrainingArguments(
    # Settings forwarded as-is from the config dict (keys match parameter names).
    **{
        key: args[key]
        for key in (
            "output_dir",
            "seed",
            "per_device_train_batch_size",
            "per_device_eval_batch_size",
            "learning_rate",
            "num_train_epochs",
            "logging_steps",
            "save_steps",
        )
    },
    # Evaluate and checkpoint once per epoch, then restore the best checkpoint.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",  # QA uses eval_loss
    overwrite_output_dir=True,
)

def compute_qa_metrics(p):
    """Placeholder for span-level SQuAD metrics; intentionally not implemented.

    Scoring with the SQuAD metric requires turning the model's start/end logits
    back into answer text per original example. That post-processing needs each
    evaluation feature's offset mapping and the id of the example it came from,
    and ``prepare_train_features`` in this notebook removes the offset mapping
    and does not keep example ids. Rather than failing with a ``NameError`` on
    undefined variables, this stub fails loudly with an explanation.

    Args:
        p: The evaluation prediction object the Trainer would pass in. Unused.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError(
        "compute_qa_metrics is a stub: span metrics need a post-processing step "
        "that maps start/end logits back to answer text, which requires "
        "evaluation features that keep their offset mapping and example ids. "
        "Build such features and predictions/references before enabling "
        "compute_metrics on the Trainer."
    )

# Assemble the Trainer from the model, arguments, tokenized splits and collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # `processing_class` replaces the deprecated `tokenizer` keyword of Trainer.
    # NOTE(review): needs a transformers release that accepts `processing_class`;
    # confirm against the installed version, since the install is unpinned.
    processing_class=tokenizer,
    data_collator=data_collator,
    # compute_metrics=compute_qa_metrics,   # optional for span metrics
)


Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [None]:
# Cell 6: Train & Evaluate
# Fine-tune with the Trainer built in the previous cell.
trainer.train()
# Run evaluation via the Trainer and print whatever metrics it returns.
metrics = trainer.evaluate()
print(metrics)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msunnysolomon8880[0m ([33msunnysolomon8880-cornell-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
