In [1]:
pip install transformers datasets torch

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.8 MB/s[0m eta [36m0:00

## Load Pre-trained BERT Model and Tokenizer

In [2]:
from transformers import BertTokenizer, BertForQuestionAnswering

# The tokenizer and the question-answering model are both loaded from the
# same SQuAD-fine-tuned BERT checkpoint, so the name is written only once.
QA_CHECKPOINT = "bert-large-uncased-whole-word-masking-finetuned-squad"
tokenizer = BertTokenizer.from_pretrained(QA_CHECKPOINT)
model = BertForQuestionAnswering.from_pretrained(QA_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Input Context and Question

In [3]:
# Passage the model will search for an answer. The leading and trailing
# newlines are part of the text handed to the tokenizer.
context = (
    "\n"
    "BERT, which stands for Bidirectional Encoder Representations from Transformers,\n"
    "is a transformer-based machine learning model for NLP tasks. It was developed by\n"
    "Google and open-sourced in 2018. BERT has achieved state-of-the-art performance\n"
    "on various NLP benchmarks.\n"
)

# Question to answer from the passage above.
question = "Who developed BERT?"

## Tokenize the Input Data

In [4]:
# Encode the (question, context) pair as one sequence, padded or truncated
# to exactly 512 tokens and returned as PyTorch tensors.
inputs = tokenizer(
    question,
    context,
    max_length=512,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
)

# Keep direct handles on the two tensors used by the following cells.
input_ids, attention_mask = inputs["input_ids"], inputs["attention_mask"]

## Predictions with the Model

In [6]:
import torch

# Inference only: no_grad() skips building the autograd graph.
with torch.no_grad():
    # token_type_ids mark which tokens belong to the question (segment 0) and
    # which to the context (segment 1). If they are omitted the model falls
    # back to all zeros and cannot tell the two segments apart, so pass the
    # tokenizer's segment ids along with the ids and the mask.
    outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        token_type_ids=inputs["token_type_ids"],
    )

# Per-token scores for being the first / last token of the answer span.
start_logits = outputs.start_logits
end_logits = outputs.end_logits

## Extract the Answer from the Tokenized Output
The predicted answer spans from the token with the highest start logit to the token with the highest end logit.



In [7]:
# argmax() without a dim indexes the flattened logits; that equals the token
# position here because the batch holds a single example.
start_index = start_logits.argmax()
end_index = end_logits.argmax() + 1  # slice end is exclusive, so step past the last answer token

# Turn the selected token ids back into text.
answer_ids = input_ids[0, start_index:end_index]
answer = tokenizer.decode(answer_ids)
print(f"Answer: {answer}")

Answer: google


## Fine-tuning on the SQuAD QA Dataset

In [8]:
from datasets import load_dataset

# Fetch the SQuAD question-answering dataset from the Hugging Face Hub.
# The result is indexed by split name; the "train" split is tokenized
# further down in this notebook.
squad = load_dataset("squad")

README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

## Tokenize the Data with Answer Positions

In [13]:
from transformers import BertTokenizerFast

# Load a fast tokenizer: the preprocessing below asks for
# return_offsets_mapping=True, which only the fast tokenizers provide.
# NOTE: this rebinds the global `tokenizer` that was created earlier with
# BertTokenizer, so later cells see the fast tokenizer.
# NOTE(review): the checkpoint name differs from the model's checkpoint;
# presumably both share the same uncased BERT vocabulary -- confirm.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

def add_token_positions(examples):
    """Tokenize a batch of SQuAD-style examples and label each answer's token span.

    Every question/context pair is encoded with the module-level fast
    ``tokenizer`` (padded/truncated to 512 tokens). The answer's character
    span is then converted to token indices through the offset mapping,
    considering context tokens only.

    Args:
        examples: Batch mapping with parallel lists under ``"question"``,
            ``"context"`` and ``"answers"``. Each ``answers`` entry holds
            ``"answer_start"`` and ``"text"`` lists; only their first
            element is used.

    Returns:
        dict with the keys ``"input_ids"``, ``"attention_mask"``,
        ``"start_positions"`` and ``"end_positions"``, one list entry per
        example. If an example has no answer, or the answer is not fully
        inside the (possibly truncated) context, both positions are 0,
        the index of the leading ``[CLS]`` token.
    """
    cls_index = 0  # BERT encodings always start with [CLS]

    input_ids_list = []
    attention_mask_list = []
    start_positions_list = []
    end_positions_list = []

    for question, context, answers in zip(
        examples["question"], examples["context"], examples["answers"]
    ):
        # Tokenize the question and context together
        inputs = tokenizer(
            question, context,
            max_length=512, truncation=True, padding="max_length",
            return_offsets_mapping=True,
        )

        token_start = token_end = None
        if answers["answer_start"]:
            # Character span of the (first) answer inside the context string
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # sequence_ids() is 0 for question tokens, 1 for context tokens
            # and None for special/padding tokens. Offsets of question tokens
            # are relative to the question string, so they must be skipped or
            # they can spuriously match the answer's character positions.
            sequence_ids = inputs.sequence_ids()
            for idx, (start, end) in enumerate(inputs["offset_mapping"]):
                if sequence_ids[idx] != 1:
                    continue
                if start <= start_char < end:
                    token_start = idx
                if start < end_char <= end:
                    token_end = idx
                    break

        # A span that is missing or only partly inside the encoded context
        # cannot be used as a label; point both ends at [CLS] instead of
        # mixing a real start with a fallback end.
        if token_start is None or token_end is None:
            token_start = token_end = cls_index

        # Append data to lists
        input_ids_list.append(inputs["input_ids"])
        attention_mask_list.append(inputs["attention_mask"])
        start_positions_list.append(token_start)
        end_positions_list.append(token_end)

    # Return the batch as a dictionary
    return {
        "input_ids": input_ids_list,
        "attention_mask": attention_mask_list,
        "start_positions": start_positions_list,
        "end_positions": end_positions_list,
    }

# Tokenize the training split batch by batch. The "id" and "title" columns
# are dropped; every other original column is kept next to the new fields.
train_data = squad["train"].map(
    add_token_positions,
    batched=True,
    remove_columns=["id", "title"],
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

## DataLoader for Training

In [16]:
import torch
from torch.utils.data import DataLoader

# Custom collate function to convert lists to tensors
def collate_fn(batch):
    """Stack a list of tokenized examples into int64 tensors.

    Args:
        batch: Sequence of per-example mappings that each provide
            ``"input_ids"``, ``"attention_mask"``, ``"start_positions"``
            and ``"end_positions"``. Any other keys are ignored.

    Returns:
        dict mapping each of those four keys to a ``torch.long`` tensor
        whose first dimension indexes the examples of the batch.
    """
    field_names = ("input_ids", "attention_mask", "start_positions", "end_positions")
    return {
        name: torch.tensor([example[name] for example in batch], dtype=torch.long)
        for name in field_names
    }

# Serve the tokenized training set in shuffled mini-batches of 8 examples,
# using collate_fn to turn each list of examples into tensors.
train_dataloader = DataLoader(
    dataset=train_data,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_fn,
)

## Fine-tuning the BERT Model

In [None]:
# transformers.AdamW is deprecated and no longer shipped by newer transformers
# releases, so use the PyTorch implementation instead.
from torch.optim import AdamW

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# eps and weight_decay are set to the defaults of the former
# transformers.AdamW so the optimizer keeps the same hyper-parameters.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

for epoch in range(3):  # Train for 3 epochs
    model.train()  # enable dropout etc. for training
    total_loss = 0

    for batch in train_dataloader:
        # Move input tensors to the device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        start_pos = batch["start_positions"].to(device)
        end_pos = batch["end_positions"].to(device)

        # Forward pass; supplying the gold start/end positions makes the
        # model return the span-prediction loss as outputs.loss.
        # NOTE(review): no token_type_ids are passed, so the model uses its
        # all-zero default for both question and context -- confirm this is
        # intended.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            start_positions=start_pos,
            end_positions=end_pos,
        )

        # Accumulate the batch loss as a Python float for the epoch average
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass and optimization; gradients are cleared first so
        # they do not accumulate across batches.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Mean loss over all batches of this epoch
    print(f"Epoch {epoch + 1} - Loss: {total_loss / len(train_dataloader):.4f}")

## Ensures that start and end positions are correctly defined for fine-tuning.