In [None]:
!pip install transformers datasets accelerate -U
!pip install evaluate

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import evaluate
import numpy as np

# Base checkpoint used for both the tokenizer and the question-answering model.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Work on the first 5000 SQuAD training examples and hold out 10% of them.
# A fixed seed keeps the train/test split identical across kernel restarts,
# and the intermediate name avoids rebinding `raw_datasets` to a different
# kind of object within the same cell.
squad_subset = load_dataset("squad", split="train[:5000]")
raw_datasets = squad_subset.train_test_split(test_size=0.1, seed=42)

print(f"Loaded {len(raw_datasets['train'])} training examples.")

In [None]:
MAX_LENGTH = 384  # max_length passed to the tokenizer for every feature
STRIDE = 128      # stride passed to the tokenizer for overflowing context windows

def preprocess_function(examples):
    """Tokenize a batch of question/context pairs and label answer token spans.

    Long contexts are split into overlapping windows ("features"), so one
    example may yield several features. Each feature gets a
    ``start_positions`` / ``end_positions`` pair:

    * the token indices of the first answer when it lies fully inside the
      feature's context window;
    * ``(0, 0)`` when the example has no answer or the answer is not fully
      contained in this window.

    Args:
        examples: Batch dict with ``"question"``, ``"context"`` and
            ``"answers"`` columns; each answer is a dict with ``"text"`` and
            ``"answer_start"`` lists.

    Returns:
        The tokenizer output (with ``offset_mapping`` and
        ``overflow_to_sample_mapping`` removed) extended with
        ``start_positions`` and ``end_positions`` lists, one entry per feature.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=MAX_LENGTH,
        truncation="only_second",
        stride=STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Feature index -> index of the example it was cut from.
    sample_map = inputs.pop("overflow_to_sample_mapping")
    offset_mapping = inputs.pop("offset_mapping")
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        answer = examples["answers"][sample_map[i]]

        # Unanswerable example: label with (0, 0).
        if len(answer["text"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # Locate the first and last tokens that belong to the context
        # (sequence id 1). The end must be the last *context* token, not the
        # last token of the padded sequence: padding and special tokens carry
        # (0, 0) offsets, which would break the span search below.
        sequence_ids = inputs.sequence_ids(i)
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

        # The answer is not fully inside this window: label with (0, 0).
        if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Walk forward to the last token starting at or before the answer start.
        start_token = context_start
        while start_token <= context_end and offsets[start_token][0] <= start_char:
            start_token += 1
        start_positions.append(start_token - 1)

        # Walk backward to the first token ending at or after the answer end.
        end_token = context_end
        while end_token >= context_start and offsets[end_token][1] >= end_char:
            end_token -= 1
        end_positions.append(end_token + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Run the preprocessing over every split in batches. The original columns are
# dropped because one example can expand into several features, so the row
# counts would no longer line up.
tokenized_datasets = raw_datasets.map(
    function=preprocess_function,
    remove_columns=raw_datasets["train"].column_names,
    batched=True,
)

In [None]:
from transformers import TrainingArguments, Trainer, default_data_collator
import torch

# Mixed-precision switch forwarded to TrainingArguments below.
use_fp16 = False

training_args = TrainingArguments(
    # Where checkpoints go, and how often / where to log.
    output_dir="./qa_bert_results",
    logging_steps=100,
    report_to="none",
    # Optimisation schedule.
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Batching and precision.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    fp16=use_fp16,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

import inspect

# Evaluate on at most 500 features; clamp so a smaller test split does not
# make `select` raise an out-of-range error.
eval_features = tokenized_datasets["test"]
eval_subset = eval_features.select(range(min(500, len(eval_features))))

# Recent transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword; older releases only know `tokenizer`.
# Pick whichever keyword the installed Trainer accepts.
tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=eval_subset,
    data_collator=default_data_collator,
    **{tokenizer_kwarg: tokenizer},
)

print("\n" + "="*50)
print("  Starting BERT Q&A Fine-Tuning")
print("="*50 + "\n")

trainer.train()

# Write the model and tokenizer to disk first, and only then report success,
# so the message is never printed when saving fails.
trainer.save_model("./final_bert_qa_model")
tokenizer.save_pretrained("./final_bert_qa_model")
print("\nFine-Tuning Complete! Model is now saved.")

In [None]:
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer

# Directory the fine-tuned model and tokenizer are loaded from.
MODEL_PATH = "./final_bert_qa_model"

# Question-answering pipeline built from the saved model and tokenizer.
qa_pipeline = pipeline(
    task="question-answering",
    tokenizer=AutoTokenizer.from_pretrained(MODEL_PATH),
    model=AutoModelForQuestionAnswering.from_pretrained(MODEL_PATH),
)

def answer_question_from_pdf(question, context):
    """Answer ``question`` from ``context`` using the module-level ``qa_pipeline``.

    Prints the question, the predicted answer and its confidence score, then
    returns the answer.

    Args:
        question: Question text.
        context: Passage to search for the answer (the PDF text).

    Returns:
        The predicted answer string, or a string starting with ``"Error:"``
        when the context or the question is empty or whitespace-only.
    """
    # Whitespace-only text carries nothing to search, so treat it like empty.
    if not context or (isinstance(context, str) and not context.strip()):
        return "Error: Context (PDF text) is empty."
    if not question or (isinstance(question, str) and not question.strip()):
        return "Error: Question is empty."

    # Pass question/context as keywords rather than a positional dict, which
    # is the legacy SQuAD-example call form of the QA pipeline.
    result = qa_pipeline(question=question, context=context)

    print("\n--- Q&A Result ---")
    print(f"Question: {question}")
    print(f"Predicted Answer: {result['answer']}")
    print(f"Confidence Score: {result['score']:.4f}")
    print("------------------")

    return result['answer']

# Short inline passage used as the context for a quick demonstration.
sample_context = """
BERT was invented by Google and uses a Transformer architecture. The model
fine-tuned here was only trained for a short period of 3 epochs to conserve
time, but is now ready for demonstration.
"""
q_test = "How long was the model trained for?"

# Kept as the cell's last expression so the returned answer is also displayed.
answer_question_from_pdf(question=q_test, context=sample_context)