In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [28]:
import os
import json
from datasets import Dataset
from transformers import BertTokenizerFast, BertForQuestionAnswering, Trainer, TrainingArguments, pipeline
import torch

os.environ["WANDB_DISABLED"] = "true"

dev_data_path = "/kaggle/input/stanfordquestionansweringdataset/dev-v1.1.json"
with open(dev_data_path, 'r') as f:
    dev_data = json.load(f)

train_data_path = "/kaggle/input/questionanswering/train-v1.1.json" 
with open(train_data_path, 'r') as f:
    train_data = json.load(f)

def prepare_data(data):
    contexts = []
    questions = []
    answers = []
    
    for item in data['data']:
        for para in item['paragraphs']:
            context = para['context']
            for qa in para['qas']:
                question = qa['question']
                if len(qa['answers']) > 0:
                    answer = qa['answers'][0]
                    contexts.append(context)
                    questions.append(question)
                    answers.append({
                        'text': answer['text'],
                        'answer_start': answer['answer_start']
                    })
    return Dataset.from_dict({
        'context': contexts,
        'question': questions,
        'answers': answers
    })

train_dataset = prepare_data(train_data)
dev_dataset = prepare_data(dev_data)

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

def preprocess_function(examples):
    tokenized_inputs = tokenizer(
        examples["question"],
        examples["context"],
        max_length=384,
        truncation="only_second",
        padding="max_length",
        return_offsets_mapping=True,
        return_tensors="pt"
    )

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(tokenized_inputs["offset_mapping"]):
        answer = examples["answers"][i]
        start_char = answer["answer_start"]
        end_char = start_char + len(answer["text"])

        sequence_ids = tokenized_inputs.sequence_ids(i)

        # Find the start and end token positions
        token_start = 0
        token_end = 0
        for idx, (start, end) in enumerate(offsets):
            if sequence_ids[idx] == 1:
                if start <= start_char < end:
                    token_start = idx
                if start < end_char <= end:
                    token_end = idx

        start_positions.append(token_start)
        end_positions.append(token_end)

    tokenized_inputs["start_positions"] = start_positions
    tokenized_inputs["end_positions"] = end_positions
    tokenized_inputs.pop("offset_mapping")  

    return tokenized_inputs

train_dataset = train_dataset.map(preprocess_function, batched=True)
dev_dataset = dev_dataset.map(preprocess_function, batched=True)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer,
)

trainer.train()

model.save_pretrained("./question_answering_model")
tokenizer.save_pretrained("./question_answering_model")

qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

question = "Who won Super Bowl 50?"
context = "Denver Broncos defeated the Carolina Panthers 24-10."
result = qa_pipeline(question=question, context=context)
print(f"Answer: {result['answer']}")

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss
1,1.0785,1.03683


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Answer: Denver Broncos


In [31]:
question = "Who did the Denver Broncos defeat in Super Bowl 50?"
context = "Denver Broncos defeated the Carolina Panthers 24-10."
result = qa_pipeline(question=question, context=context)
print(f"Answer: {result['answer']}")

Answer: Carolina Panthers
