Data Collection
---

In [15]:
import json
import requests
from transformers import BertTokenizer

In [16]:
# Function to download the SQuAD dataset
def download_squad_data():
    #url = "https://github.com/rajpurkar/SQuAD-explorer/blob/master/dataset/train-v1.1.json"
    url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json"  # SQuAD 1.1 URL
    response = requests.get(url)
    squad_data = response.json()
    return squad_data

# Function to preprocess the SQuAD data and create QA pairs
def preprocess_squad_data(squad_data, num_examples):  # You can adjust the number of examples as needed
    tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")
    qa_pairs = []
    for article in squad_data["data"]:
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]
            for qas in paragraph["qas"]:
                question = qas["question"]
                if qas["answers"]:
                    answer_text = qas["answers"][0]["text"]  # We'll consider only the first answer
                    answer_start = qas["answers"][0]["answer_start"]
                    answer_end = answer_start + len(answer_text)
                else:
                    # If there are no answers, set them to -1
                    answer_text = ""
                    answer_start = -1
                    answer_end = -1
                
                # Tokenize the context and question
                inputs = tokenizer.encode_plus(question, context, add_special_tokens=True, return_tensors="pt", max_length=750,truncation=True)
                input_ids = inputs["input_ids"].flatten()
                attention_mask = inputs["attention_mask"].flatten()
                
                qa_pairs.append({
                    "context": context,
                    "question": question,
                    "answer_text": answer_text,
                    "answer_start": answer_start,
                    "answer_end": answer_end,
                    "input_ids": input_ids.tolist(),
                    "attention_mask": attention_mask.tolist()
                })
                
                # Stop after processing the desired number of examples
                if len(qa_pairs) >= num_examples:
                    return qa_pairs
    
    return qa_pairs

# Download the SQuAD data
squad_data = download_squad_data()

# Preprocess a smaller subset of the SQuAD data and create QA pairs
num_examples = 5000  # Change this number to control the size of the dataset
qa_pairs = preprocess_squad_data(squad_data, num_examples)

# Optional: Save the preprocessed QA pairs to a JSON file
with open("squad_lite_qa_pairs.json", "w", encoding="utf-8") as f:
    json.dump(qa_pairs, f, ensure_ascii=False, indent=2)


Tokenization data
---

In [17]:
# Step 2: Tokenization using BERT tokenizer
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")

# Tokenize the QA pairs for model input
tokenized_qa_pairs = []
max_length = 750 # Set the maximum sequence length

for qa_pair in qa_pairs:
    # Tokenize context and question separately
    tokenized_context = tokenizer.encode(qa_pair["context"], add_special_tokens=False,max_length=750,truncation=True)
    tokenized_question = tokenizer.encode(qa_pair["question"], add_special_tokens=False,max_length=750,truncation=True)

    # Combine context and question tokens with [SEP] token in between
    input_ids = tokenized_context + [tokenizer.sep_token_id] + tokenized_question

    # Create attention mask where 1's indicate tokens and 0's indicate padding
    attention_mask = [1] * len(input_ids)

    # Ensure the input sequence is within model's maximum length
    if len(input_ids) > max_length:
        input_ids = input_ids[:max_length]
        attention_mask = attention_mask[:max_length]

    # Padding
    padding_length = max_length - len(input_ids)
    input_ids = input_ids + [tokenizer.pad_token_id] * padding_length
    attention_mask = attention_mask + [0] * padding_length

    # Record the start and end position of the answer within the tokenized input
    answer_start = min(qa_pair["answer_start"], max_length - 1)
    answer_end = min(qa_pair["answer_end"], max_length - 1)

    tokenized_qa_pairs.append({
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "answer_start": answer_start,
        "answer_end": answer_end
    })

Fine-Tuning Model
---

In [18]:
import torch
from transformers import BertForQuestionAnswering, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset, random_split
from tqdm import tqdm


In [20]:

# Step 3: Fine-tuning the Model
 
# Define a custom dataset class for tokenized QA pairs
class QADataset(Dataset):
    def __init__(self, qa_pairs):
        self.qa_pairs = qa_pairs

    def __len__(self):
        return len(self.qa_pairs)

    def __getitem__(self, idx):
        return {
            "input_ids": torch.tensor(self.qa_pairs[idx]["input_ids"]),
            "attention_mask": torch.tensor(self.qa_pairs[idx]["attention_mask"]),
            "start_positions": torch.tensor(self.qa_pairs[idx]["answer_start"]),
            "end_positions": torch.tensor(self.qa_pairs[idx]["answer_end"])
        }
    

# Prepare the fine-tuning data
train_size = int(0.8 * len(tokenized_qa_pairs))
train_dataset = QADataset(tokenized_qa_pairs[:train_size])  # tokenized_qa_pairs from Step 2
val_dataset = QADataset(tokenized_qa_pairs[train_size:])


# Define batch size and create DataLoaders
batch_size = 16
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [21]:
import gc

gc.collect()

torch.cuda.empty_cache()

In [6]:
# Load the pre-trained BERT model for Question Answering and move it to GPU
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define the optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_dataloader) * 10  # You can change the number of epochs (here 3)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Fine-tuning loop
for epoch in range(6):  
    model.train()
    for batch in tqdm(train_dataloader, desc="Epoch {}".format(epoch + 1)):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        start_positions = batch["start_positions"].to(device)
        end_positions = batch["end_positions"].to(device)

        model.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

# Save the fine-tuned model
model.save_pretrained("fine_tuned_qa_model")
tokenizer.save_pretrained("fine_tuned_qa_model")

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|█████████████████████████████████████████████████████████████████████| 500/500 [1:12:51<00:00,  8.74s/it]
Epoch 2: 100%|█████████████████████████████████████████████████████████████████████| 500/500 [1:11:38<00:00,  8.60s/it]
Epoch 3: 100%|█████████████████████████████████████████████████████████████████████| 500/500 [1:11:32<00:00,  8.59s/it]
Epoch 4: 100%|█████████████████████████████████████████████████████████████████████| 500/500 [1:13:30<00:00,  8.82s/it]
Epoch 5: 100%|█████████████████████████████████████████████████████████████████████| 500/500 [1:11:24<00:00,  8.57s/it]
Epoch 6: 100%|█████████████████████████████████████████████████████████████████████| 500/500 [1:11:17<00:00,  8.

('fine_tuned_qa_model\\tokenizer_config.json',
 'fine_tuned_qa_model\\special_tokens_map.json',
 'fine_tuned_qa_model\\vocab.txt',
 'fine_tuned_qa_model\\added_tokens.json')

Evaluation and Hyperparameter
===

In [7]:
import torch
from transformers import BertTokenizer, BertForQuestionAnswering
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

# Load the fine-tuned model and tokenizer
model_path = "fine_tuned_qa_model"
model = BertForQuestionAnswering.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)

# Function to evaluate the model on the validation set
def evaluate_model(model, dataloader):
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            start_positions = batch["start_positions"].to(device)
            end_positions = batch["end_positions"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
            loss = outputs.loss
            total_loss += loss.item()

    average_loss = total_loss / len(dataloader)
    return average_loss

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load the validation dataset
val_dataset = QADataset(tokenized_qa_pairs[train_size:])
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

# Evaluate the model on the validation set
validation_loss = evaluate_model(model, val_dataloader)

print("Validation Loss:", validation_loss)


Evaluating: 100%|████████████████████████████████████████████████████████████████████| 125/125 [12:40<00:00,  6.09s/it]

Validation Loss: 5.509262018203735





In [1]:
import torch
from transformers import BertForQuestionAnswering, BertTokenizer


# Load the fine-tuned BERT model and tokenizer
model = BertForQuestionAnswering.from_pretrained("fine_tuned_qa_model")
tokenizer = BertTokenizer.from_pretrained("fine_tuned_qa_model")

# Function to get an answer from the model
def get_answer(context, question):
    inputs = tokenizer.encode_plus(question, context, add_special_tokens=True, return_tensors="pt", max_length=512, truncation=True)
    input_ids = inputs["input_ids"].to(model.device)
    attention_mask = inputs["attention_mask"].to(model.device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    start_probs = torch.softmax(outputs.start_logits, dim=1)
    end_probs = torch.softmax(outputs.end_logits, dim=1)

    start_index = torch.argmax(start_probs)
    end_index = torch.argmax(end_probs)

    # Handle cases where the end index is before the start index
    if end_index < start_index:
        start_index, end_index = end_index, start_index

    answer_tokens = input_ids[0, start_index:end_index+1]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)

    return answer

# Example usage
context = "Mount Everest is the highest peak in the world, with an elevation of 8,848 meters (29,029 feet). It is located in the Himalayas."
question = "What is the elevation of Mount Everest?"
answer = get_answer(context, question)
print("Answer:", answer)


  from .autonotebook import tqdm as notebook_tqdm


Answer: what is the elevation of mount everest? mount everest is the highest peak in the world, with an elevation of 8, 848 meters ( 29, 029
