# **Load SQuAD dataset**

In [None]:
 #! pip install transformers datasets evaluate
from datasets import load_dataset

# Load only the first 15,000 examples of the SQuAD training split to keep the run short.
squad = load_dataset("squad", split="train[:15000]")

In [None]:
# Split the dataset’s train split into a train and test set with the train_test_split method
# (20% held out for testing; the fixed seed keeps the split reproducible).
# NOTE(review): `squad` is rebound from the loaded dataset to the split result, so
# re-running this cell without re-running the load cell presumably fails - confirm.

squad = squad.train_test_split(test_size=0.2, seed=2)

# Display one training example to check the record layout.
squad["train"][2]

{'id': '56d5fbbd1c85041400946e95',
 'title': 'Dog',
 'context': 'Most breeds of dog are at most a few hundred years old, having been artificially selected for particular morphologies and behaviors by people for specific functional roles. Through this selective breeding, the dog has developed into hundreds of varied breeds, and shows more behavioral and morphological variation than any other land mammal. For example, height measured to the withers ranges from 15.2 centimetres (6.0 in) in the Chihuahua to about 76 cm (30 in) in the Irish Wolfhound; color varies from white through grays (usually called "blue") to black, and browns from light (tan) to dark ("red" or "chocolate") in a wide variation of patterns; coats can be short or long, coarse-haired to wool-like, straight, curly, or smooth. It is common for most breeds to shed this coat.',
 'question': 'People selected dogs they wanted based on what two things?',
 'answers': {'text': ['particular morphologies and behaviors'],
  'answer_

# **load a GPT-2 tokenizer**

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
import torch

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# The QA head (qa_outputs) is newly initialised here and is trained below.
model = AutoModelForQuestionAnswering.from_pretrained("gpt2")

# GPT-2 ships without a padding token, and "[PAD]" is not a token in its vocabulary.
# Assigning that string therefore either aliases an unrelated id or registers a new
# token the model has no embedding row for (which one depends on the transformers
# version). Reusing the end-of-text token keeps every padded id inside the existing
# embedding matrix; padded positions are still masked out by the attention mask.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id


Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForQuestionAnswering were not initialized from the model checkpoint at gpt2 and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# **A few preprocessing steps for question answering**

1. Some examples in a dataset may have a very long context that exceeds the maximum input length of the model. To deal with longer sequences, truncate only the context by setting truncation="only_second".
2. Next, map the start and end positions of the answer to the original context by setting return_offsets_mapping=True.
3. With the mapping in hand, now we can find the start and end tokens of the answer. Use the sequence_ids method to find which part of the offset corresponds to the question and which corresponds to the context.


In [None]:
def preprocess_function(examples, max_length=256):
    """Tokenize a batch of SQuAD examples and label each answer's token span.

    Uses the notebook-level ``tokenizer``. Each question/context pair is encoded
    together, only the context is truncated, and every sequence is padded to
    ``max_length``.

    Args:
        examples: Batch dict with "question", "context" and "answers" columns.
            Each answers entry holds parallel "text" and "answer_start" lists;
            only the first answer is used.
        max_length: Number of tokens every encoded pair is truncated/padded to.

    Returns:
        The tokenizer output with "offset_mapping" removed and two new keys,
        "start_positions" and "end_positions" (token indices of the answer).
        Examples whose answer is missing or not completely inside the
        (possibly truncated) context are labelled (0, 0).
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        sequence_ids = inputs.sequence_ids(i)

        # Locate the first and last token that belong to the context (sequence id 1).
        idx = 0
        while idx < len(sequence_ids) and sequence_ids[idx] != 1:
            idx += 1
        context_start = idx

        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # No answer text, or no context tokens at all: nothing to locate.
        if not answer["text"] or context_end < context_start:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # If the answer is not fully inside the context, label it (0, 0).
        # The window must start at or before the answer AND end at or after it;
        # a partly truncated answer would otherwise get a span that runs past
        # the last context token.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Last context token that starts at or before the answer's first character.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # First context token that ends at or after the answer's last character.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

To apply the preprocessing function over the entire dataset, we use the Datasets library's map function. We can speed up map by setting batched=True, which processes multiple elements of the dataset at once.

In [None]:
# Tokenize both splits in batches and drop the original text columns so that only
# the fields returned by preprocess_function remain.
tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

# **Setup HuggingFace token**

In [None]:
import os

# HF_HOME names the Hugging Face home *directory*; the access token is kept in a
# file called "token" inside it, so "token" must not be appended to this path.
# NOTE: huggingface_hub resolves its paths from this variable at import time, so
# it only takes effect if set before the Hugging Face libraries are first imported.
os.environ["HF_HOME"] = "/root/.huggingface"

### **Create a batch of examples using DefaultDataCollator.**

In [None]:
from transformers import DefaultDataCollator

# Batches examples as they are, with no extra padding step; preprocessing already
# pads every sequence with padding="max_length".
data_collator = DefaultDataCollator()

# **Model Training**

1. Define training hyperparameters in TrainingArguments. The only required parameter is output_dir, which specifies where to save the model. This notebook does not set push_to_hub=True; instead, the fine-tuned model is uploaded to the Hub manually in a later section (you need to be signed in to Hugging Face to upload it).
2. Pass the training arguments to Trainer along with the model, dataset, tokenizer, and data collator.
3. Call train() to finetune model.


In [None]:
# !pip install accelerate -U

# Fine-tuning configuration: where checkpoints go, the optimisation settings,
# and how often to evaluate and log.
training_args = TrainingArguments(
    output_dir="/content",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    logging_steps=20,
)

# Bundle the model, the tokenized splits and the collator into one Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
)

# Run fine-tuning; the returned TrainOutput is shown as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss
1,2.217,1.821885
2,1.6758,1.643463
3,1.5157,1.633455


TrainOutput(global_step=2250, training_loss=2.063271236843533, metrics={'train_runtime': 2043.4099, 'train_samples_per_second': 17.618, 'train_steps_per_second': 1.101, 'total_flos': 4703341621248000.0, 'train_loss': 2.063271236843533, 'epoch': 3.0})

# **Evaluate Model**

In [None]:
# Collator used to batch the already-padded evaluation examples.
data_collator = DefaultDataCollator()

# Arguments for a Trainer that is only used for scoring: an output directory
# (required) and the evaluation batch size.
evaluation_args = TrainingArguments(
    output_dir="./evaluation_results",
    per_device_eval_batch_size=16,
)

# A separate Trainer wrapping the fine-tuned model.
eval_trainer = Trainer(
    args=evaluation_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Compute the loss and throughput figures on the held-out split, then show them.
eval_results = eval_trainer.evaluate(tokenized_squad["test"])
print(eval_results)

{'eval_loss': 1.6334553956985474, 'eval_runtime': 49.4154, 'eval_samples_per_second': 60.71, 'eval_steps_per_second': 3.804}


In [None]:
# Inspect one held-out example; the same record is used for the single prediction below.
squad['test'][3]

{'id': '56ddde4d66d3e219004dad54',
 'title': 'Dutch_Republic',
 'context': 'The Republic of the United Provinces lasted until a series of republican revolutions in 1783–1795 created the Batavian Republic. During this period, republican forces took several major cities of the Netherlands. After initially fleeing, the monarchist forces came back with British, Austrian, and Prussian troops and retook the Netherlands. The republican forces fled to France, but then successfully re-invaded alongside the army of the French republic. After the French Republic became the French Empire under Napoleon, the Batavian Republic was replaced by the Napoleonic Kingdom of Holland.',
 'question': 'The republican forces fled to which country?',
 'answers': {'text': ['France'], 'answer_start': [372]}}

# **Predict Answer**

In [None]:
# Pick one held-out example and pull out its context and question.
example = squad['test'][3]
context, question = example['context'], example['question']

# Encode the pair as PyTorch tensors and place them on the model's device.
inputs = tokenizer(question, context, return_tensors="pt")
inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

# Forward pass without gradient tracking.
with torch.no_grad():
    outputs = model(**inputs)
start_logits, end_logits = outputs.start_logits, outputs.end_logits

# Highest-scoring start and end token positions, as plain Python ints.
start_idx = torch.argmax(start_logits).item()
end_idx = torch.argmax(end_logits).item()

# Turn the selected token span back into text.
context_tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
answer_tokens = context_tokens[start_idx:end_idx + 1]
answer = tokenizer.convert_tokens_to_string(answer_tokens)

print(answer)


 France


# **Calculate F1 and Exact Match Scores**

In [None]:
# The held-out split, with its original text columns, is what gets scored.
eval_dataset = squad["test"]

# Run on the GPU when one is present, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Running totals accumulated over the evaluation loop.
total_em = total_f1 = total_examples = 0


def calculate_em_f1(predicted, ground_truth):
    """Score one predicted answer against one reference, SQuAD-style.

    Both strings are normalised the way the official SQuAD evaluation does it
    (lower-cased, punctuation removed, the articles a/an/the removed, whitespace
    collapsed) before they are compared. F1 is computed on token multisets, so a
    token repeated in the prediction is only credited as often as it appears in
    the reference.

    Args:
        predicted: Answer text produced by the model.
        ground_truth: Reference answer text.

    Returns:
        A tuple ``(exact_match, f1_score)``: ``exact_match`` is 1 if the
        normalised strings are equal, otherwise 0; ``f1_score`` is a float in
        [0, 1].
    """
    import re
    import string
    from collections import Counter

    punctuation = set(string.punctuation)

    def _normalize(text):
        # Same order as the official script: lower, strip punctuation, drop articles.
        text = "".join(ch for ch in text.lower() if ch not in punctuation)
        text = re.sub(r"\b(a|an|the)\b", " ", text)
        return " ".join(text.split())

    predicted_norm = _normalize(predicted)
    ground_truth_norm = _normalize(ground_truth)
    exact_match = int(predicted_norm == ground_truth_norm)

    predicted_tokens = predicted_norm.split()
    ground_truth_tokens = ground_truth_norm.split()

    # With an empty side, F1 is 1 only when both sides are empty.
    if not predicted_tokens or not ground_truth_tokens:
        return exact_match, float(predicted_tokens == ground_truth_tokens)

    # Multiset intersection counts each shared token at most min(count) times.
    overlap = sum((Counter(predicted_tokens) & Counter(ground_truth_tokens)).values())
    if overlap == 0:
        return exact_match, 0.0

    precision = overlap / len(predicted_tokens)
    recall = overlap / len(ground_truth_tokens)
    f1_score = (2 * precision * recall) / (precision + recall)

    return exact_match, f1_score

# Score every example of the evaluation split with the fine-tuned model.
model.eval()  # make sure dropout is disabled while predicting

for example in eval_dataset:
    # Get the context and question from the example
    context = example["context"]
    question = example["question"]

    # Reference answers; fall back to a single empty answer when the record has
    # no "answers" field or an empty "text" list, so the indexing below cannot fail.
    answers = example.get("answers") or {}
    reference_texts = answers.get("text") or [""]

    # Encode the pair on the model's device. Truncating only the context keeps
    # over-long inputs within the tokenizer's maximum length.
    inputs = tokenizer(
        question,
        context,
        truncation="only_second",
        return_tensors="pt",
    ).to(device)

    # Generate predictions
    with torch.no_grad():
        outputs = model(**inputs)
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits

    # Highest-scoring start and end token positions, as Python integers
    start_idx = torch.argmax(start_logits).item()
    end_idx = torch.argmax(end_logits).item()

    # Map the selected token span back to text (an empty string if end < start)
    context_tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    predicted_answer_tokens = context_tokens[start_idx:end_idx + 1]
    predicted_answer = tokenizer.convert_tokens_to_string(predicted_answer_tokens)

    # Only the first reference answer is scored
    ground_truth_answer = reference_texts[0]

    em, f1 = calculate_em_f1(predicted_answer, ground_truth_answer)

    total_em += em
    total_f1 += f1
    total_examples += 1

# Calculate average EM and F1 scores
average_em = total_em / total_examples if total_examples > 0 else 0
average_f1 = total_f1 / total_examples if total_examples > 0 else 0

print("Average Exact Match (EM) Score:", average_em)
print("Average F1 Score:", average_f1)

Average Exact Match (EM) Score: 0.43333333333333335
Average F1 Score: 0.5790257082624083


# **Save Model and Tokenizer**

In [None]:
# Write the fine-tuned model to the "QA_finetuned_model" directory (relative to the working directory).
trainer.save_model("QA_finetuned_model")

In [None]:
# Write the tokenizer files to the "QA_tokenizer" directory; the saved file paths are returned.
tokenizer.save_pretrained("QA_tokenizer")

('QA_tokenizer/tokenizer_config.json',
 'QA_tokenizer/special_tokens_map.json',
 'QA_tokenizer/vocab.json',
 'QA_tokenizer/merges.txt',
 'QA_tokenizer/added_tokens.json',
 'QA_tokenizer/tokenizer.json')

# **Share model in Hugging Face Hub**

In [None]:
from huggingface_hub import notebook_login

# Show the interactive login widget; the pushes to the Hub below need an authenticated session.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import os

# NOTE(review): this sets HUGGINGFACE_TOKEN to an empty string (presumably a redacted
# token). huggingface_hub documents HF_TOKEN as the variable it reads - confirm whether
# this cell has any effect; notebook_login() has already been called above.
# Never commit a real token in this cell.
os.environ["HUGGINGFACE_TOKEN"] = ""

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Reload the model from the directory that trainer.save_model("QA_finetuned_model")
# wrote to. The relative path keeps this cell working when the working directory
# is not /content.
model = AutoModelForQuestionAnswering.from_pretrained("./QA_finetuned_model")

# Upload the weights and config to the Hub repository.
model.push_to_hub("tanzeelabbas/GPT-2_fine-tuned_squad_2.0_QA")

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/tanzeelabbas/GPT-2_fine-tuned_squad_2.0_QA/commit/b94d83bf6247cace8e722db43f5f25d3dde01859', commit_message='Upload GPT2ForQuestionAnswering', commit_description='', oid='b94d83bf6247cace8e722db43f5f25d3dde01859', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Reload the tokenizer from the directory that tokenizer.save_pretrained("QA_tokenizer")
# wrote to. The relative path keeps this cell working when the working directory
# is not /content.
tokenizer = AutoTokenizer.from_pretrained("./QA_tokenizer")

# Upload the tokenizer files to the same Hub repository as the model.
tokenizer.push_to_hub("tanzeelabbas/GPT-2_fine-tuned_squad_2.0_QA")

CommitInfo(commit_url='https://huggingface.co/tanzeelabbas/GPT-2_fine-tuned_squad_2.0_QA/commit/c2fa45c290093fbb8ab2064ce5b583a34ccc50ec', commit_message='Upload tokenizer', commit_description='', oid='c2fa45c290093fbb8ab2064ce5b583a34ccc50ec', pr_url=None, pr_revision=None, pr_num=None)

# **Use model**

In [None]:
from transformers import pipeline

# Build a question-answering pipeline from the repository pushed above.
QA= pipeline('question-answering', "tanzeelabbas/GPT-2_fine-tuned_squad_2.0_QA")

# Ask the question from the held-out example inspected earlier. The context below is
# typed with line breaks and indentation, so the returned character offsets refer to
# this string rather than to the dataset's original context.
QA  (question = "The republican forces fled to which country?",
      context = """ The Republic of the United Provinces lasted until a series of republican revolutions
      in 1783–1795 created the Batavian Republic. During this period, republican forces took several
      major cities of the Netherlands. After initially fleeing, the monarchist forces came back with
      British, Austrian, and Prussian troops and retook the Netherlands. The republican forces fled to
      France, but then successfully re-invaded alongside the army of the French republic.
      After the French Republic became the French Empire under Napoleon, the Batavian Republic
      was replaced by the Napoleonic Kingdom of Holland."""
)

{'score': 0.9268386363983154, 'start': 400, 'end': 407, 'answer': ' France'}