# Question Answering with GPT-2 on Quora Dataset


This notebook demonstrates how to fine-tune and evaluate a GPT-2 model on the Quora Question Answer dataset.
We will preprocess the data, fine-tune the model, and evaluate its performance using accuracy and F1-score.


## Import Libraries

In [None]:

import torch
import numpy as np
from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from transformers import default_data_collator
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import os


## Device Setup

In [None]:

# Choose the compute device: a Colab TPU takes priority, then a CUDA GPU, then CPU.
if 'COLAB_TPU_ADDR' in os.environ:
    print('Using TPU')
    # torch_xla is only installed on TPU runtimes, so it is imported lazily here.
    import torch_xla.core.xla_model as xm
    device = xm.xla_device()
else:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print('Using GPU' if device.type == "cuda" else 'Using CPU')


## Load Dataset

In [None]:

# Fetch the Quora question/answer dataset from the Hugging Face Hub
dataset = load_dataset(path="toughdata/quora-question-answer-dataset")


## Load Tokenizer and Model

In [None]:

# Load the tokenizer and model
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# GPT-2 ships without a padding token, so any call that pads
# (e.g. padding="max_length") fails unless one is assigned. Reuse EOS.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
# Keep the model config in sync with the tokenizer's padding token.
model.config.pad_token_id = tokenizer.pad_token_id


## Preprocess the Dataset

In [None]:

# Prepare the dataset
def preprocess_function(examples, max_length=512):
    """Tokenize a batch of question/answer pairs for causal-LM fine-tuning.

    Each example is rendered as ``"question: <q> answer: <a><eos>"`` and
    tokenized to exactly ``max_length`` tokens (truncated or padded).

    Args:
        examples: Batch dict with parallel ``"question"`` and ``"answer"`` lists.
        max_length: Fixed sequence length for both inputs and labels.

    Returns:
        The tokenizer output with an added ``"labels"`` entry. Labels are a
        copy of ``input_ids`` in which padding positions are replaced by
        ``-100`` so they are ignored by the loss.
    """
    # GPT-2 has no padding token by default; padding would raise without one.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # `or ""` guards against missing (None) questions/answers in the raw data.
    texts = [
        f"question: {(q or '').strip()} answer: {(a or '').strip()}{tokenizer.eos_token}"
        for q, a in zip(examples["question"], examples["answer"])
    ]

    model_inputs = tokenizer(texts, max_length=max_length, truncation=True, padding="max_length")

    # A causal LM needs labels with the same length as input_ids. The
    # attention mask (not the token id) identifies padding, because the pad
    # token may be the same id as the real EOS token.
    model_inputs["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(model_inputs["input_ids"], model_inputs["attention_mask"])
    ]

    return model_inputs

# Tokenize the whole training split in batches, dropping the raw text columns
print("Processing dataset...")
processed_dataset = dataset['train'].map(
    preprocess_function,
    batched=True,
    remove_columns=dataset['train'].column_names,
)


## Split the Dataset

In [None]:

# Split the dataset into train/eval partitions (80/20, fixed seed).
# The datasets-native splitter returns Dataset objects directly, so the
# tokenized data no longer has to be round-tripped through sklearn and
# rebuilt with Dataset.from_dict.
print("Splitting dataset...")
split_datasets = processed_dataset.train_test_split(test_size=0.2, seed=42)

train_dataset = split_datasets["train"]
eval_dataset = split_datasets["test"]


## Select Subset for Training (Optional)

In [None]:

# Use a smaller subset for faster training.
# The sizes are clamped to the split length so that select() does not fail
# when a split holds fewer rows than the requested subset.
print("Selecting subset for training...")
train_dataset = train_dataset.shuffle(seed=42).select(range(min(5000, len(train_dataset))))
eval_dataset = eval_dataset.shuffle(seed=42).select(range(min(500, len(eval_dataset))))


## Define Metric Computation Function

In [None]:

# Define metric computation function
def compute_metrics(eval_pred):
    """Compute token-level accuracy and weighted F1 for next-token prediction.

    Args:
        eval_pred: ``(predictions, labels)`` pair. ``predictions`` may be raw
            logits with a trailing vocabulary axis or already-argmaxed token
            ids; ``labels`` are token ids, with ``-100`` marking positions to
            ignore.

    Returns:
        Dict with ``'accuracy'`` and ``'f1'`` computed over the non-ignored
        label positions only.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]

    predictions = np.asarray(logits)
    labels = np.asarray(labels)

    # Raw logits carry one extra (vocabulary) axis; reduce them to token ids.
    if predictions.ndim == labels.ndim + 1:
        predictions = predictions.argmax(axis=-1)

    # In a causal LM the output at position t predicts the token at t + 1,
    # so align predictions[:, :-1] with labels[:, 1:].
    predictions = predictions[:, :-1]
    labels = labels[:, 1:]

    # Score only real targets; -100 marks ignored (e.g. padding) positions.
    keep = labels != -100
    true_tokens = labels[keep]
    pred_tokens = predictions[keep]

    if true_tokens.size == 0:
        return {'accuracy': 0.0, 'f1': 0.0}

    accuracy = accuracy_score(true_tokens, pred_tokens)
    f1 = f1_score(true_tokens, pred_tokens, average='weighted', zero_division=0)

    return {
        'accuracy': accuracy,
        'f1': f1
    }


## Set Up the Trainer

In [None]:

# Set up the trainer
def preprocess_logits_for_metrics(logits, labels):
    """Reduce logits to predicted token ids before they are accumulated.

    Without this, evaluation gathers the full (batch, seq, vocab) logits for
    every eval example, which is very memory-hungry for a GPT-2 vocabulary.
    """
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)


training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # The logged run has only 468 optimizer steps in total, so a 500-step
    # warmup never finished; ~10% of the schedule lets the LR reach its peak.
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    # `evaluation_strategy` is deprecated in favour of `eval_strategy`.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    gradient_accumulation_steps=4,
    tpu_num_cores=8 if 'COLAB_TPU_ADDR' in os.environ else None,
    # TF32 is an NVIDIA Ampere+ GPU feature; requesting it on a TPU runtime
    # (as the previous TPU-conditional value did) makes TrainingArguments raise.
    tf32=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)


## Train the Model

In [None]:

# Run fine-tuning; the Trainer evaluates and checkpoints once per epoch
print("Starting training...")
trainer.train()


## Evaluate the Model

In [None]:

# Run a final evaluation pass on the held-out subset and report the metrics
print("Evaluating model...")
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)


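**Reported results** (not reproduced by the cells in this notebook — TODO: confirm provenance):

- Accuracy: 85.12%
- ROUGE-1: 0.90
- BLEU: 0.82
- F1-score: 0.85

Note: `compute_metrics` in this notebook returns only accuracy and F1; ROUGE and BLEU are not computed anywhere here. The saved cell outputs further down come from a `BertForQuestionAnswering` run, so re-run the notebook to regenerate results for GPT-2.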


## Save the Model

In [None]:

# Persist the fine-tuned weights and the tokenizer files to local directories
model.save_pretrained("./quora_qa_gpt_model")
tokenizer.save_pretrained("./quora_qa_gpt_tokenizer")
print("Model and tokenizer saved.")


Using GPU

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: [qa_outputs.bias,qa_outputs.weight]

You should probably train this model on a downstream task to be able to use it for predictions and inference.

Processing dataset...

Splitting dataset...

Selecting subset for training...

Starting training...

/usr/local/lib/python3.10/dist-packages/transformers/training_args.py:1494: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
  warnings.warn(
 [468/468 22:15, Epoch 2/3]
<table>
  <tr>
    <th>Epoch</th>
    <th>Training Loss</th>
    <th>Validation Loss</th>
    <th>Start Accuracy</th>
    <th>End Accuracy</th>
    <th>Start F1</th>
    <th>End F1</th>
  </tr>
  <tr>
    <td>0</td>
    <td>0.261700</td>
    <td>0.342313</td>
    <td>0.978000</td>
    <td>0.936000</td>
    <td>0.967122</td>
    <td>0.905058</td>
  </tr>
  <tr>
    <td>1</td>
    <td>0.300300</td>
    <td>0.327451</td>
    <td>0.978000</td>
    <td>0.936000</td>
    <td>0.967122</td>
    <td>0.905058</td>
  </tr>
  <tr>
    <td>2</td>
    <td>0.244400</td>
    <td>0.372996</td>
    <td>0.978000</td>
    <td>0.936000</td>
    <td>0.967122</td>
    <td>0.905058</td>
  </tr>
</table>
Evaluating model...
 [32/32 00:10]

Evaluation results: {'eval_loss': 0.32745078206062317, 'eval_start_accuracy': 0.978, 'eval_end_accuracy': 0.936, 'eval_start_f1': 0.9671223458038423, 'eval_end_f1': 0.9050578512396695, 'eval_runtime': 10.8843, 'eval_samples_per_second': 45.938, 'eval_steps_per_second': 2.94, 'epoch': 2.9952}

Model and tokenizer saved.

