<a href="https://colab.research.google.com/github/XanimGuliyeva/Question_Answering_Model/blob/main/Copia_di_1_Project_QA_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Presentation Plan: Extractive Question Answering Using BERT, BM25, and DPR

In [None]:
!pip install transformers datasets torch faiss-cpu rank_bm25 sentence-transformers

In [None]:
from datasets import load_dataset

# Load the "squad_v2" dataset; `dataset` is indexed by split name below
# ("train" and "validation" are the splits this notebook uses).
dataset = load_dataset("squad_v2")
# Print the loaded object for a quick look at what was downloaded.
print(dataset)


In [None]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, TrainingArguments, Trainer

# Checkpoint used for both the tokenizer and the question-answering model.
model_name = "bert-base-cased"
# Tokenizer and QA model are loaded from the same checkpoint so that the
# vocabulary used in preprocessing matches the one the model expects.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

def preprocess_function(examples):
    """Tokenize a batch of (question, context) pairs and attach answer-span labels.

    Intended for ``dataset.map(..., batched=True)``.

    Args:
        examples: Batch dict with parallel lists under ``"question"``,
            ``"context"`` and ``"answers"``. Each ``answers`` entry is a dict
            with list-valued ``"answer_start"`` (character offsets into the
            context) and ``"text"``; both lists are empty for unanswerable
            questions.

    Returns:
        The tokenizer's encoding extended with ``"start_positions"`` and
        ``"end_positions"`` (one token index per example). Index 0 (the first
        token of the encoded pair) marks "no answer" and is used for
        unanswerable questions and for answers that are not fully contained in
        the (possibly truncated) context.
    """
    # No `return_tensors`: `map` stores plain lists, and the collator builds
    # tensors at batch time. Only the context (second sequence) is truncated so
    # the question is always kept intact.
    inputs = tokenizer(
        examples["question"],
        examples["context"],
        padding="max_length",
        truncation="only_second",
        max_length=512,
    )

    start_positions, end_positions = [], []

    for i, answer in enumerate(examples["answers"]):
        # Default label: token 0, i.e. "no answer".
        start_token, end_token = 0, 0

        if answer["answer_start"] and answer["text"]:
            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])

            # Answer offsets are relative to the *context*, which is the second
            # sequence of the pair. Without `sequence_index=1` the lookup would
            # be resolved against the question and yield wrong labels.
            first = inputs.char_to_token(i, start_char, sequence_index=1)
            last = inputs.char_to_token(i, end_char - 1, sequence_index=1)

            # `None` means the character was truncated away. Keep the span only
            # when both ends survived; otherwise fall back to "no answer"
            # instead of emitting an inconsistent (start > end) span.
            if first is not None and last is not None:
                start_token, end_token = first, last

        start_positions.append(start_token)
        end_positions.append(end_token)

    inputs.update({"start_positions": start_positions, "end_positions": end_positions})
    return inputs

# Tokenize every split in batches. The raw columns of the training split are
# dropped so that only the fields returned by `preprocess_function` remain.
raw_columns = dataset["train"].column_names
encoded_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_columns,
)

In [None]:
# Subset sizes (number of examples per split used in this notebook).
TRAIN_SIZE = 5000
VAL_SIZE = 2000   # reduced validation size
TEST_SIZE = 800   # held-out test slice

# Training subset: the first TRAIN_SIZE encoded training examples.
small_train_dataset = encoded_dataset["train"].select(range(TRAIN_SIZE))

# Validation and test are two consecutive, non-overlapping slices of the
# encoded validation split: [0, VAL_SIZE) and [VAL_SIZE, VAL_SIZE + TEST_SIZE).
encoded_validation = encoded_dataset["validation"]
small_val_dataset = encoded_validation.select(range(VAL_SIZE))
test_dataset = encoded_validation.select(range(VAL_SIZE, VAL_SIZE + TEST_SIZE))

# Report the resulting sizes.
for split_label, subset in (
    ("Train", small_train_dataset),
    ("Validation", small_val_dataset),
    ("Test", test_dataset),
):
    print(f"{split_label} Samples: {len(subset)}")


In [None]:
# Column names of the raw (un-tokenized) training split.
print(dataset["train"].column_names)

In [None]:
# Column names of the training split after `preprocess_function` was mapped
# over it (the raw columns were removed by that `map` call).
print(encoded_dataset["train"].column_names)

In [None]:
from transformers import DataCollatorWithPadding

# Batch collator built around the notebook's tokenizer; it is handed to the
# Trainer below as `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from transformers.trainer_callback import ProgressCallback
from transformers import EarlyStoppingCallback

In [None]:
import inspect

import torch

# The installed transformers version is not pinned, and two keyword names used
# here were renamed across releases:
#   TrainingArguments: `evaluation_strategy` -> `eval_strategy`
#   Trainer:           `tokenizer`           -> `processing_class`
# Pick whichever name the installed version actually accepts.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",  # must match the evaluation strategy for load_best_model_at_end
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    save_total_limit=2,
    remove_unused_columns=False,
    logging_dir='./logs',
    logging_steps=50,
    logging_strategy="steps",
    logging_first_step=True,
    # Keep the checkpoint with the lowest evaluation loss.
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.001,
    max_grad_norm=1.0,
    # Mixed precision needs a CUDA device; fall back to fp32 on CPU-only
    # runtimes instead of failing at construction time.
    fp16=torch.cuda.is_available(),
    **{_eval_strategy_kwarg: "epoch"},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_val_dataset,
    data_collator=data_collator,
    # Stop as soon as one evaluation fails to improve the monitored metric.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
    **{_tokenizer_kwarg: tokenizer},
)


trainer.train()

In [None]:
import matplotlib.pyplot as plt

# Per-epoch losses, hard-coded in this cell.
# NOTE(review): presumably transcribed from a previous training run's log —
# confirm, or read them from the trainer's logged history instead.
epochs = [1, 2, 3]
train_loss = [0.662400, 0.571400, 0.361100]
val_loss = [0.448554, 0.350456, 0.394026]

# Draw both curves on one explicit Axes object.
fig, ax = plt.subplots()
for loss_values, curve_label in (
    (train_loss, "Training Loss"),
    (val_loss, "Validation Loss"),
):
    ax.plot(epochs, loss_values, label=curve_label, marker="o")

ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.legend()
ax.set_title("Training vs. Validation Loss Over Epochs")
plt.show()


In [None]:
from transformers import pipeline

In [None]:
# Persist the model held by the trainer, then build an inference pipeline that
# reloads it from that same directory.
FINE_TUNED_DIR = "./fine_tuned_bert"
trainer.save_model(FINE_TUNED_DIR)

qa_pipeline = pipeline(
    task="question-answering",
    model=FINE_TUNED_DIR,
    tokenizer=tokenizer,
)

# Smoke test on a hand-written example.
context = "The Eiffel Tower is in Paris, France. It was built in 1889."
question = "Where is the Eiffel Tower?"

# Run the pipeline and show the full result it returns.
result = qa_pipeline(question=question, context=context)
print(f"Predicted Answer: {result}")

In [None]:
# Evaluate on `test_dataset`, the slice of the encoded validation split that
# was not used as `eval_dataset` during training, and print what is returned.
test_results = trainer.evaluate(test_dataset)
print(test_results)

In [None]:
import torch
import faiss
import numpy as np
import nltk
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
from transformers import DPRQuestionEncoder, DPRContextEncoder, DPRQuestionEncoderTokenizer, DPRContextEncoderTokenizer

In [None]:
import nltk
# Fetch the NLTK resources requested by this notebook (used by `word_tokenize`).
for nltk_resource in ('punkt_tab', 'punkt'):
    nltk.download(nltk_resource)

# Toy corpus for the retrieval experiments.
documents = [
    "The Eiffel Tower is in Paris.",
    "The Statue of Liberty is in New York.",
    "Mount Everest is the highest mountain.",
]

# Query shared by all retrievers below.
query = "Where is the Eiffel Tower?"


In [None]:
### === 1. BM25 Retrieval === ###
print("\n🔹 Using BM25")

# Corpus and query go through the same normalisation: lower-case, then
# NLTK word tokenisation.
tokenized_docs = list(map(lambda text: word_tokenize(text.lower()), documents))
query_tokens = word_tokenize(query.lower())

# Build the index over the tokenised corpus and score every document.
bm25 = BM25Okapi(tokenized_docs)
bm25_scores = bm25.get_scores(query_tokens)

# The highest-scoring document is the retrieved one.
best_bm25_index = np.argmax(bm25_scores)
best_bm25_match = documents[best_bm25_index]

print(f"✅ BM25 Best Match: {best_bm25_match}")
print(f"BM25 Scores: {bm25_scores}")

In [None]:
# 🔥 Use BERT QA on BM25 retrieved document
# The document picked by BM25 is used as the context for the fine-tuned QA
# pipeline; only the "answer" field of the pipeline's result is printed.
bm25_result = qa_pipeline(question=query, context=best_bm25_match)
print(f"🤖 Extracted Answer (BM25): {bm25_result['answer']}")

In [None]:
import faiss
import numpy as np

# Sentence-Transformers port of the DPR context encoder.
# NOTE(review): the same (context) encoder embeds both the documents and the
# query here — confirm this is intended rather than pairing it with a
# question encoder.
dpr_model = SentenceTransformer("facebook-dpr-ctx_encoder-single-nq-base")

# Embed the corpus and the (single-element) query batch as NumPy arrays.
doc_embeddings = dpr_model.encode(documents, convert_to_numpy=True)
query_embedding = dpr_model.encode([query], convert_to_numpy=True)

# L2-normalise both matrices in place before they are indexed and searched.
for embedding_matrix in (doc_embeddings, query_embedding):
    faiss.normalize_L2(embedding_matrix)

# Flat inner-product index (instead of L2 distance) over the document vectors.
embedding_dim = doc_embeddings.shape[1]
index = faiss.IndexFlatIP(embedding_dim)
index.add(doc_embeddings)

# Ask for as many hits as there are documents, i.e. score the whole corpus.
D, I = index.search(query_embedding, k=len(documents))

# Scores for the one query in the batch, used directly as DPR scores.
dpr_similarity_scores = D[0]

print(f"Sentence Transformers DPR Similarity Scores: {dpr_similarity_scores}")


In [None]:
# 🔥 Use BERT QA on DPR retrieved document
# `I` holds the document ids returned by the FAISS search for the query, best
# hit first, so I[0][0] is the id of the top-ranked document. Resolve it here so
# this cell does not rely on a `best_st_match` that no earlier cell assigns.
best_st_match = documents[int(I[0][0])]

st_dpr_result = qa_pipeline(question=query, context=best_st_match)
print(f"🤖 Extracted Answer (Sentence Transformers DPR): {st_dpr_result['answer']}")

In [None]:
### === 3. Facebook DPR Implementation === ###
print("\n🔹 Using Facebook DPR")

# Load Facebook DPR models & tokenizers (separate encoders for questions and
# for contexts, each with its matching tokenizer).
question_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
context_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

# Tokenize the query, and all documents as ONE padded batch. Truncation keeps
# over-long texts within the encoders' maximum input length.
question_inputs = question_tokenizer(query, return_tensors="pt", truncation=True)
context_inputs = context_tokenizer(documents, return_tensors="pt", padding=True, truncation=True)

# Pure inference: disable autograd so no computation graph is built or kept.
with torch.no_grad():
    question_embedding = question_encoder(**question_inputs).pooler_output.cpu().numpy()
    context_embeddings = context_encoder(**context_inputs).pooler_output.cpu().numpy()

# L2-normalise in place so the dot product below is a cosine similarity.
faiss.normalize_L2(context_embeddings)
faiss.normalize_L2(question_embedding)

# Compute similarity and retrieve best match.
# (1, dim) @ (dim, n_docs) -> (1, n_docs); squeeze to one score per document.
fb_scores = np.dot(question_embedding, context_embeddings.T).squeeze(0)
best_fb_index = int(np.argmax(fb_scores))
best_fb_match = documents[best_fb_index]

print(f"✅ Facebook DPR Best Match: {best_fb_match}")
print(f"Facebook DPR Similarity Scores: {fb_scores}")

In [None]:
# 🔥 Use BERT QA on Facebook DPR retrieved document
# The document picked by the Facebook DPR encoders is used as the context for
# the fine-tuned QA pipeline; only the "answer" field is printed.
fb_dpr_result = qa_pipeline(question=query, context=best_fb_match)
print(f"🤖 Extracted Answer (Facebook DPR): {fb_dpr_result['answer']}")

In [None]:
# Un-normalised scores from the three retrievers, for side-by-side inspection.
print("Raw BM25 Scores:", bm25_scores)
# NOTE(review): `D` comes straight from the FAISS search, which presumably
# orders hits by rank rather than by position in `documents` — compare with
# the id array `I` before reading these column-by-column against the others.
print("Raw ST-DPR Scores:", D.flatten())
print("Raw Facebook DPR Scores:", fb_scores)


In [None]:
### === 4. Compare Models === ###
print("\n🔹 Comparing Models")


def _normalize_by_max(scores):
    """Scale scores so the largest becomes 1.0 (all zeros if the maximum is 0)."""
    scores = np.asarray(scores, dtype=float)
    peak = scores.max()
    return scores / peak if peak != 0 else np.zeros_like(scores)


def _top_margin(normalized):
    """Gap between the best and the second-best normalised score."""
    if normalized.size < 2:
        return float(normalized.max())
    runner_up, best = np.sort(normalized)[-2:]
    return float(best - runner_up)


# FAISS returns scores (`D`) and document ids (`I`) ordered by rank, not by
# position in `documents`. Scatter the scores back into document order so that
# index k always refers to documents[k], exactly like the other two score arrays.
st_scores = np.zeros(len(documents), dtype=float)
valid_hits = I[0] >= 0  # FAISS pads missing hits with id -1
st_scores[I[0][valid_hits]] = D[0][valid_hits]

# Index of the top-scoring document for each retriever.
best_bm25_index = int(np.argmax(bm25_scores))
best_st_index = int(np.argmax(st_scores))
best_fb_index = int(np.argmax(fb_scores))

# Normalize all retrieval scores across all documents (document order).
bm25_normalized = _normalize_by_max(bm25_scores)
st_normalized = _normalize_by_max(st_scores)
fb_normalized = _normalize_by_max(fb_scores)

# Combine scores into a dictionary: retriever -> (best document, its normalised score)
comparison = {
    "BM25": (documents[best_bm25_index], bm25_normalized[best_bm25_index]),
    "Sentence Transformers DPR": (documents[best_st_index], st_normalized[best_st_index]),
    "Facebook DPR": (documents[best_fb_index], fb_normalized[best_fb_index]),
}

# After max-normalisation every retriever's top score is 1.0, so the top score
# cannot rank the retrievers. Rank them by how clearly each one separates its
# best document from the runner-up instead.
margins = {
    "BM25": _top_margin(bm25_normalized),
    "Sentence Transformers DPR": _top_margin(st_normalized),
    "Facebook DPR": _top_margin(fb_normalized),
}
best_model = max(comparison, key=lambda name: margins[name])

# Display results. The loop variable is deliberately NOT called `model`: that
# name is the fine-tuned BERT model and must not be overwritten by a string.
for retriever_name, (match, score) in comparison.items():
    print(f"{retriever_name}: Match = '{match}', Score = {score:.4f}, Margin = {margins[retriever_name]:.4f}")

print(f"\n🏆 Best Model for Query: {best_model} (Match: {comparison[best_model][0]}, Score: {comparison[best_model][1]:.4f}, Margin: {margins[best_model]:.4f})")




In [None]:
### === 5. Visualization with Normalization === ###
import matplotlib.pyplot as plt
import seaborn as sns

# One column label per document: "Doc 1", "Doc 2", ...
doc_labels = ["Doc " + str(position) for position in range(1, len(documents) + 1)]
retriever_labels = ["BM25", "ST-DPR", "FB-DPR"]

# Rows = retrievers, columns = documents, values = normalised scores.
score_matrix = np.array([bm25_normalized, st_normalized, fb_normalized])

# Annotated heatmap drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(
    score_matrix,
    annot=True,
    fmt=".4f",
    cmap="coolwarm",
    xticklabels=doc_labels,
    yticklabels=retriever_labels,
    ax=ax,
)
ax.set_xlabel("Documents")
ax.set_ylabel("Retrieval Model")
ax.set_title("Retrieval Score Heatmap (With Normalization)")
plt.show()

In [None]:
!pip install --upgrade datasets