In [21]:
import pandas as pd
import json
from pyserini.search.lucene import LuceneSearcher
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer

# Load the NQ-Open dataset from Hugging Face
splits = {"validation": "nq_open/validation-00000-of-00001.parquet"}  # Define the validation split file
df_validation = pd.read_parquet("hf://datasets/google-research-datasets/nq_open/" + splits["validation"])

# Extract questions and answers
questions = df_validation["question"].tolist()
answers = df_validation["answer"].tolist()

# Load BM25 searcher
searcher = LuceneSearcher.from_prebuilt_index('wikipedia-dpr-100w')

# Load the extractive QA model
qa_model_name = "timpal0l/mdeberta-v3-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
model = AutoModelForQuestionAnswering.from_pretrained(qa_model_name)
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

def clean_answer(answer):
    """Ensure the extracted answer is concise and properly formatted."""
    answer = answer.strip().split(".")[0]  # Remove long explanations
    answer = answer.replace(",", "").strip()  # Remove commas
    answer = " ".join(answer.split()[:5])  # Limit to 5 words max
    return answer

def answer_question_bm25(question, top_k=20, min_score=0.0):
    hits = searcher.search(question, k=top_k)
    if not hits:
        return "No answer found"
    
    # Extract and limit document context
    best_doc = json.loads(searcher.doc(hits[0].docid).raw())["contents"]
    best_doc = " ".join(best_doc.split()[:150])  # Keep only first 150 words

    qa_input = {"question": question, "context": best_doc}
    result = qa_pipeline(qa_input)
    
    # Apply confidence filtering
    if result["score"] < min_score:
        return "No answer found"

    return clean_answer(result["answer"])


def exact_match(pred, true):
    return int(any(pred.strip().lower() == t.strip().lower() for t in true))

# Evaluation function for BM25
def evaluate_bm25(n_samples=50):
    results = []
    correct = 0
    for i in range(n_samples):
        pred = answer_question_bm25(questions[i])
        is_correct = exact_match(pred, answers[i])
        correct += is_correct
        results.append({
            "question": questions[i],
            "true_answers": ", ".join(answers[i]),
            "predicted": pred,
            "correct": is_correct
        })
    accuracy = correct / n_samples
    df_results = pd.DataFrame(results)
    df_results.to_csv("outputs/bm25_predictions.csv", index=False)
    return accuracy

# Run BM25 evaluation
accuracy_bm25 = evaluate_bm25(3600)
print(f"Exact Match Accuracy (BM25): {accuracy_bm25:.2f}")


Device set to use cpu


Exact Match Accuracy (BM25): 0.08


In [13]:
import torch
import pandas as pd
import json
import requests
from langchain_ollama import ChatOllama
from langchain_core.messages import AIMessage
from pyserini.search.lucene import LuceneSearcher
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer

# Load the NQ-Open dataset from Hugging Face
splits = {"validation": "nq_open/validation-00000-of-00001.parquet"}  # Define the validation split file
df_validation = pd.read_parquet("hf://datasets/google-research-datasets/nq_open/" + splits["validation"])

# Extract questions and answers
questions = df_validation["question"].tolist()
answers = df_validation["answer"].tolist()

# Load BM25 searcher
searcher = LuceneSearcher.from_prebuilt_index('wikipedia-dpr-100w')

# Load the extractive QA model
qa_model_name = "timpal0l/mdeberta-v3-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
model = AutoModelForQuestionAnswering.from_pretrained(qa_model_name)
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

def clean_answer(answer):
    """Clean the extracted answer by removing unnecessary punctuation and whitespace."""
    return answer.strip().split(".")[0] + ","

def answer_question_bm25(question, top_k=5):
    hits = searcher.search(question, k=top_k)
    if not hits:
        return "No answer found"
    
    # Extract best document
    best_doc = json.loads(searcher.doc(hits[0].docid).raw())["contents"]
    qa_input = {"question": question, "context": best_doc}
    result = qa_pipeline(qa_input)
    return clean_answer(result["answer"])

def exact_match(pred, true):
    return int(any(pred.strip().lower() == t.strip().lower() for t in true))

# Evaluation function for BM25
def evaluate_bm25(n_samples=50):
    results = []
    correct = 0
    for i in range(n_samples):
        pred = answer_question_bm25(questions[i])
        is_correct = exact_match(pred, answers[i])
        correct += is_correct
        results.append({
            "question": questions[i],
            "true_answers": ", ".join(answers[i]),
            "predicted": pred,
            "correct": is_correct
        })
    accuracy = correct / n_samples
    df_results = pd.DataFrame(results)
    df_results.to_csv("bm25_predictions.csv", index=False)
    return accuracy

# Run BM25 evaluation
accuracy_bm25 = evaluate_bm25(10)
print(f"Exact Match Accuracy (BM25): {accuracy_bm25:.2f}")


Device set to use cpu


Exact Match Accuracy (BM25): 0.00
