In [1]:
!pip install transformers rank-bm25 nltk

Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2


In [7]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.wh

In [8]:
!pip install transformers datasets evaluate rank-bm25 tqdm

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [3]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [29]:
from transformers import pipeline
from datasets import load_dataset
from rank_bm25 import BM25Okapi
import re
import evaluate
from tqdm import tqdm

def simple_tokenize(text):
    return re.findall(r'\b\w+\b', text.lower())

print("Loading SQuAD dataset...")
squad_dataset = load_dataset("squad", split="validation[:100]")

contexts = list(set(example["context"] for example in squad_dataset))
tokenized_contexts = [simple_tokenize(doc) for doc in contexts]
bm25 = BM25Okapi(tokenized_contexts)

print("Loading QA model...")
qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")

def answer_user_question(user_question, top_k=3, min_score=0.4):
    tokenized_question = simple_tokenize(user_question)
    scores = bm25.get_scores(tokenized_question)

    best_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]
    top_contexts = [contexts[i] for i in best_indices]

    best_answer = {"score": 0, "answer": "No answer found", "context": ""}

    for ctx in top_contexts:
        try:
            result = qa_pipeline({
                "question": user_question,
                "context": ctx
            })
            if result["score"] > best_answer["score"]:
                best_answer = {
                    "score": result["score"],
                    "answer": result["answer"],
                    "context": ctx
                }
        except:
            continue

    if best_answer["score"] < min_score:
        return {
            "score": best_answer["score"],
            "answer": "Sorry, I couldn't find a reliable answer in the current dataset.",
            "context": ""
        }

    return best_answer

def evaluate_model():
    print("Running Evaluation on SQuAD subset...")
    squad_metric = evaluate.load("squad")
    predictions = []
    references = []

    for example in tqdm(squad_dataset):
        question = example["question"]

        tokenized_question = simple_tokenize(question)
        scores = bm25.get_scores(tokenized_question)
        best_idx = scores.argmax()
        context = contexts[best_idx]

        try:
            result = qa_pipeline({
                "question": question,
                "context": context
            })
            predicted_answer = result["answer"]
        except:
            predicted_answer = ""

        predictions.append({"prediction_text": predicted_answer, "id": example["id"]})
        references.append({"answers": example["answers"], "id": example["id"]})

    metrics = squad_metric.compute(predictions=predictions, references=references)
    print(f"\nEvaluation Results on SQuAD (100 samples):")
    print(f"Exact Match: {metrics['exact_match']:.2f}")
    print(f"F1 Score:    {metrics['f1']:.2f}")

def main():
    print("\nRunning Built-in Questions...\n")

    sample_questions = [
        "Who was the first president of the United States?",
        "What is the capital of France?",
        "How many planets are in the solar system?",
        "Who discovered penicillin?",
        "What is the boiling point of water?"
    ]

    for question in sample_questions:
        print(f"Question: {question}")
        answer = answer_user_question(question)
        print(f"Answer: {answer['answer']}")
        print(f"From Context: {answer['context'][:200]}...")
        print(f"Confidence Score: {answer['score']:.4f}\n")

    print("Evaluating model on SQuAD subset...\n")
    evaluate_model()

if __name__ == "__main__":
    main()


Loading SQuAD dataset...
Loading QA model...


Device set to use cpu



Running Built-in Questions...

Question: Who was the first president of the United States?
Answer: Cam Newton
From Context: The Panthers finished the regular season with a 15–1 record, and quarterback Cam Newton was named the NFL Most Valuable Player (MVP). They defeated the Arizona Cardinals 49–15 in the NFC Championship ...
Confidence Score: 0.9284

Question: What is the capital of France?
Answer: Sorry, I couldn't find a reliable answer in the current dataset.
From Context: ...
Confidence Score: 0.2526

Question: How many planets are in the solar system?
Answer: Sorry, I couldn't find a reliable answer in the current dataset.
From Context: ...
Confidence Score: 0.1590

Question: Who discovered penicillin?
Answer: Sorry, I couldn't find a reliable answer in the current dataset.
From Context: ...
Confidence Score: 0.3835

Question: What is the boiling point of water?
Answer: Sorry, I couldn't find a reliable answer in the current dataset.
From Context: ...
Confidence Score: 0.1434

E

100%|██████████| 100/100 [00:28<00:00,  3.48it/s]


Evaluation Results on SQuAD (100 samples):
Exact Match: 54.00
F1 Score:    56.71





In [15]:
!pip install wikipedia transformers

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11678 sha256=784868f9d001cee7bb9e33672e5cb6437756304b49b695dead4295116a332d34
  Stored in directory: /root/.cache/pip/wheels/8f/ab/cb/45ccc40522d3a1c41e1d2ad53b8f33a62f394011ec38cd71c6
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [30]:
import wikipedia
from transformers import pipeline
import re
from datasets import load_dataset
from tqdm import tqdm
import string

# Set Wikipedia language
wikipedia.set_lang("en")

# Load QA model
print("Loading QA model...")
qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")

def clean_text(text):
    return re.sub(r'\s+', ' ', text)

def get_wikipedia_summary(query, sentences=5):
    try:
        search_results = wikipedia.search(query)
        if not search_results:
            return None
        page = wikipedia.page(search_results[0])
        content = page.content
        return clean_text(" ".join(content.split('.')[:sentences]) + '.')
    except Exception:
        return None

def answer_question(question, context, min_confidence=0.4):
    try:
        result = qa_pipeline({
            "question": question,
            "context": context
        })
        if result["score"] < min_confidence:
            result["answer"] = "Could not find a confident answer."
        return {
            "answer": result["answer"],
            "context": context[:300] + "...",
            "score": result["score"]
        }
    except Exception:
        return {
            "answer": "Error during QA processing.",
            "context": "",
            "score": 0
        }

# Custom metrics
def normalize_answer(s):
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)
    def white_space_fix(text):
        return ' '.join(text.split())
    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)
    def lower(text):
        return text.lower()
    return white_space_fix(remove_articles(remove_punc(lower(s))))

def compute_exact_match(prediction, ground_truth):
    return int(normalize_answer(prediction) == normalize_answer(ground_truth))

def compute_f1(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = set(prediction_tokens) & set(ground_truth_tokens)
    if len(prediction_tokens) == 0 or len(ground_truth_tokens) == 0:
        return int(len(prediction_tokens) == len(ground_truth_tokens))
    if len(common) == 0:
        return 0
    precision = len(common) / len(prediction_tokens)
    recall = len(common) / len(ground_truth_tokens)
    return 2 * precision * recall / (precision + recall)

def evaluate_on_squad(n=100):
    print("Running Evaluation on SQuAD subset...")
    dataset = load_dataset("squad", split=f"validation[:{n}]")
    exact_match_score = 0
    f1_score = 0

    for sample in tqdm(dataset):
        question = sample["question"]
        context = sample["context"]
        true_answers = sample["answers"]["text"]
        true_answer = true_answers[0] if true_answers else ""

        result = answer_question(question, context)
        predicted = result["answer"]

        exact_match_score += compute_exact_match(predicted, true_answer)
        f1_score += compute_f1(predicted, true_answer)

    exact_match_avg = exact_match_score / len(dataset) * 100
    f1_avg = f1_score / len(dataset) * 100

    print(f"\nEvaluation Results on SQuAD ({n} samples):")
    print(f"Exact Match: {exact_match_avg:.2f}")
    print(f"F1 Score:    {f1_avg:.2f}")

def main():
    print("\nWikipedia QA System (DistilBERT)\n")

    built_in_questions = [
        "Who discovered gravity?",
        "What is the theory of relativity?",
        "Who painted the Mona Lisa?",
        "What is quantum physics?",
        "What causes climate change?"
    ]

    for question in built_in_questions:
        print(f"Question: {question}")
        context = get_wikipedia_summary(question)
        if context:
            result = answer_question(question, context)
            print(f"Answer: {result['answer']}")
            print(f"From Context: {result['context']}")
            print(f"Confidence Score: {result['score']:.4f}\n")
        else:
            print("Answer: No relevant Wikipedia content found.\n")

    # Automatically run evaluation on SQuAD
    evaluate_on_squad(100)

if __name__ == "__main__":
    main()


Loading QA model...


Device set to use cpu



Wikipedia QA System (DistilBERT)

Question: Who discovered gravity?
Answer: No relevant Wikipedia content found.

Question: What is the theory of relativity?
Answer: Could not find a confident answer.
From Context: In physics, the special theory of relativity, or special relativity for short, is a scientific theory of the relationship between space and time In Albert Einstein's 1905 paper, "On the Electrodynamics of Moving Bodies", the theory is presented as being based on just two postulates: The laws of phys...
Confidence Score: 0.2698

Question: Who painted the Mona Lisa?
Answer: Leonardo da Vinci
From Context: The Isleworth Mona Lisa is an early 16th-century oil on canvas painting depicting the same subject as Leonardo da Vinci's Mona Lisa, though with the subject (Lisa del Giocondo) depicted as being a younger age The painting is thought to have been brought from Italy to England in the 1780s, and came i...
Confidence Score: 0.9488

Question: What is quantum physics?
Answer: Coul

100%|██████████| 100/100 [00:28<00:00,  3.51it/s]


Evaluation Results on SQuAD (100 samples):
Exact Match: 68.00
F1 Score:    73.32





In [37]:
import wikipedia
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
import re
import string
from datasets import load_dataset
from tqdm import tqdm
import time
import random

# Set Wikipedia language
wikipedia.set_lang("en")

# Load better QA model
print("Loading enhanced QA model...")
model_name = "deepset/roberta-base-squad2"  # Better model trained on SQuAD v2
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Create pipeline with specific parameters
qa_pipeline = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer,
    handle_impossible_answer=True
)

def clean_text(text):
    """Cleans excessive whitespaces in text."""
    return re.sub(r'\s+', ' ', text).strip()

def get_wikipedia_content(query, max_length=2000):
    """Search and fetch Wikipedia content for a query."""
    try:
        search_results = wikipedia.search(query)
        if not search_results:
            return None

        # Try the first result
        try:
            page = wikipedia.page(search_results[0], auto_suggest=False)
        except wikipedia.DisambiguationError as e:
            # If disambiguation page, take the first option
            if e.options:
                page = wikipedia.page(e.options[0], auto_suggest=False)
            else:
                return None

        # Get content
        content = page.content

        # Ensure we don't exceed the context window
        if len(content) > max_length:
            content = content[:max_length]

        return clean_text(content)
    except Exception as e:
        print(f"Wikipedia error: {e}")
        return None

def answer_question(question, context, min_confidence=0.4):
    """Answer a question given the context using the QA model with improved handling."""
    try:
        # Handle empty context
        if not context:
            return {
                "answer": "No relevant context found to answer the question.",
                "context": "",
                "score": 0
            }

        # Handle context length - ensure it's not too long
        if len(context) > 4000:
            context = context[:4000]

        # Get answer
        result = qa_pipeline({
            "question": question,
            "context": context
        })

        # Improved confidence handling
        if result["score"] < min_confidence:
            if "I don't know" in result["answer"] or len(result["answer"]) < 2:
                result["answer"] = "Could not find a confident answer in the provided context."

        return {
            "answer": result["answer"],
            "context": context[:300] + "..." if len(context) > 300 else context,
            "score": result["score"]
        }
    except Exception as e:
        print(f"QA error: {e}")
        return {
            "answer": "Error during QA processing.",
            "context": "",
            "score": 0
        }

# Custom evaluation functions
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def compute_exact_match(prediction, ground_truth):
    return int(normalize_answer(prediction) == normalize_answer(ground_truth))

def compute_f1(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()

    # Calculate precision, recall, and F1
    common = set(prediction_tokens) & set(ground_truth_tokens)

    if len(prediction_tokens) == 0 or len(ground_truth_tokens) == 0:
        # If either is no-answer, then F1 is 1 if both are no-answer, 0 otherwise
        return int(len(prediction_tokens) == len(ground_truth_tokens))

    if len(common) == 0:
        return 0

    precision = len(common) / len(prediction_tokens)
    recall = len(common) / len(ground_truth_tokens)

    if precision + recall == 0:
        return 0

    f1 = 2 * precision * recall / (precision + recall)
    return f1

def evaluate_on_squad(n=100):
    """Evaluate the QA model on a subset of the SQuAD validation set."""
    print(f"\nRunning Evaluation on SQuAD ({n} samples)...")
    dataset = load_dataset("squad", split=f"validation[:{n}]")

    total_samples = len(dataset)
    exact_match_score = 0
    f1_score = 0

    for sample in tqdm(dataset):
        question = sample["question"]
        context = sample["context"]
        true_answers = sample["answers"]["text"]
        true_answer = true_answers[0] if true_answers else ""

        result = answer_question(question, context)
        predicted = result["answer"]

        # Calculate scores using our custom functions
        exact_match_score += compute_exact_match(predicted, true_answer)
        f1_score += compute_f1(predicted, true_answer)

    # Compute average scores
    exact_match_avg = exact_match_score / total_samples * 100
    f1_avg = f1_score / total_samples * 100

    print(f"\nEvaluation Results on SQuAD ({n} samples):")
    print(f"Exact Match: {exact_match_avg:.2f}")
    print(f"F1 Score:    {f1_avg:.2f}")

    return exact_match_avg, f1_avg

# Built-in test questions across different domains - reduced to 4
test_questions = [
    "Who invented the World Wide Web?",
    "What is the theory of relativity?",
    "Who was the first president of the United States?",
    "What is artificial intelligence?"
]

def run_built_in_questions():
    """Run tests on built-in questions and display results."""
    questions_to_run = test_questions

    print(f"\nRunning {len(questions_to_run)} built-in questions...\n")

    results = []
    for i, question in enumerate(questions_to_run):
        print(f"Question {i+1}: {question}")

        # Get context from Wikipedia
        context = get_wikipedia_content(question)

        # Get answer
        start_time = time.time()
        result = answer_question(question, context)
        elapsed_time = time.time() - start_time

        # Display results
        print(f"Answer: {result['answer']}")
        print(f"Confidence: {result['score']:.4f}")
        print(f"Response time: {elapsed_time:.2f} seconds")
        print(f"Context snippet: {result['context'][:150]}...")
        print("-" * 80)

        # Save results
        results.append({
            "question": question,
            "answer": result['answer'],
            "confidence": result['score'],
            "response_time": elapsed_time
        })

    # Calculate averages
    avg_confidence = sum(r['confidence'] for r in results) / len(results)
    avg_response_time = sum(r['response_time'] for r in results) / len(results)

    print("\nSummary:")
    print(f"Average confidence: {avg_confidence:.4f}")
    print(f"Average response time: {avg_response_time:.2f} seconds")

    return results

def main():
    """Main function to run the automatic QA system."""
    print("\nAutomated Wikipedia QA System")
    print("============================")

    # Run SQuAD evaluation
    print("\nPart 1: SQuAD Evaluation")
    squad_em, squad_f1 = evaluate_on_squad(100)

    # Run built-in questions
    print("\nPart 2: Built-in Questions Test")
    question_results = run_built_in_questions()

    # Display final summary
    print("\nFinal Summary")
    print("=============")
    print(f"SQuAD Evaluation Results:")
    print(f"- Exact Match: {squad_em:.2f}")
    print(f"- F1 Score: {squad_f1:.2f}")
    print(f"Built-in Questions Results:")
    print(f"- Questions tested: {len(question_results)}")
    print(f"- Average confidence: {sum(r['confidence'] for r in question_results) / len(question_results):.4f}")
    print(f"- Average response time: {sum(r['response_time'] for r in question_results) / len(question_results):.2f} seconds")

if __name__ == "__main__":
    main()

Loading enhanced QA model...


Device set to use cpu



Automated Wikipedia QA System

Part 1: SQuAD Evaluation

Running Evaluation on SQuAD (100 samples)...


100%|██████████| 100/100 [01:00<00:00,  1.64it/s]



Evaluation Results on SQuAD (100 samples):
Exact Match: 81.00
F1 Score:    84.24

Part 2: Built-in Questions Test

Running 4 built-in questions...

Question 1: Who invented the World Wide Web?
Answer: Tim Berners-Lee
Confidence: 0.9274
Response time: 2.46 seconds
Context snippet: The World Wide Web ("WWW", "W3" or simply "the Web") is a global information medium that users can access via computers connected to the Internet. The...
--------------------------------------------------------------------------------
Question 2: What is the theory of relativity?
Answer: special relativity and general relativity
Confidence: 0.1439
Response time: 2.26 seconds
Context snippet: The theory of relativity usually encompasses two interrelated physics theories by Albert Einstein: special relativity and general relativity, proposed...
--------------------------------------------------------------------------------
Question 3: Who was the first president of the United States?
Answer: George Washington
