In [3]:
import pandas as pd
import numpy as np
import os
import re
import wikipedia
import torch
import wikipediaapi
import time
import fitz
import json
from tqdm import tqdm
from fuzzywuzzy import fuzz
from typing import List, Dict, Any, Tuple
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from sentence_transformers import CrossEncoder
from sklearn.metrics import precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
def detect_section_title(text):
    lines = text.split("\n")
    for line in lines:
        line = line.strip()
        if re.match(r'^(Chapter \d+|[\dI]+\.\s+[A-Za-z\s]+)', line, re.IGNORECASE):
            return line
    return "unknown"

In [5]:
pdf_path = "history-of-the-filipino-people.pdf"
json_path = "knowledge_base/filipino_history_pdf.json"
directory = os.path.dirname(json_path)

# Create directory if it doesn't exist
if not os.path.exists(directory):
    os.makedirs(directory)
    print(f"[INFO] Created directory: {directory}")

In [6]:
# Scrapping History of the Filipino People PDF
if os.path.exists(json_path):
    print(f"[INFO] Skipping PDF extraction – file already exists: {json_path}")
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
else:
    print(f"[INFO] Extracting text from: {pdf_path}")
    doc = fitz.open(pdf_path)
    data = {}

    # Extract text from each page
    for page_number in tqdm(range(len(doc)), desc="Extracting PDF"):
        page = doc[page_number]
        text = page.get_text().strip()

        if text:
            # Detect section title (optional)
            section = detect_section_title(text)
            
            data[f"page_{page_number + 1}"] = {
                "page": page_number + 1,
                "content": text,
                "metadata": {
                    "source": "Agoncillo_textbook",
                    "document_type": "textbook",
                    "language": "English",
                    "section": section
                }
            }

    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print(f"[INFO] Saved PDF content to {json_path}")

[INFO] Skipping PDF extraction – file already exists: knowledge_base/filipino_history_pdf.json


In [7]:
# Initialize Wikipedia API for English and Tagalog
wiki_en = wikipediaapi.Wikipedia(language='en', user_agent='WiQAS/1.0 (ralf_hernandez@dlsu.edu.ph)')
wiki_tl = wikipediaapi.Wikipedia(language='tl', user_agent='WiQAS/1.0 (ralf_hernandez@dlsu.edu.ph)')

In [8]:
# Function to scrape all pages in a category recursively
def scrape_category(category, wiki, language, scraped_pages=None, depth=0, max_depth=3):
    if scraped_pages is None:
        scraped_pages = {}
    
    # Avoid infinite recursion
    if depth > max_depth:
        return scraped_pages
    
    # Get category members
    cat = wiki.page(f"Category:{category}")
    if not cat.exists():
        return scraped_pages
    
    # Iterate through category members
    for member_name, member_page in tqdm(cat.categorymembers.items(), desc=f"Scraping {category}"):
        if member_page.namespace == wikipediaapi.Namespace.CATEGORY:
            print(f"[INFO] Processing: {member_name} | Depth: {depth}")
            scrape_category(member_name.replace("Category:", ""), wiki, language, scraped_pages, depth + 1, max_depth)
        else:
            try:
                if member_page.exists() and member_name not in scraped_pages:
                    scraped_pages[member_name] = {
                        "title": member_page.title,
                        "content": member_page.text,
                        "metadata": {
                            "source": f"Wikipedia_{language}",
                            "language": "English" if language == "en" else "Tagalog",
                            "url": member_page.fullurl,
                            "category": category,
                            "namespace": "Main"
                        }
                    }
            except Exception as e:
                print(f"Error scraping {member_name}: {e}")
        time.sleep(0.1)  # Rate limiting
    
    return scraped_pages

# TAKES TOO LONG

In [9]:
# English Wikipedia scraping
# english_json_path = "knowledge_base/philippine_culture_en.json"

# if os.path.exists(english_json_path):
#     print(f"[INFO] Skipping English scrape – file already exists: {english_json_path}")
#     with open(english_json_path, 'r', encoding='utf-8') as f:
#         english_data = json.load(f)
# else:
#     print("\n[INFO] Scraping English category: Culture_of_the_Philippines")
#     english_data = scrape_category('Culture_of_the_Philippines', wiki_en, language="en")

#     with open(english_json_path, 'w', encoding='utf-8') as f:
#         json.dump(english_data, f, ensure_ascii=False, indent=2)
#     print(f"[INFO] Saved to {english_json_path}")

In [10]:
# Tagalog Wikipedia scraping
tagalog_json_path = "knowledge_base/philippine_culture_tl.json"

if os.path.exists(tagalog_json_path):
    print(f"[INFO] Skipping Tagalog scrape – file already exists: {tagalog_json_path}")
    with open(tagalog_json_path, 'r', encoding='utf-8') as f:
        tagalog_data = json.load(f)
else:
    print("\n[INFO] Scraping Tagalog category: Kultura_ng_Pilipinas")
    tagalog_data = scrape_category('Kultura_ng_Pilipinas', wiki_tl, language="tl")

    with open(tagalog_json_path, 'w', encoding='utf-8') as f:
        json.dump(tagalog_data, f, ensure_ascii=False, indent=2)
    print(f"[INFO] Saved to {tagalog_json_path}")

[INFO] Skipping Tagalog scrape – file already exists: knowledge_base/philippine_culture_tl.json


In [11]:
kb_dir = "./knowledge_base"
processed_dir = "./processed_data"
os.makedirs(processed_dir, exist_ok=True)

In [12]:
# Define preprocessing function
def clean_text(text):
    # Normalize whitespace and punctuation
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\.{3,}', '.', text)
    text = re.sub(r'\s+\.', '.', text)
    
    # Remove Wikipedia-specific noise
    text = re.sub(r'\{\{.*?\}\}', '', text)  # Remove templates
    text = re.sub(r'\[\[.*?\]\]', '', text)  # Remove internal links
    text = re.sub(r'\[edit\]', '', text)  # Remove edit tags
    text = re.sub(r'Category:.*|Kategorya:.*', '', text)  # Remove category tags
    
    # Remove common Wikipedia and textbook sections
    noisy_sections = [
        "See also", "References", "External links", "Mga sanggunian", 
        "Tingnan din", "Mga kawing panlabas", "Table of Contents", 
        "Preface", "Foreword", "Appendix", "Talaan ng mga nilalaman"
    ]
    for section in noisy_sections:
        if section.lower() in text.lower():
            text = text[:text.lower().find(section.lower())]
    
    # Remove short or low-value text
    text = text.strip()
    return text if len(text.split()) >= 5 else ""

In [13]:
# Process all JSON files
for filename in os.listdir(kb_dir):
    if filename.endswith(".json"):
        input_path = os.path.join(kb_dir, filename)
        output_path = os.path.join(processed_dir, filename)

        print(f"[INFO] Processing {filename}")
        with open(input_path, "r", encoding="utf-8") as f:
            raw_data = json.load(f)

        cleaned_data = {}

        for key, value in tqdm(raw_data.items(), desc=f"Cleaning {filename}"):
            text = value.get("content") or value.get("text") or ""
            cleaned = clean_text(text)
            if len(cleaned.split()) >= 5:
                metadata = value.get("metadata", {})
                if "source" not in metadata:
                    if "Agoncillo" in filename:
                        metadata["source"] = "Agoncillo_textbook"
                    elif "en" in filename:
                        metadata["source"] = "Wikipedia_en"
                    elif "tl" in filename:
                        metadata["source"] = "Wikipedia_tl"
                    else:
                        metadata["source"] = "unknown"
                metadata["source_file"] = filename
                cleaned_data[key] = {
                    **value,
                    "cleaned_content": cleaned,
                    "metadata": metadata
                }

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(cleaned_data, f, ensure_ascii=False, indent=2)

        print(f"[INFO] Cleaned file saved to: {output_path}")

[INFO] Processing filipino_history_pdf.json


Cleaning filipino_history_pdf.json: 100%|██████████| 645/645 [00:00<00:00, 3489.69it/s]


[INFO] Cleaned file saved to: ./processed_data\filipino_history_pdf.json
[INFO] Processing philippine_culture_tl.json


Cleaning philippine_culture_tl.json: 100%|██████████| 93/93 [00:00<00:00, 1921.06it/s]

[INFO] Cleaned file saved to: ./processed_data\philippine_culture_tl.json





In [14]:
# Dense Embeddings (BGE-M3)
embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-m3",
    encode_kwargs={"normalize_embeddings": True}
)

  embedding_model = HuggingFaceEmbeddings(


In [15]:
# Deduplicate Chunks
def deduplicate_chunks(chunks_with_metadata, threshold=0.95):
    texts = [c["text"] for c in chunks_with_metadata]
    vectorizer = TfidfVectorizer().fit_transform(texts)
    similarities = cosine_similarity(vectorizer)
    unique_chunks = []
    seen = set()
    for i, chunk in enumerate(chunks_with_metadata):
        if i not in seen:
            unique_chunks.append(chunk)
            for j in range(i + 1, len(texts)):
                if similarities[i, j] > threshold:
                    seen.add(j)
    return unique_chunks

In [16]:
# Chunking
splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=75,
    separators=["\n\n", "\n", ".", " ", ""]
)

In [17]:
chunks_with_metadata = []
for key, doc in cleaned_data.items():
    text = doc["cleaned_content"]
    # Access metadata safely
    metadata = doc.get("metadata", {})
    source = metadata.get("source", "unknown")
    section = metadata.get("section", "unknown")
    split_chunks = splitter.split_text(text)
    chunks_with_metadata.extend([
        {
            "text": chunk,
            "metadata": {
                "source": source,
                "section": section,
                "source_file": metadata.get("source_file", "unknown"),
                "chunk_id": f"{key}_{i}"
            }
        }
        for i, chunk in enumerate(split_chunks)
    ])

In [18]:
# Deduplicate
chunks_with_metadata = deduplicate_chunks(chunks_with_metadata)
chunks = [c["text"] for c in chunks_with_metadata]
metadata = [c["metadata"] for c in chunks_with_metadata]

In [19]:
# Chroma Vector Store (Dense Retrieval)
vectorstore = Chroma.from_texts(
    texts=chunks,
    metadatas=metadata,
    embedding=embedding_model,
    persist_directory="./chroma_db",
    collection_name="filipino_culture"
)

In [20]:
# BM25 Retriever
bm25_retriever = BM25Retriever.from_texts(chunks)
bm25_retriever.k = 3

In [21]:
ensemble_retriever = EnsembleRetriever(
    retrievers=[
        vectorstore.as_retriever(search_kwargs={"k": 3}),
        bm25_retriever
    ],
    weights=[0.7, 0.3]
)

In [22]:
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-12-v2")

def rerank_with_cross_encoder(query, docs, top_n=3, verbose=False):
    pairs = [[query, doc.page_content] for doc in docs]
    scores = cross_encoder.predict(pairs)
    normalized_scores = [score / (len(doc.page_content.split()) + 1) for score, doc in zip(scores, docs)]
    scored_docs = list(zip(normalized_scores, docs))
    scored_docs.sort(key=lambda x: x[0], reverse=True)

    if verbose:
        for i, (score, doc) in enumerate(scored_docs[:top_n], start=1):
            print(f"\nRank {i} Score: {score:.4f}")
            print(doc.page_content[:300] + "...")
            print("-" * 60)

    return scored_docs[:top_n] 

In [23]:
def load_eval_data(file_path: str) -> List[Dict[str, Any]]:
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [24]:
def is_relevant(ground_truth: str, doc_content: str, threshold: float = 80) -> bool:
    ground_truth = ground_truth.lower().strip()
    doc_content = doc_content.lower().strip()

    # Exact substring match
    if ground_truth in doc_content:
        return True

    # Fuzzy matching for paraphrased content
    similarity = fuzz.partial_ratio(ground_truth, doc_content)
    return similarity >= threshold

In [40]:
def evaluate_retriever(
    eval_data: List[Dict[str, Any]],
    retriever: Any,
    reranker: Any,
    k: int = 3,
    fuzzy_threshold: float = 70,
    doc_content_key: str = 'page_content'
) -> Dict[str, float]:
    hits = 0  # Count of queries with at least one relevant document
    relevant_docs_total = 0  # Total number of relevant documents retrieved
    retrieved_docs_total = 0  # Total number of documents retrieved
    reciprocal_ranks = []  # Reciprocal ranks for MRR

    for sample in tqdm(eval_data, desc="Evaluating"):
        question = sample["question"]
        ground_truth = sample["answer"]

        try:
            # Retrieve initial documents
            initial_results = retriever.get_relevant_documents(question)
            if not initial_results:
                reciprocal_ranks.append(0)
                continue

            # Rerank
            reranked = reranker(question, initial_results, top_n=k)
            if not reranked:
                reciprocal_ranks.append(0)
                continue

            # Track relevance
            found = False
            retrieved_docs_total += len(reranked)
            for rank, (score, doc) in enumerate(reranked[:k]):
                try:
                    # Extract document content dynamically
                    doc_content = getattr(doc, doc_content_key, doc) if isinstance(doc, object) else doc
                    if isinstance(doc_content, dict):
                        doc_content = doc_content.get('content', '')

                    # Check relevance
                    if is_relevant(ground_truth, doc_content, fuzzy_threshold):
                        relevant_docs_total += 1
                        if not found:
                            hits += 1
                            reciprocal_ranks.append(1 / (rank + 1))
                            found = True

                except AttributeError as e:
                    print(f"Warning: Could not access document content for rank {rank}: {e}")
                    continue

            if not found:
                reciprocal_ranks.append(0)

        except Exception as e:
            print(f"Error processing question '{question}': {e}")
            reciprocal_ranks.append(0)
            continue

    # Calculate metrics
    total_queries = len(eval_data)
    recall = hits / total_queries if total_queries > 0 else 0.0
    mrr = np.mean(reciprocal_ranks) if reciprocal_ranks else 0.0

    return {
        f"Recall@{k}": recall,
        f"MRR@{k}": mrr
    }


In [None]:
eval_data = load_eval_data("test.json")

results = evaluate_retriever(
    eval_data=eval_data,
    retriever=ensemble_retriever,
    reranker=rerank_with_cross_encoder,
    k=3,
    fuzzy_threshold=80,
    doc_content_key='page_content'
)

Evaluating: 100%|██████████| 3/3 [00:01<00:00,  2.66it/s]


In [58]:
print("\nEvaluation Results:")
for metric, value in results.items():
    print(f"{metric}: {value:.4f}")


Evaluation Results:
Recall@3: 0.6667
MRR@3: 0.5000


In [54]:
# Test Output
query = "Who was the first President of the Philippines?"

# Retrieve documents using hybrid retriever
initial_results = ensemble_retriever.get_relevant_documents(query)

# Rerank using Cross-Encoder
reranked = rerank_with_cross_encoder(query, initial_results, top_n=2)

for i, (score, doc) in enumerate(reranked, start=1):
    print(f"\nRank {i} | Score: {score:.4f}")
    print("-" * 60)
    print(doc.page_content.strip()[:300])
    print("-" * 60)


Rank 1 | Score: -0.0278
------------------------------------------------------------
Tangi sa pampalagiang suliranin na pangkabuhayan at pampamahalaan ang lumigalig sa kanyang pangasiwaan. == Pamumuno ni Ferdinand Marcos (1965–1986) ==

Ang isa pang subyang sa panig ni Macapagal ay ang Pangulo ng Senado na si Ferdinand Marcos, isang kapwa Liberal. Sinasabing upang matamo ang pagtulo
------------------------------------------------------------

Rank 2 | Score: -0.0726
------------------------------------------------------------
Tacloban at suporta galing sa Tanggapan ng Bise Presidente para sa Usaping Pang-Akademiko ng Unibersidad ng Pilipinas (Inggles: University of the Philippines Office of the Vice President for Academic Affairs) sa tulong ng Pondo para sa Pagpapataas ng Makasining na Paggawa at Pagsasaliksik (Inggles:
------------------------------------------------------------
