In [None]:
import pandas as pd
import numpy as np
import os
import re
import wikipedia
import torch
import wikipediaapi
import time
import fitz
import json
from tqdm import tqdm
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from sentence_transformers import CrossEncoder

In [44]:
pdf_path = "history-of-the-filipino-people.pdf"
json_path = "knowledge_base/filipino_history_pdf.json"

In [45]:
# Scrapping History of the Filipino People PDF
if os.path.exists(json_path):
    print(f"[INFO] Skipping PDF extraction – file already exists: {json_path}")
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
else:
    print(f"[INFO] Extracting text from: {pdf_path}")
    doc = fitz.open(pdf_path)
    data = {}

    # Extract text from each page
    for page_number in tqdm(range(len(doc)), desc="Extracting PDF"):
        page = doc[page_number]
        text = page.get_text().strip()

        if text:
            data[f"page_{page_number + 1}"] = {
                "page": page_number + 1,
                "content": text
            }

    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print(f"[INFO] Saved PDF content to {json_path}")

[INFO] Skipping PDF extraction – file already exists: knowledge_base/filipino_history_pdf.json


In [46]:
# Initialize Wikipedia API for English and Tagalog
wiki_en = wikipediaapi.Wikipedia(language='en', user_agent='WiQAS/1.0 (ralf_hernandez@dlsu.edu.ph)')
wiki_tl = wikipediaapi.Wikipedia(language='tl', user_agent='WiQAS/1.0 (ralf_hernandez@dlsu.edu.ph)')

In [None]:
# Function to scrape all pages in a category recursively
def scrape_category(category, wiki, scraped_pages=None, depth=0, max_depth=3):
    if scraped_pages is None:
        scraped_pages = {}
    
    # Avoid infinite recursion
    if depth > max_depth:
        return scraped_pages
    
    # Get category members
    cat = wiki.page(f"Category:{category}")
    if not cat.exists():
        return scraped_pages
    
    # Iterate through category members
    for member_name, member_page in tqdm(cat.categorymembers.items(), desc=f"Scraping {category}"):
        if member_page.namespace == wikipediaapi.Namespace.CATEGORY:
            # Recursively scrape subcategories
            print(f"[INFO] Processing: {member_name} | Depth: {depth}")
            scrape_category(member_name.replace("Category:", ""), wiki, scraped_pages, depth + 1, max_depth)
        else:
            # Process articles
            try:
                if member_page.exists() and member_name not in scraped_pages:
                    scraped_pages[member_name] = {
                        'title': member_page.title,
                        'content': member_page.text,
                        'url': member_page.fullurl
                    }
            except Exception as e:
                print(f"Error scraping {member_name}: {e}")
        time.sleep(0.1)  # Rate limiting
    
    return scraped_pages

In [None]:
# English Wikipedia scraping
english_json_path = "knowledge_base/philippine_culture_en.json"

if os.path.exists(english_json_path):
    print(f"[INFO] Skipping English scrape – file already exists: {english_json_path}")
    with open(english_json_path, 'r', encoding='utf-8') as f:
        english_data = json.load(f)
else:
    print("\n[INFO] Scraping English category: Culture_of_the_Philippines")
    english_data = scrape_category('Culture_of_the_Philippines', wiki_en)

    with open(english_json_path, 'w', encoding='utf-8') as f:
        json.dump(english_data, f, ensure_ascii=False, indent=2)
    print(f"[INFO] Saved to {english_json_path}")

In [48]:
# Tagalog Wikipedia scraping
tagalog_json_path = "knowledge_base/philippine_culture_tl.json"

if os.path.exists(tagalog_json_path):
    print(f"[INFO] Skipping Tagalog scrape – file already exists: {tagalog_json_path}")
    with open(tagalog_json_path, 'r', encoding='utf-8') as f:
        tagalog_data = json.load(f)
else:
    print("\n[INFO] Scraping Tagalog category: Kultura_ng_Pilipinas")
    tagalog_data = scrape_category('Kultura_ng_Pilipinas', wiki_tl)

    with open(tagalog_json_path, 'w', encoding='utf-8') as f:
        json.dump(tagalog_data, f, ensure_ascii=False, indent=2)
    print(f"[INFO] Saved to {tagalog_json_path}")

[INFO] Skipping Tagalog scrape – file already exists: knowledge_base/philippine_culture_tl.json


In [49]:
kb_dir = "knowledge_base"
processed_dir = "knowledge_base/cleaned"
os.makedirs(processed_dir, exist_ok=True)

In [50]:
# Define preprocessing function
def clean_text(text):
    # Basic cleaning
    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
    text = re.sub(r'\.{3,}', '.', text)  # Collapse ellipsis
    text = re.sub(r'\s+\.', '.', text)  # Remove space before periods
    text = text.strip()

    # Remove common noise headers (language-dependent)
    for noisy_section in ["See also", "References", "Mga sanggunian", "Tingnan din"]:
        if noisy_section.lower() in text.lower():
            text = text[:text.lower().find(noisy_section.lower())]
    
    return text

In [None]:
# Process all JSON files
for filename in os.listdir(kb_dir):
    if filename.endswith(".json"):
        input_path = os.path.join(kb_dir, filename)
        output_path = os.path.join(processed_dir, filename)

        print(f"[INFO] Processing {filename}")
        with open(input_path, "r", encoding="utf-8") as f:
            raw_data = json.load(f)

        cleaned_data = {}

        for key, value in tqdm(raw_data.items(), desc=f"Cleaning {filename}"):
            text = value.get("content") or value.get("text") or ""
            cleaned = clean_text(text)
            if len(cleaned.split()) >= 5:
                cleaned_data[key] = {
                    **value,
                    "cleaned_content": cleaned
                }

        # Save cleaned output
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(cleaned_data, f, ensure_ascii=False, indent=2)

        print(f"[INFO] Cleaned file saved to: {output_path}")

[INFO] Processing filipino_history_pdf.json


Cleaning filipino_history_pdf.json: 100%|██████████| 645/645 [00:00<00:00, 4182.92it/s]


[INFO] Cleaned file saved to: knowledge_base/cleaned\filipino_history_pdf.json
[INFO] Processing philippine_culture_tl.json


Cleaning philippine_culture_tl.json: 100%|██████████| 93/93 [00:00<00:00, 2364.74it/s]

[INFO] Cleaned file saved to: knowledge_base/cleaned\philippine_culture_tl.json





In [52]:
# Dense Embeddings (BGE-M3)
embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-m3",
    encode_kwargs={"normalize_embeddings": True}
)

  embedding_model = HuggingFaceEmbeddings(


In [54]:
# Chunking

splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=100,
    separators=["\n\n", "\n", ".", " ", ""]
)

In [55]:
chunks = []
for doc in cleaned_data.values():
    text = doc["cleaned_content"]
    split_chunks = splitter.split_text(text)
    chunks.extend(split_chunks)

In [56]:
# Chroma Vector Store (Dense Retrieval)
vectorstore = Chroma.from_texts(
    texts=chunks,
    embedding=embedding_model,
    persist_directory="./chroma_db",
    collection_name="filipino_culture"
)

In [57]:
# BM25 Retriever
bm25_retriever = BM25Retriever.from_texts(chunks)
bm25_retriever.k = 3

In [None]:
ensemble_retriever = EnsembleRetriever(
    retrievers=[
        vectorstore.as_retriever(search_kwargs={"k": 3}),
        bm25_retriever
    ],
    weights=[0.85, 0.15]
)

In [82]:
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-12-v2")

def rerank_with_cross_encoder(query, docs, top_n=3, verbose=False):
    pairs = [[query, doc.page_content] for doc in docs]
    scores = cross_encoder.predict(pairs)
    scored_docs = list(zip(scores, docs))
    scored_docs.sort(key=lambda x: x[0], reverse=True)

    if verbose:
        for i, (score, doc) in enumerate(scored_docs[:top_n], start=1):
            print(f"\nRank {i} Score: {score:.4f}")
            print(doc.page_content[:350] + "...")
            print("-" * 60)

    return scored_docs[:top_n] 

In [90]:
# Test Output
query = "Ano ang nobelang isinulat ni Jose Rizal na tumatalakay sa mga Pilipino at kanilang kalagayan sa ilalim ng kolonyal na pamahalaan?"
# Retrieve documents using hybrid retriever
initial_results = ensemble_retriever.get_relevant_documents(query)

# Rerank using Cross-Encoder
reranked = rerank_with_cross_encoder(query, initial_results, top_n=3, verbose=True)


Rank 1 Score: 6.5966
Itinuturing ang kanyang pinakagrandeng akda, ang Florante at Laura bilang isa sa mga obra-maestra ng Panitikang Filipino. Isinulat ni Balagtas ang epiko noong nasa bilangguan siya. Si José Rizal, ang pambansang bayani, ay nagsulat ng mga nobelang Noli Me Tángere (Huwag Akong Salangin Nino Man) at El Filibusterismo (Ang Pilibusterismo, kilala rin bi...
------------------------------------------------------------

Rank 2 Score: 5.9323
. Nagsulat si Jose Rizal ng kanyang mga librong Noli Me Tangere at El filibusterismo kahit alam niyang na maaari siyang ipapatay ng mga Kastila kung makita nila ito. Dahil sa matapang na aksyon ni Dr. Rizal, nagising ang mga Filipino sa kaapihan na naranasan nila sa ilalim ng mga Kastila, at sinimulan nila ang mga rebolusyon upang makamit ang kalay...
------------------------------------------------------------

Rank 3 Score: 5.4439
. Upang kumita ng salapi, sumali si Amorsolo sa mga paligsahan at gumawa rin siya ng mga guhit para sa m