In [54]:
import pandas as pd
import numpy as np
import os
import re
import wikipedia
import torch
import wikipediaapi
import time
import fitz
import json
from tqdm import tqdm
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from sentence_transformers import CrossEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [55]:
def detect_section_title(text):
    lines = text.split("\n")
    for line in lines:
        line = line.strip()
        if re.match(r'^(Chapter \d+|[\dI]+\.\s+[A-Za-z\s]+)', line, re.IGNORECASE):
            return line
    return "unknown"

In [56]:
pdf_path = "history-of-the-filipino-people.pdf"
json_path = "knowledge_base/filipino_history_pdf.json"
directory = os.path.dirname(json_path)

# Create directory if it doesn't exist
if not os.path.exists(directory):
    os.makedirs(directory)
    print(f"[INFO] Created directory: {directory}")

[INFO] Created directory: knowledge_base


In [57]:
# Scrapping History of the Filipino People PDF
if os.path.exists(json_path):
    print(f"[INFO] Skipping PDF extraction – file already exists: {json_path}")
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
else:
    print(f"[INFO] Extracting text from: {pdf_path}")
    doc = fitz.open(pdf_path)
    data = {}

    # Extract text from each page
    for page_number in tqdm(range(len(doc)), desc="Extracting PDF"):
        page = doc[page_number]
        text = page.get_text().strip()

        if text:
            # Detect section title (optional)
            section = detect_section_title(text)
            
            data[f"page_{page_number + 1}"] = {
                "page": page_number + 1,
                "content": text,
                "metadata": {
                    "source": "Agoncillo_textbook",
                    "document_type": "textbook",
                    "language": "English",
                    "section": section
                }
            }

    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print(f"[INFO] Saved PDF content to {json_path}")

[INFO] Extracting text from: history-of-the-filipino-people.pdf


Extracting PDF: 100%|██████████| 662/662 [00:00<00:00, 722.37it/s]


[INFO] Saved PDF content to knowledge_base/filipino_history_pdf.json


In [58]:
# Initialize Wikipedia API for English and Tagalog
wiki_en = wikipediaapi.Wikipedia(language='en', user_agent='WiQAS/1.0 (ralf_hernandez@dlsu.edu.ph)')
wiki_tl = wikipediaapi.Wikipedia(language='tl', user_agent='WiQAS/1.0 (ralf_hernandez@dlsu.edu.ph)')

In [60]:
# Function to scrape all pages in a category recursively
def scrape_category(category, wiki, language, scraped_pages=None, depth=0, max_depth=3):
    if scraped_pages is None:
        scraped_pages = {}
    
    # Avoid infinite recursion
    if depth > max_depth:
        return scraped_pages
    
    # Get category members
    cat = wiki.page(f"Category:{category}")
    if not cat.exists():
        return scraped_pages
    
    # Iterate through category members
    for member_name, member_page in tqdm(cat.categorymembers.items(), desc=f"Scraping {category}"):
        if member_page.namespace == wikipediaapi.Namespace.CATEGORY:
            print(f"[INFO] Processing: {member_name} | Depth: {depth}")
            scrape_category(member_name.replace("Category:", ""), wiki, language, scraped_pages, depth + 1, max_depth)
        else:
            try:
                if member_page.exists() and member_name not in scraped_pages:
                    scraped_pages[member_name] = {
                        "title": member_page.title,
                        "content": member_page.text,
                        "metadata": {
                            "source": f"Wikipedia_{language}",
                            "language": "English" if language == "en" else "Tagalog",
                            "url": member_page.fullurl,
                            "category": category,
                            "namespace": "Main"
                        }
                    }
            except Exception as e:
                print(f"Error scraping {member_name}: {e}")
        time.sleep(0.1)  # Rate limiting
    
    return scraped_pages

# TAKES TOO LONG

In [61]:
# English Wikipedia scraping
# english_json_path = "knowledge_base/philippine_culture_en.json"

# if os.path.exists(english_json_path):
#     print(f"[INFO] Skipping English scrape – file already exists: {english_json_path}")
#     with open(english_json_path, 'r', encoding='utf-8') as f:
#         english_data = json.load(f)
# else:
#     print("\n[INFO] Scraping English category: Culture_of_the_Philippines")
#     english_data = scrape_category('Culture_of_the_Philippines', wiki_en, language="en")

#     with open(english_json_path, 'w', encoding='utf-8') as f:
#         json.dump(english_data, f, ensure_ascii=False, indent=2)
#     print(f"[INFO] Saved to {english_json_path}")

In [62]:
# Tagalog Wikipedia scraping
tagalog_json_path = "knowledge_base/philippine_culture_tl.json"

if os.path.exists(tagalog_json_path):
    print(f"[INFO] Skipping Tagalog scrape – file already exists: {tagalog_json_path}")
    with open(tagalog_json_path, 'r', encoding='utf-8') as f:
        tagalog_data = json.load(f)
else:
    print("\n[INFO] Scraping Tagalog category: Kultura_ng_Pilipinas")
    tagalog_data = scrape_category('Kultura_ng_Pilipinas', wiki_tl, language="tl")

    with open(tagalog_json_path, 'w', encoding='utf-8') as f:
        json.dump(tagalog_data, f, ensure_ascii=False, indent=2)
    print(f"[INFO] Saved to {tagalog_json_path}")


[INFO] Scraping Tagalog category: Kultura_ng_Pilipinas


Scraping Kultura_ng_Pilipinas:  82%|████████▏ | 93/114 [01:06<00:14,  1.47it/s]

[INFO] Processing: Kategorya:Gawad Ramon Magsaysay | Depth: 0


Scraping Kultura_ng_Pilipinas:  82%|████████▏ | 94/114 [01:06<00:11,  1.67it/s]

[INFO] Processing: Kategorya:Kultura ng Pilipinas ayon sa lalawigan | Depth: 0


Scraping Kultura_ng_Pilipinas:  83%|████████▎ | 95/114 [01:07<00:10,  1.86it/s]

[INFO] Processing: Kategorya:Kulturang Bisaya | Depth: 0


Scraping Kultura_ng_Pilipinas:  84%|████████▍ | 96/114 [01:07<00:08,  2.02it/s]

[INFO] Processing: Kategorya:Lutuing Pilipino | Depth: 0


Scraping Kultura_ng_Pilipinas:  85%|████████▌ | 97/114 [01:08<00:07,  2.14it/s]

[INFO] Processing: Kategorya:Mga awiting Pilipino | Depth: 0


Scraping Kultura_ng_Pilipinas:  86%|████████▌ | 98/114 [01:08<00:07,  2.24it/s]

[INFO] Processing: Kategorya:Mga manlililok mula sa Pilipinas | Depth: 0


Scraping Kultura_ng_Pilipinas:  87%|████████▋ | 99/114 [01:08<00:06,  2.29it/s]

[INFO] Processing: Kategorya:Mga pambansang alagad ng sining ng Pilipinas | Depth: 0


Scraping Kultura_ng_Pilipinas:  88%|████████▊ | 100/114 [01:09<00:05,  2.34it/s]

[INFO] Processing: Kategorya:Mga pambansang sagisag ng Pilipinas | Depth: 0


Scraping Kultura_ng_Pilipinas:  89%|████████▊ | 101/114 [01:09<00:05,  2.38it/s]

[INFO] Processing: Kategorya:Mga pangkat-etniko sa Pilipinas | Depth: 0


Scraping Kultura_ng_Pilipinas:  89%|████████▉ | 102/114 [01:10<00:05,  2.40it/s]

[INFO] Processing: Kategorya:Mga patimpalak sa Pilipinas | Depth: 0


Scraping Kultura_ng_Pilipinas:  90%|█████████ | 103/114 [01:10<00:04,  2.41it/s]

[INFO] Processing: Kategorya:Mga sining panlaban ng Pilipinas | Depth: 0


Scraping Kultura_ng_Pilipinas:  91%|█████████ | 104/114 [01:10<00:04,  2.45it/s]

[INFO] Processing: Kategorya:Mga tinapay ng Pilipinas | Depth: 0


Scraping Kultura_ng_Pilipinas:  92%|█████████▏| 105/114 [01:11<00:03,  2.45it/s]

[INFO] Processing: Kategorya:Mitolohiyang Pilipino | Depth: 0


Scraping Kultura_ng_Pilipinas:  93%|█████████▎| 106/114 [01:11<00:03,  2.46it/s]

[INFO] Processing: Kategorya:Musika ng Pilipinas | Depth: 0


Scraping Kultura_ng_Pilipinas:  94%|█████████▍| 107/114 [01:12<00:02,  2.46it/s]

[INFO] Processing: Kategorya:Palakasan sa Pilipinas | Depth: 0


Scraping Kultura_ng_Pilipinas:  95%|█████████▍| 108/114 [01:12<00:02,  2.46it/s]

[INFO] Processing: Kategorya:Pambansang Alagad ng Sining ng Pilipinas | Depth: 0


Scraping Kultura_ng_Pilipinas:  96%|█████████▌| 109/114 [01:13<00:02,  2.43it/s]

[INFO] Processing: Kategorya:Mga pelikula mula sa Pilipinas | Depth: 0


Scraping Kultura_ng_Pilipinas:  96%|█████████▋| 110/114 [01:13<00:01,  2.46it/s]

[INFO] Processing: Kategorya:Relihiyon sa Pilipinas | Depth: 0


Scraping Kultura_ng_Pilipinas:  97%|█████████▋| 111/114 [01:13<00:01,  2.48it/s]

[INFO] Processing: Kategorya:Sayaw sa Pilipinas | Depth: 0


Scraping Kultura_ng_Pilipinas:  98%|█████████▊| 112/114 [01:14<00:00,  2.48it/s]

[INFO] Processing: Kategorya:Sining ng Pilipinas | Depth: 0


Scraping Kultura_ng_Pilipinas:  99%|█████████▉| 113/114 [01:14<00:00,  2.47it/s]

[INFO] Processing: Kategorya:Mga wika ng Pilipinas | Depth: 0


Scraping Kultura_ng_Pilipinas: 100%|██████████| 114/114 [01:15<00:00,  1.52it/s]

[INFO] Saved to knowledge_base/philippine_culture_tl.json





In [63]:
kb_dir = "./knowledge_base"
processed_dir = "./processed_data"
os.makedirs(processed_dir, exist_ok=True)

In [64]:
# Define preprocessing function
def clean_text(text):
    # Normalize whitespace and punctuation
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\.{3,}', '.', text)
    text = re.sub(r'\s+\.', '.', text)
    
    # Remove Wikipedia-specific noise
    text = re.sub(r'\{\{.*?\}\}', '', text)  # Remove templates
    text = re.sub(r'\[\[.*?\]\]', '', text)  # Remove internal links
    text = re.sub(r'\[edit\]', '', text)  # Remove edit tags
    text = re.sub(r'Category:.*|Kategorya:.*', '', text)  # Remove category tags
    
    # Remove common Wikipedia and textbook sections
    noisy_sections = [
        "See also", "References", "External links", "Mga sanggunian", 
        "Tingnan din", "Mga kawing panlabas", "Table of Contents", 
        "Preface", "Foreword", "Appendix", "Talaan ng mga nilalaman"
    ]
    for section in noisy_sections:
        if section.lower() in text.lower():
            text = text[:text.lower().find(section.lower())]
    
    # Remove short or low-value text
    text = text.strip()
    return text if len(text.split()) >= 5 else ""

In [None]:
# Process all JSON files
for filename in os.listdir(kb_dir):
    if filename.endswith(".json"):
        input_path = os.path.join(kb_dir, filename)
        output_path = os.path.join(processed_dir, filename)

        print(f"[INFO] Processing {filename}")
        with open(input_path, "r", encoding="utf-8") as f:
            raw_data = json.load(f)

        cleaned_data = {}

        for key, value in tqdm(raw_data.items(), desc=f"Cleaning {filename}"):
            text = value.get("content") or value.get("text") or ""
            cleaned = clean_text(text)
            if len(cleaned.split()) >= 5:
                metadata = value.get("metadata", {})
                if "source" not in metadata:
                    if "Agoncillo" in filename:
                        metadata["source"] = "Agoncillo_textbook"
                    elif "en" in filename:
                        metadata["source"] = "Wikipedia_en"
                    elif "tl" in filename:
                        metadata["source"] = "Wikipedia_tl"
                    else:
                        metadata["source"] = "unknown"
                metadata["source_file"] = filename
                cleaned_data[key] = {
                    **value,
                    "cleaned_content": cleaned,
                    "metadata": metadata
                }

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(cleaned_data, f, ensure_ascii=False, indent=2)

        print(f"[INFO] Cleaned file saved to: {output_path}")

[INFO] Processing filipino_history_pdf.json


Cleaning filipino_history_pdf.json: 100%|██████████| 645/645 [00:00<00:00, 3140.17it/s]


[INFO] Cleaned file saved to: ./processed_data\filipino_history_pdf.json
[INFO] Processing philippine_culture_tl.json


Cleaning philippine_culture_tl.json: 100%|██████████| 93/93 [00:00<00:00, 1705.78it/s]


[INFO] Cleaned file saved to: ./processed_data\philippine_culture_tl.json


In [None]:
# Dense Embeddings (BGE-M3)
embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-m3",
    encode_kwargs={"normalize_embeddings": True}
)

In [None]:
# Deduplicate Chunks
def deduplicate_chunks(chunks_with_metadata, threshold=0.95):
    texts = [c["text"] for c in chunks_with_metadata]
    vectorizer = TfidfVectorizer().fit_transform(texts)
    similarities = cosine_similarity(vectorizer)
    unique_chunks = []
    seen = set()
    for i, chunk in enumerate(chunks_with_metadata):
        if i not in seen:
            unique_chunks.append(chunk)
            for j in range(i + 1, len(texts)):
                if similarities[i, j] > threshold:
                    seen.add(j)
    return unique_chunks

In [None]:
# Chunking
splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=75,
    separators=["\n\n", "\n", ".", " ", ""]
)

In [None]:
chunks_with_metadata = []
for key, doc in cleaned_data.items():
    text = doc["cleaned_content"]
    # Access metadata safely
    metadata = doc.get("metadata", {})
    source = metadata.get("source", "unknown")
    section = metadata.get("section", "unknown")
    split_chunks = splitter.split_text(text)
    chunks_with_metadata.extend([
        {
            "text": chunk,
            "metadata": {
                "source": source,
                "section": section,
                "source_file": metadata.get("source_file", "unknown"),
                "chunk_id": f"{key}_{i}"
            }
        }
        for i, chunk in enumerate(split_chunks)
    ])

KeyError: 'source'

In [None]:
# Deduplicate
chunks_with_metadata = deduplicate_chunks(chunks_with_metadata)
chunks = [c["text"] for c in chunks_with_metadata]
metadata = [c["metadata"] for c in chunks_with_metadata]

In [None]:
# Chroma Vector Store (Dense Retrieval)
vectorstore = Chroma.from_texts(
    texts=chunks,
    metadatas=metadata,
    embedding=embedding_model,
    persist_directory="./chroma_db",
    collection_name="filipino_culture"
)

In [None]:
# BM25 Retriever
bm25_retriever = BM25Retriever.from_texts(chunks)
bm25_retriever.k = 5

In [None]:
ensemble_retriever = EnsembleRetriever(
    retrievers=[
        vectorstore.as_retriever(search_kwargs={"k": 5}),
        bm25_retriever
    ],
    weights=[0.7, 0.3]
)

In [None]:
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-12-v2")

def rerank_with_cross_encoder(query, docs, top_n=3, verbose=False):
    pairs = [[query, doc.page_content] for doc in docs]
    scores = cross_encoder.predict(pairs)
    normalized_scores = [score / (len(doc.page_content.split()) + 1) for score, doc in zip(scores, docs)]
    scored_docs = list(zip(normalized_scores, docs))
    scored_docs.sort(key=lambda x: x[0], reverse=True)

    if verbose:
        for i, (score, doc) in enumerate(scored_docs[:top_n], start=1):
            print(f"\nRank {i} Score: {score:.4f}")
            print(doc.page_content[:300] + "...")
            print("-" * 60)

    return scored_docs[:top_n] 

In [None]:
# Test Output
query = "Ano ang ibig sabihin ng “bayanihan” sa kulturang Pilipino?"
# Retrieve documents using hybrid retriever
initial_results = ensemble_retriever.get_relevant_documents(query)

# Rerank using Cross-Encoder
reranked = rerank_with_cross_encoder(query, initial_results, top_n=3, verbose=True)


Rank 1 Score: 6.5149
. Ang ibig sabihin ng kultura ay ang paraan ng pamumuhay ng mga tao nagpapakita ng kaugalian, tradisyon, mga sining, sistema ng edukasyon, musika at pamahalaan. Lipunang Pilipino Ang Lipunang Pilipino ay magkahalong lipunan. Isa bilang bansa, at marami dahil sa pagkakahiwalay ng mga ito ng lugar, dahil sa pulo pulo nitong ayos at mga kasanayan...
------------------------------------------------------------

Rank 2 Score: 5.1147
. • Ang Pagaandukha na konsepto ng Sikolohiyang Pilipino ay ang pagkukuha ng salitang dayuhan at baguhin ang kanyang anyo hangga’t magkaroon siya ng Pilipinong kahulugan. • Ang konsepto ng Pagbibinyag sa Sikolohiyang Pilipino ay madali lang intindihan sapagkat ang ibig sabihin nito ay ang paglalagay ng mga dayuhan ng kanilang mga sariling kahulugan...
------------------------------------------------------------

Rank 3 Score: 3.5327
Isa bilang bansa, at marami dahil sa pagkakahiwalay ng mga ito ng lugar, dahil sa pulo pulo nitong ayos at mg