In [1]:
import pandas as pd
import numpy as np
import os
import re
import torch
import time
import fitz
import json
import datetime
from tqdm import tqdm
from fuzzywuzzy import fuzz
from typing import List, Dict, Any, Tuple
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from sentence_transformers import CrossEncoder
from sklearn.metrics import precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
input_dir = "knowledge_base"
output_dir = "data"
result_dir = "results"

# Create output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    print(f"[INFO] Created directory: {output_dir}")

# Create result directory if it doesn't exist
if not os.path.exists(result_dir):
    os.makedirs(result_dir)
    print(f"[INFO] Created directory: {result_dir}")

In [3]:
for filename in os.listdir(input_dir):
    if filename.lower().endswith(".pdf"):
        pdf_path = os.path.join(input_dir, filename)
        json_filename = os.path.splitext(filename)[0] + ".json"
        json_path = os.path.join(output_dir, json_filename)

        # Skip if already converted
        if os.path.exists(json_path):
            print(f"[INFO] Skipping {filename} – already converted.")
            continue

        print(f"[INFO] Extracting: {filename}")
        doc = fitz.open(pdf_path)
        data = {}

        for page_number in tqdm(range(len(doc)), desc=f"Processing {filename}"):
            page = doc[page_number]
            text = page.get_text().strip()
            if text:
                data[f"page_{page_number + 1}"] = {
                    "page": page_number + 1,
                    "content": text
                }

        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

        print(f"[INFO] Saved to: {json_path}")

[INFO] Skipping A-History-Of-The-Philippines.pdf – already converted.
[INFO] Skipping Arkitekturang-Filipino.pdf – already converted.
[INFO] Skipping Culture-And-Customs-Of-The-Philippines.pdf – already converted.
[INFO] Skipping Filipino-Politics.pdf – already converted.
[INFO] Skipping Food-Of-The-Philippines.pdf – already converted.
[INFO] Skipping History-Of-The-Flipino-People.pdf – already converted.
[INFO] Skipping Philippine-History-Source-Book.pdf – already converted.
[INFO] Skipping Philippine-Myths-Legends-And-Folktales.pdf – already converted.
[INFO] Skipping Tikim-Essays-On-Philippine-Food.pdf – already converted.


In [None]:
# Define preprocessing function
def clean_text(text):
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text)
    
    # Fix ellipses or multiple punctuation
    text = re.sub(r'\.{3,}', '.', text)
    text = re.sub(r'\s+\.', '.', text)
    
    # Remove stray characters
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    text = text.strip()
    
    return text

In [5]:
data_dir = "data"

# Process all JSON files
for filename in os.listdir(data_dir):
    if filename.lower().endswith(".json"):
        json_path = os.path.join(data_dir, filename)
        print(f"[INFO] Preprocessing: {filename}")
        
        # Load existing data
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        
        # Clean text for each page
        for key in tqdm(data, desc=f"Cleaning {filename}"):
            if "content" in data[key]:
                data[key]["content"] = clean_text(data[key]["content"])
        
        # Overwrite file
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        
        print(f"[INFO] Finished cleaning: {filename}")

[INFO] Preprocessing: A-History-Of-The-Philippines.json


Cleaning A-History-Of-The-Philippines.json: 100%|██████████| 356/356 [00:00<00:00, 7382.48it/s]

[INFO] Finished cleaning: A-History-Of-The-Philippines.json
[INFO] Preprocessing: Arkitekturang-Filipino.json



Cleaning Arkitekturang-Filipino.json: 100%|██████████| 623/623 [00:00<00:00, 8508.46it/s]


[INFO] Finished cleaning: Arkitekturang-Filipino.json
[INFO] Preprocessing: Culture-And-Customs-Of-The-Philippines.json


Cleaning Culture-And-Customs-Of-The-Philippines.json: 100%|██████████| 273/273 [00:00<00:00, 7453.22it/s]


[INFO] Finished cleaning: Culture-And-Customs-Of-The-Philippines.json
[INFO] Preprocessing: Filipino-Politics.json


Cleaning Filipino-Politics.json: 100%|██████████| 379/379 [00:00<00:00, 4866.65it/s]


[INFO] Finished cleaning: Filipino-Politics.json
[INFO] Preprocessing: Food-Of-The-Philippines.json


Cleaning Food-Of-The-Philippines.json: 100%|██████████| 89/89 [00:00<00:00, 7898.71it/s]


[INFO] Finished cleaning: Food-Of-The-Philippines.json
[INFO] Preprocessing: History-Of-The-Flipino-People.json


Cleaning History-Of-The-Flipino-People.json: 100%|██████████| 645/645 [00:00<00:00, 7223.08it/s]


[INFO] Finished cleaning: History-Of-The-Flipino-People.json
[INFO] Preprocessing: Philippine-History-Source-Book.json


Cleaning Philippine-History-Source-Book.json: 100%|██████████| 643/643 [00:00<00:00, 6684.31it/s]


[INFO] Finished cleaning: Philippine-History-Source-Book.json
[INFO] Preprocessing: Philippine-Myths-Legends-And-Folktales.json


Cleaning Philippine-Myths-Legends-And-Folktales.json: 100%|██████████| 148/148 [00:00<00:00, 12998.24it/s]


[INFO] Finished cleaning: Philippine-Myths-Legends-And-Folktales.json
[INFO] Preprocessing: Tikim-Essays-On-Philippine-Food.json


Cleaning Tikim-Essays-On-Philippine-Food.json: 0it [00:00, ?it/s]

[INFO] Finished cleaning: Tikim-Essays-On-Philippine-Food.json





In [6]:
# Dense Embeddings (BGE-M3)
embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-m3",
    encode_kwargs={"normalize_embeddings": True}
)

  embedding_model = HuggingFaceEmbeddings(


In [7]:
# Chunking
splitter = RecursiveCharacterTextSplitter(
    chunk_size=250,
    chunk_overlap=0,
    separators=["\n\n", "\n", ".", "?", "!", " ", ""]
)

In [8]:
chunks = []
metadatas = []

# Loop through each cleaned JSON file
for filename in os.listdir(data_dir):
    if filename.endswith(".json"):
        file_path = os.path.join(data_dir, filename)
        print(f"[INFO] Chunking file: {filename}")

        with open(file_path, "r", encoding="utf-8") as f:
            doc = json.load(f)

        for key, entry in tqdm(doc.items(), desc=f"Chunking {filename}"):
            text = entry.get("content", "")
            if not text.strip():
                continue

            split_chunks = splitter.split_text(text)

            for i, chunk in enumerate(split_chunks):

                if len(chunk.split()) <= 10:
                    continue
                
                chunks.append(chunk)
                metadatas.append({
                    "source": filename,
                    "page": entry.get("page", key),
                    "chunk_id": f"{key}_chunk_{i}",
                    "filename": filename
                })

print(f"[INFO] Total chunks: {len(chunks)}")

[INFO] Chunking file: A-History-Of-The-Philippines.json


Chunking A-History-Of-The-Philippines.json: 100%|██████████| 356/356 [00:00<00:00, 6626.16it/s]


[INFO] Chunking file: Arkitekturang-Filipino.json


Chunking Arkitekturang-Filipino.json: 100%|██████████| 623/623 [00:00<00:00, 8685.94it/s]


[INFO] Chunking file: Culture-And-Customs-Of-The-Philippines.json


Chunking Culture-And-Customs-Of-The-Philippines.json: 100%|██████████| 273/273 [00:00<00:00, 7934.95it/s]


[INFO] Chunking file: Filipino-Politics.json


Chunking Filipino-Politics.json: 100%|██████████| 379/379 [00:00<00:00, 5143.39it/s]


[INFO] Chunking file: Food-Of-The-Philippines.json


Chunking Food-Of-The-Philippines.json: 100%|██████████| 89/89 [00:00<00:00, 10941.56it/s]


[INFO] Chunking file: History-Of-The-Flipino-People.json


Chunking History-Of-The-Flipino-People.json: 100%|██████████| 645/645 [00:00<00:00, 7686.81it/s]


[INFO] Chunking file: Philippine-History-Source-Book.json


Chunking Philippine-History-Source-Book.json: 100%|██████████| 643/643 [00:00<00:00, 6555.27it/s]


[INFO] Chunking file: Philippine-Myths-Legends-And-Folktales.json


Chunking Philippine-Myths-Legends-And-Folktales.json: 100%|██████████| 148/148 [00:00<00:00, 30274.92it/s]


[INFO] Chunking file: Tikim-Essays-On-Philippine-Food.json


Chunking Tikim-Essays-On-Philippine-Food.json: 0it [00:00, ?it/s]

[INFO] Total chunks: 36709





In [9]:
# Chroma Vector Store (Dense Retrieval)
vectorstore = Chroma.from_texts(
    texts=chunks,
    metadatas=metadatas,
    embedding=embedding_model,
    persist_directory="./chroma_db",
    collection_name="filipino_culture"
)   

In [10]:
# BM25 Retriever
bm25_retriever = BM25Retriever.from_texts(chunks)
bm25_retriever.k = 3

In [11]:
ensemble_retriever = EnsembleRetriever(
    retrievers=[
        vectorstore.as_retriever(search_kwargs={"k": 3}),
        bm25_retriever
    ],
    weights=[0.7, 0.3]
)

In [12]:
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-12-v2")

def rerank_with_cross_encoder(query, docs, top_n=3, verbose=False):
    pairs = [[query, doc.page_content] for doc in docs]
    scores = cross_encoder.predict(pairs)
    normalized_scores = [score / (len(doc.page_content.split()) + 1) for score, doc in zip(scores, docs)]
    scored_docs = list(zip(normalized_scores, docs))
    scored_docs.sort(key=lambda x: x[0], reverse=True)

    if verbose:
        for i, (score, doc) in enumerate(scored_docs[:top_n], start=1):
            print(f"\nRank {i} Score: {score:.4f}")
            print(doc.page_content[:300] + "...")
            print("-" * 60)

    return scored_docs[:top_n] 

In [13]:
def load_eval_data(file_path: str) -> List[Dict[str, Any]]:
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [14]:
def jaccard_similarity(a: str, b: str) -> float:
    a_tokens = set(a.lower().split())
    b_tokens = set(b.lower().split())

    if not a_tokens or not b_tokens:
        return 0.0
    
    intersection = a_tokens.intersection(b_tokens)
    union = a_tokens.union(b_tokens)
    
    return len(intersection) / len(union)

In [15]:
def is_relevant(ground_truth: str, doc_content: str, threshold: float = 50, jaccard_threshold: float = 0.3) -> bool:
    ground_truth = ground_truth.lower().strip()
    doc_content = doc_content.lower().strip()

    # Exact substring match
    if ground_truth in doc_content or doc_content in ground_truth:
        return True

    # Bi-directional fuzzy match
    similarity_1 = fuzz.partial_ratio(ground_truth, doc_content)
    similarity_2 = fuzz.partial_ratio(doc_content, ground_truth)
    if max(similarity_1, similarity_2) >= threshold:
        return True
    
    # Jaccard similarity
    jaccard = jaccard_similarity(ground_truth, doc_content)
    if jaccard >= jaccard_threshold:
        return True
    
    return False

In [19]:
def evaluate_retriever(
    eval_data: List[Dict[str, Any]],
    retriever: Any,
    reranker: Any,
    k: int = 3,
    fuzzy_threshold: float = 50,
    doc_content_key: str = 'page_content'
) -> Dict[str, float]:
    hits = 0       # Queries with at least one relevant document
    relevant_docs_total = 0
    retrieved_docs_total = 0
    reciprocal_ranks = []  
    results_log = []

    os.makedirs("results", exist_ok=True)
    date_str = datetime.datetime.now().strftime("%m-%d-%Y")
    output_path = f"results/result-{date_str}.json"

    for sample in tqdm(eval_data, desc="Evaluating"):
        question = sample["question"]
        ground_truth = sample["answer"]

        try:
            # Retrieve initial documents
            initial_results = retriever.get_relevant_documents(question)
            if not initial_results:
                reciprocal_ranks.append(0)
                continue

            # Rerank
            reranked = reranker(question, initial_results, top_n=k)
            if not reranked:
                reciprocal_ranks.append(0)
                continue

            # Track relevance
            found = False
            retrieved_docs_total += len(reranked)

             # Log top result
            top_doc = reranked[0][1]
            top_content = getattr(top_doc, doc_content_key, top_doc)
            if isinstance(top_content, dict):
                top_content = top_content.get('content', '')

            for rank, (score, doc) in enumerate(reranked[:k]):
                # Extract document content dynamically
                doc_content = getattr(doc, doc_content_key, doc) if isinstance(doc, object) else doc
                if isinstance(doc_content, dict):
                    doc_content = doc_content.get('content', '')

                # Check relevance
                if is_relevant(ground_truth, doc_content, fuzzy_threshold):
                    relevant_docs_total += 1
                    if not found:
                        hits += 1
                        reciprocal_ranks.append(1 / (rank + 1))
                        found = True

            if not found:
                reciprocal_ranks.append(0)

            results_log.append({
                "query": question,
                "ground_truth": ground_truth,
                "doc_content": top_content.strip()
            })

        except Exception as e:
            print(f"Error processing question '{question}': {e}")
            reciprocal_ranks.append(0)
            results_log.append({
                "query": question,
                "ground_truth": ground_truth,
                "doc_content": f"[ERROR] {str(e)}"
            })
            continue

    # Save to results JSON
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results_log, f, ensure_ascii=False, indent=2)

    print(f"\n[INFO] Results saved to {output_path}")

    # Calculate metrics
    total_queries = len(eval_data)
    recall = hits / total_queries if total_queries > 0 else 0.0
    precision = relevant_docs_total / retrieved_docs_total if retrieved_docs_total > 0 else 0.0
    mrr = np.mean(reciprocal_ranks) if reciprocal_ranks else 0.0

    return {
        f"Recall@{k}": recall,
        f"Precision@{k}": precision,
        f"MRR@{k}": mrr
    }


In [20]:
eval_data = load_eval_data("evaluation/evaluation200.json")

results = evaluate_retriever(
    eval_data=eval_data,
    retriever=ensemble_retriever,
    reranker=rerank_with_cross_encoder,
    k=3,
    fuzzy_threshold=50,
    doc_content_key='page_content'
)

  initial_results = retriever.get_relevant_documents(question)
Evaluating: 100%|██████████| 200/200 [01:46<00:00,  1.88it/s]


[INFO] Results saved to results/result-06-23-2025.json





In [None]:
print("\nEvaluation Results:")
for metric, value in results.items():
    print(f"{metric}: {value:.4f}")


Evaluation Results:
Recall@3: 0.6700
Precision@3: 0.4133
MRR@3: 0.5567


In [None]:
# Test Output
query = "What is the traditional Filipino dish ‘kinilaw’?"

# Retrieve documents using hybrid retriever
initial_results = ensemble_retriever.get_relevant_documents(query)

# Rerank using Cross-Encoder
reranked = rerank_with_cross_encoder(query, initial_results, top_n=2)

for i, (score, doc) in enumerate(reranked, start=1):
    print(f"\nRank {i} | Score: {score:.4f}")
    print("-" * 60)
    print(doc.page_content.strip()[:300])
    print("-" * 60)


Rank 1 | Score: 0.3078
------------------------------------------------------------
. Kinilaw is another preserving process that has produced an appetizing dish that is the Filipino version of the Spanish seviche
------------------------------------------------------------

Rank 2 | Score: 0.2302
------------------------------------------------------------
. This method, known as kinilaw, also makes use of the abundant seafood of the country and adds variety to the Filipinos  taste choices
------------------------------------------------------------
