In [22]:
import pandas as pd
import numpy as np
import gc
import os
import re
import sys
import time
import fitz
import json
import torch
import shutil
import random
import datetime
import requests
import multiprocessing
from tqdm import tqdm
from statistics import mean
from fuzzywuzzy import fuzz
from collections import Counter
from typing import List, Dict, Any, Tuple, Optional
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from sentence_transformers import CrossEncoder
from sklearn.metrics import precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from concurrent.futures import ThreadPoolExecutor, as_completed

In [None]:
input_dir = "knowledge_base"
output_dir = "data"
result_dir = "results"

# Create output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    print(f"[INFO] Created directory: {output_dir}")

# Create result directory if it doesn't exist
if not os.path.exists(result_dir):
    os.makedirs(result_dir)
    print(f"[INFO] Created directory: {result_dir}")

In [24]:
#Text extraction from PDF files
for filename in os.listdir(input_dir):
    if filename.lower().endswith(".pdf"):
        pdf_path = os.path.join(input_dir, filename)
        json_filename = os.path.splitext(filename)[0] + ".json"
        json_path = os.path.join(output_dir, json_filename)

        # Skip if already converted
        if os.path.exists(json_path):
            print(f"[INFO] Skipping {filename} – already converted.")
            continue

        print(f"[INFO] Extracting: {filename}")
        doc = fitz.open(pdf_path)
        data = {}

        for page_number in tqdm(range(len(doc)), desc=f"Processing {filename}"):
            page = doc[page_number]
            text = page.get_text().strip()
            if text:
                data[f"page_{page_number + 1}"] = {
                    "page": page_number + 1,
                    "content": text
                }

        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

        print(f"[INFO] Saved to: {json_path}")

[INFO] Skipping A-History-Of-The-Philippines.pdf – already converted.


In [25]:
# Define preprocessing function
def clean_text(text):
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text)
    
    # Fix ellipses or multiple punctuation
    text = re.sub(r'\.{3,}', '.', text)
    text = re.sub(r'\s+\.', '.', text)
    
    # Remove stray characters
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    text = text.strip()
    
    return text

In [26]:
data_dir = "data"

# Process all JSON files
for filename in os.listdir(data_dir):
    if filename.lower().endswith(".json"):
        json_path = os.path.join(data_dir, filename)
        print(f"[INFO] Preprocessing: {filename}")
        
        # Load existing data
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        
        # Clean text for each page
        for key in tqdm(data, desc=f"Cleaning {filename}"):
            if "content" in data[key]:
                data[key]["content"] = clean_text(data[key]["content"])
        
        # Overwrite file
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        
        print(f"[INFO] Finished cleaning: {filename}")

[INFO] Preprocessing: A-History-Of-The-Philippines.json


Cleaning A-History-Of-The-Philippines.json: 100%|██████████| 356/356 [00:00<00:00, 7955.74it/s]

[INFO] Finished cleaning: A-History-Of-The-Philippines.json





In [27]:
data_dir = "data"
model_name = "gemma3:latest"
ollama_url = "http://localhost:11434/api/generate"

In [28]:
def classify_with_gemma(text, model=model_name):
    prompt = (
        f"Given the following page content:\n\n\"\"\"\n{text.strip()[:1500]}\n\"\"\"\n\n"
        "Categorize this page broadly in one word, and generate 2 to 3 relevant tags. "
        "Return only this format:\n"
        "Category: <category>\nTags: <comma-separated tags>"
    )

    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False
    }

    try:
        response = requests.post(ollama_url, json=payload)
        if response.status_code == 200:
            return response.json()["response"]
        else:
            print(f"[ERROR] Ollama responded with status {response.status_code}")
            return None
    except Exception as e:
        print(f"[ERROR] Ollama request failed: {e}")
        return None

In [29]:
# Classify a single page
def classify_and_update(page_id, entry):
    text = entry.get("content", "").strip()
    if not text or "category" in entry:
        return page_id, None
    
    response = classify_with_gemma(text)
    if response:
        lines = response.strip().split("\n")
        category, tags = "unknown", []
        for line in lines:
            if line.lower().startswith("category:"):
                category = line.split(":", 1)[1].strip()
            elif line.lower().startswith("tags:"):
                tags = [tag.strip() for tag in line.split(":", 1)[1].split(",")]
        
        entry["category"] = category
        entry["tags"] = tags
        return page_id, entry
    return page_id, None

In [30]:
physical_cores = multiprocessing.cpu_count() // 2
max_threads = min(physical_cores * 2, 32)

print(f"[INFO] Using max_workers = {max_threads}")

[INFO] Using max_workers = 12


In [31]:
# Apply classification to all pages
for filename in os.listdir(data_dir):
    if filename.endswith(".json"):
        path = os.path.join(data_dir, filename)
        print(f"\n[INFO] Adding Gemma metadata to: {filename}")

        with open(path, "r", encoding="utf-8") as f:
            doc = json.load(f)

        # Multithreading for pages in the doc
        with ThreadPoolExecutor(max_workers=max_threads) as executor:
            futures = [executor.submit(classify_and_update, pid, doc[pid]) for pid in doc]
            for future in tqdm(futures, desc=f"Classifying {filename}"):
                page_id, result = future.result()
                if result:
                    doc[page_id] = result

        # Save updated doc
        with open(path, "w", encoding="utf-8") as f:
            json.dump(doc, f, ensure_ascii=False, indent=2)

        print(f"[INFO] Updated and saved: {filename}")


[INFO] Adding Gemma metadata to: A-History-Of-The-Philippines.json


Classifying A-History-Of-The-Philippines.json: 100%|██████████| 356/356 [00:00<00:00, 1089906.73it/s]

[INFO] Updated and saved: A-History-Of-The-Philippines.json





In [32]:
# Dense Embeddings (BGE-M3)
embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-m3",
    encode_kwargs={"normalize_embeddings": True}
)

In [33]:
# Chunking
splitter = RecursiveCharacterTextSplitter(
    chunk_size=256,
    chunk_overlap=0,
    separators=["\n\n", "\n", ".", "?", "!", " ", ""]
)

In [34]:
chunks = []
metadatas = []
tag_counter = Counter()

# Loop through each cleaned JSON file
for filename in os.listdir(data_dir):
    if filename.endswith(".json"):
        file_path = os.path.join(data_dir, filename)
        print(f"[INFO] Chunking file: {filename}")

        with open(file_path, "r", encoding="utf-8") as f:   
            doc = json.load(f)

        for key, entry in tqdm(doc.items(), desc=f"Chunking {filename}"):
            text = entry.get("content", "")
            if not text.strip():
                continue

            split_chunks = splitter.split_text(text)

            # Get and clean category
            category = entry.get("category", "unknown")
            category = str(category).strip()

            # Get and clean tags
            raw_tags = entry.get("tags", [])
            if not isinstance(raw_tags, list):
                raw_tags = [raw_tags]

            tags = [str(tag).strip() for tag in raw_tags if isinstance(tag, (str, int, float, bool))]
            tags = [tag for tag in tags if tag]
            tag_counter.update(tags)

            for i, chunk in enumerate(split_chunks):
                if len(chunk.split()) <= 10:
                    continue
                
                chunks.append(chunk)
                metadatas.append({
                    "source": filename,
                    "page": entry.get("page", key),
                    "chunk_id": f"{key}_chunk_{i}",
                    "filename": filename,
                    "category": category,
                    "tags": ", ".join(tags)
                })

print(f"[INFO] Total chunks: {len(chunks)}")

[INFO] Chunking file: A-History-Of-The-Philippines.json


Chunking A-History-Of-The-Philippines.json: 100%|██████████| 356/356 [00:00<00:00, 10214.68it/s]

[INFO] Total chunks: 4222





In [35]:
# Batch insert initialization
batch_size = 256

def embed_batch(batch_texts):
    return embedding_model.embed_documents(batch_texts)

batches = [(chunks[i:i+batch_size], metadatas[i:i+batch_size])
           for i in range(0, len(chunks), batch_size)]

In [36]:
# Parallel embedding and collection
embedded_batches = []
with ThreadPoolExecutor(max_workers=max_threads) as executor:
    futures = {executor.submit(embed_batch, texts): (texts, metas)
               for texts, metas in batches}

    for future in tqdm(as_completed(futures), total=len(futures), desc="Embedding"):
        try:
            embeddings = future.result()
            texts, metas = futures[future]
            embedded_batches.append((texts, metas, embeddings))
        except Exception as e:
            print(f"[ERROR] Failed to embed batch: {e}")

Embedding: 100%|██████████| 17/17 [06:18<00:00, 22.25s/it]  


In [38]:
# Clear previous DB
if os.path.exists("./chroma_db"):
    try:
        # Delete existing vectorstore if already defined
        if 'vectorstore' in locals():
            del vectorstore
            gc.collect()

        shutil.rmtree("./chroma_db")
        print("[INFO] Successfully removed previous Chroma DB.")
    except Exception as e:
        print(f"[WARN] Could not fully clean Chroma DB: {e}")
else:
    print("[INFO] No existing Chroma DB to remove.")

[WARN] Could not fully clean Chroma DB: [WinError 32] The process cannot access the file because it is being used by another process: './chroma_db\\bf452668-cc3a-4df7-bc04-9e69d0f8f8e0\\data_level0.bin'


In [40]:
# Initialize Chroma
vectorstore = Chroma(
    embedding_function=embedding_model,
    persist_directory="./chroma_db",
    collection_name="filipino_culture"
)

In [42]:
# Add batches to vectorstore
for texts, metas, embeds in tqdm(embedded_batches, desc="Indexing into Chroma"):
    vectorstore.add_texts(texts=texts, metadatas=metas, embeddings=embeds)

print("[INFO] Finished indexing Chroma vector store.")

Indexing into Chroma: 100%|██████████| 17/17 [07:54<00:00, 27.93s/it]

[INFO] Finished indexing Chroma vector store.





In [43]:
# BM25 Retriever
bm25_retriever = BM25Retriever.from_texts(chunks)
bm25_retriever.k = 3

In [45]:
def load_eval_data(
    file_path: str, 
    randomize: bool = False, 
    limit: Optional[int] = None
) -> List[Dict[str, Any]]:
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    if randomize:
        random.shuffle(data)

    if limit is not None:
        data = data[:limit]

    return data

In [46]:
def jaccard_similarity(a: str, b: str) -> float:
    a_tokens = set(a.lower().split())
    b_tokens = set(b.lower().split())

    if not a_tokens or not b_tokens:
        return 0.0
    
    intersection = a_tokens.intersection(b_tokens)
    union = a_tokens.union(b_tokens)
    
    return len(intersection) / len(union)

In [47]:
def is_relevant(ground_truth: str, doc_content: str, threshold: float = 50, jaccard_threshold: float = 0.3) -> bool:
    ground_truth = ground_truth.lower().strip()
    doc_content = doc_content.lower().strip()

    # Exact substring match
    if ground_truth in doc_content or doc_content in ground_truth:
        return True

    # Bi-directional fuzzy match
    similarity_1 = fuzz.partial_ratio(ground_truth, doc_content)
    similarity_2 = fuzz.partial_ratio(doc_content, ground_truth)
    if max(similarity_1, similarity_2) >= threshold:
        return True
    
    # Jaccard similarity
    jaccard = jaccard_similarity(ground_truth, doc_content)
    if jaccard >= jaccard_threshold:
        return True
    
    return False

In [48]:
def classify_query_with_gemma(question: str) -> dict:
    prompt = (
        f"Given the following question:\n\n\"{question}\"\n\n"
        "Classify this question with:\n"
        "Category: <a broad category in one word>\n"
        "Tags: <comma-separated 2 to 3 relevant keywords>"
    )
    payload = {
        "model": model_name,
        "prompt": prompt,
        "stream": False
    }
    response = requests.post(ollama_url, json=payload)
    if response.status_code == 200:
        output = response.json()["response"]
        category, tags = "unknown", []
        for line in output.strip().split("\n"):
            if line.lower().startswith("category:"):
                category = line.split(":", 1)[1].strip().strip("*")
            elif line.lower().startswith("tags:"):
                tags = [t.strip().strip("*") for t in line.split(":", 1)[1].split(",")]
        return {"category": category, "tags": tags}
    else:
        print(f"[ERROR] Failed to classify: {question}")
        return {"category": "unknown", "tags": []}

In [63]:
def retrieve_by_metadata(query: str,
                         category: str,
                         tags: List[str],
                         top_k: int = 3,
                         verbose = False,
                         category_threshold: float = 0.6,
                         fuzzy_tag_threshold: int = 80) -> List[Any]:

    category_embedding = embedding_model.embed_query(category)
    candidate_docs = vectorstore.similarity_search(query=query, k=50)

    filtered_docs = []
    seen = set()

    category_pass = 0
    tag_pass = 0
    total_skipped_no_meta = 0
    total_seen = 0

    for doc in candidate_docs:
        total_seen += 1
        doc_meta = doc.metadata or {}

        # --- Category similarity check ---
        doc_cat = doc_meta.get("category", "").strip()
        if not doc_cat:
            total_skipped_no_meta += 1
            continue

        doc_cat_embedding = embedding_model.embed_query(doc_cat)
        cat_sim = cosine_similarity([category_embedding], [doc_cat_embedding])[0][0]

        if cat_sim < category_threshold:
            continue
        category_pass += 1

        # --- Tag fuzzy matching (allow partial matches) ---
        doc_tags = doc_meta.get("tags", "")
        if isinstance(doc_tags, str):
            doc_tags = [t.strip() for t in doc_tags.split(",") if t.strip()]
        if not doc_tags:
            continue

        tag_scores = [fuzz.partial_ratio(tag.lower(), dt.lower())
                      for tag in tags for dt in doc_tags]
        if not tag_scores:
            continue

        max_tag_score = max(tag_scores)
        avg_top3_score = mean(sorted(tag_scores, reverse=True)[:3])

        if max_tag_score < fuzzy_tag_threshold and avg_top3_score < (fuzzy_tag_threshold - 10):
            continue
        tag_pass += 1

        # Deduplicate by content
        snippet = doc.page_content[:50].strip().lower()
        if snippet in seen:
            continue

        seen.add(snippet)
        filtered_docs.append(doc)
        if len(filtered_docs) >= top_k:
            break

    # --- Logging ---
    if verbose:
        print(f"\n[INFO] Retrieved: {len(candidate_docs)} Top: candidates for query: \"{query[:128]}...\"")
        print(f"[INFO] Skipped (missing metadata): {total_skipped_no_meta}")
        print(f"[INFO] Passed category filter: {category_pass}")
        print(f"[INFO] Passed tag filter: {tag_pass}")
        print(f"[INFO] Final top-k after deduplication: {len(filtered_docs)}\n")

    return filtered_docs

In [101]:
def evaluate_retriever(
    eval_data: list,
    reranker,
    k=3,
    fuzzy_threshold=70,
    jaccard_threshold=0.4,
    doc_content_key="page_content"
):
    import sys

    hits = 0
    relevant_docs_total = 0
    retrieved_docs_total = 0
    reciprocal_ranks = []
    results_flat = []

    os.makedirs("results", exist_ok=True)
    timestamp = datetime.datetime.now().strftime("%m-%d-%Y_%H-%M-%S")
    csv_path = f"results/result_{timestamp}.csv"

    tqdm_params = dict(
        desc="Evaluating",
        dynamic_ncols=True,
        file=sys.stdout,
        leave=True,
        mininterval=900.0   # 15 minutes
    )

    for idx, sample in enumerate(tqdm(eval_data, **tqdm_params)):
        question = sample["question"]
        ground_truth = sample["answer"]

        classification = classify_query_with_gemma(question)
        category = classification["category"]
        tags = classification["tags"]

        if not category or category.lower() == "unknown":
            print(f"[WARN] Question {idx+1} got weak category from Gemma: {question[:60]}...")

        try:
            initial_docs = retrieve_by_metadata(question, category, tags, top_k=k, verbose=False)
            if not initial_docs:
                reciprocal_ranks.append(0)
                results_flat.append({
                    "item": idx + 1,
                    "query": question,
                    "ground_truth": ground_truth,
                    "category": category,
                    "tags": ", ".join(tags),
                    "rank": 1,
                    "score": "N/A",
                    "found": False,
                    "doc_content": "[NO DOCUMENTS RETRIEVED]"
                })
                continue

            reranked = reranker(question, initial_docs, top_n=k)
            if not reranked:
                reciprocal_ranks.append(0)
                results_flat.append({
                    "item": idx + 1,
                    "query": question,
                    "ground_truth": ground_truth,
                    "category": category,
                    "tags": ", ".join(tags),
                    "rank": 1,
                    "score": "N/A",
                    "found": False,
                    "doc_content": "[NO DOCUMENTS RERANKED]"
                })
                continue

            found = False
            retrieved_docs_total += len(reranked)

            for rank, (score, doc) in enumerate(reranked):
                doc_content = getattr(doc, doc_content_key, doc)
                if isinstance(doc_content, dict):
                    doc_content = doc_content.get("content", "")

                if is_relevant(ground_truth, doc_content, fuzzy_threshold, jaccard_threshold):
                    relevant_docs_total += 1
                    if not found:
                        hits += 1
                        reciprocal_ranks.append(1 / (rank + 1))
                        found = True

                results_flat.append({
                    "item": idx + 1,
                    "query": question,
                    "ground_truth": ground_truth,
                    "category": category,
                    "tags": ", ".join(tags),
                    "rank": rank + 1,
                    "score": f"{score:.4f}",
                    "found": found,
                    "doc_content": doc_content[:500]
                })

            # If nothing relevant was found, log rank 1 again with found=False
            if not found:
                reciprocal_ranks.append(0)
                if reranked:
                    score, doc = reranked[0]
                    doc_content = getattr(doc, doc_content_key, doc)
                    if isinstance(doc_content, dict):
                        doc_content = doc_content.get("content", "")

                    results_flat.append({
                        "item": idx + 1,
                        "query": question,
                        "ground_truth": ground_truth,
                        "category": category,
                        "tags": ", ".join(tags),
                        "rank": 1,
                        "score": f"{score:.4f}",
                        "found": False,
                        "doc_content": doc_content[:500]
                    })

        except Exception as e:
            reciprocal_ranks.append(0)
            results_flat.append({
                "item": idx + 1,
                "query": question,
                "ground_truth": ground_truth,
                "category": category,
                "tags": ", ".join(tags),
                "rank": "error",
                "score": "N/A",
                "found": False,
                "doc_content": f"[ERROR] {str(e)}"
            })

    # Save output
    df = pd.DataFrame(results_flat)
    df.to_csv(csv_path, index=False, encoding='utf-8')
    print(f"\n[INFO] Saved CSV results to: {csv_path}")

    total_queries = len(eval_data)
    metrics = {
        f"Recall@{k}": hits / total_queries if total_queries > 0 else 0.0,
        f"Precision@{k}": relevant_docs_total / retrieved_docs_total if retrieved_docs_total > 0 else 0.0,
        f"MRR@{k}": np.mean(reciprocal_ranks) if reciprocal_ranks else 0.0
    }

    return metrics, csv_path

In [102]:
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-12-v2")

def rerank_with_cross_encoder(query, docs, top_n=3, verbose=False):
    pairs = [[query, doc.page_content] for doc in docs]
    scores = cross_encoder.predict(pairs)
    normalized_scores = [score / (len(doc.page_content.split()) + 1) for score, doc in zip(scores, docs)]
    scored_docs = list(zip(normalized_scores, docs))
    scored_docs.sort(key=lambda x: x[0], reverse=True)

    if verbose:
        for i, (score, doc) in enumerate(scored_docs[:top_n], start=1):
            print(f"\nRank {i} Score: {score:.4f}")
            print(doc.page_content[:300] + "...")
            print("-" * 60)

    return scored_docs[:top_n] 

In [103]:
eval_data = load_eval_data("evaluation.json", randomize=False, limit=3)

results, csv_path = evaluate_retriever(
    eval_data=eval_data,
    reranker=rerank_with_cross_encoder,
    k=3,
    fuzzy_threshold=70,
    jaccard_threshold = 0.4,
    doc_content_key='page_content'
)

Evaluating: 100%|██████████| 3/3 [00:33<00:00, 11.06s/it]

[INFO] Saved CSV results to: results/result_06-27-2025_20-51-44.csv


In [104]:
print("\nEvaluation Results:")
for metric, value in results.items():
    print(f"{metric}: {value:.4f}")


Evaluation Results:
Recall@3: 0.3333
Precision@3: 0.1667
MRR@3: 0.3333


# View Data Frame

In [105]:
df = pd.read_csv(csv_path)

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)

df.head(5)

Unnamed: 0,item,query,ground_truth,category,tags,rank,score,found,doc_content
0,1,Who is the national hero of the Philippines?,Jose Rizal,History,"Philippines, Hero, Nationalism",1,0.176,True,". The nineteenth-century revolutionary General Artemio Ricarte once proposed naming the country the Rizaline Islands, after its foremost national hero, Jos Rizal (with Filipinos henceforth to be known as Rizalinos)"
1,1,Who is the national hero of the Philippines?,Jose Rizal,History,"Philippines, Hero, Nationalism",2,0.0311,True,". Aguinaldo declared the independence of the Philippines on June 12 Commodore Dewey was invited to the momentous occasion but he begged off, saying it was mail day and for the first time, the Philippine flag was displayed and the national anthem sung"
2,1,Who is the national hero of the Philippines?,Jose Rizal,History,"Philippines, Hero, Nationalism",3,-0.1318,True,". Supportive of the Japanese, he was a living symbol of the Filipino struggle for independence"
3,2,Ano ang ibig sabihin ng ‘bayanihan’ sa kulturang Pilipino?,"Ang bayanihan ay ang tradisyon ng pagtutulungan at pagkakaisa sa komunidad, tulad ng sama-samang paglilipat ng bahay-kubo.",Culture,"Filipino, Tradition, Community",1,-0.2663,False,". Filipino immigrant communities formed mostly bachelor societies women were discouraged from emigrating, primarily to forestall the formation of families and the putting down of roots and would essentially remain so until the end of World War II"
4,2,Ano ang ibig sabihin ng ‘bayanihan’ sa kulturang Pilipino?,"Ang bayanihan ay ang tradisyon ng pagtutulungan at pagkakaisa sa komunidad, tulad ng sama-samang paglilipat ng bahay-kubo.",Culture,"Filipino, Tradition, Community",2,-0.268,False,". In this in-between state, they were defined as Philippine citizens, under the protection of the U.S., free to enter the country and look for work. On the other hand, they couldn t vote, own land, or marry white women"


# SINGLE QUERIES

In [106]:
# Test Output
query = "What is the most famous Filipino dish?"

In [107]:
# Classify with Gemma
classification = classify_query_with_gemma(query)
category = classification["category"]
tags = classification["tags"]

print(f"[INFO] Category: {category}")
print(f"[INFO] Tags: {tags}")

[INFO] Category: Food
[INFO] Tags: ['Filipino cuisine', 'Adobo', 'Adobo recipes']


In [108]:
# Retrieve documents by category and tags
initial_docs = retrieve_by_metadata(query, category, tags, top_k=10, verbose=True)

if not initial_docs:
    print("[WARN] No documents found after filtering.")
else:
    reranked = rerank_with_cross_encoder(query, initial_docs, top_n=3)


[INFO] Retrieved: 50 Top: candidates for query: "What is the most famous Filipino dish?..."
[INFO] Skipped (missing metadata): 0
[INFO] Passed category filter: 2
[INFO] Passed tag filter: 0
[INFO] Final top-k after deduplication: 0

[WARN] No documents found after filtering.


In [109]:
# Print output
for i, (score, doc) in enumerate(reranked, start=1):
    print(f"\nRank {i} | Score: {score:.4f}")
    print("-" * 60)
    print(doc.page_content.strip()[:300])
    print("-" * 60)


Rank 1 | Score: -0.1060
------------------------------------------------------------
. Unable to attend to their fields, the huddled population suffered an inordinate rate of death due to malnutrition and disease, such as dengue fever and malaria. In Batangas alone, 54,000 civilians died by the end of 1901
------------------------------------------------------------

Rank 2 | Score: -0.1207
------------------------------------------------------------
. Thence, Islam spread to Maranao territory (around Lake Danao), inexplicably skipped the Visayas for the most part, and sailed north to Mindoro Island and Maynila and Batangas in Luzon, and west to southern Palawan
------------------------------------------------------------

Rank 3 | Score: -0.2222
------------------------------------------------------------
.) The very nature of the archipelago, with more water than land, meant the existence of numerous tribal groups, other than the dominant ones the Tagalogs, the Kapampangans, the Ilo