In [None]:
import pandas as pd
import numpy as np
import gc
import os
import re
import sys
import time
import fitz
import json
import torch
import shutil
import random
import datetime
import requests
import multiprocessing
from tqdm import tqdm
from statistics import mean
from fuzzywuzzy import fuzz
from collections import Counter
from typing import List, Dict, Any, Tuple, Optional
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.retrievers import EnsembleRetriever
from sentence_transformers import CrossEncoder
from sklearn.metrics import precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from concurrent.futures import ThreadPoolExecutor, as_completed

In [None]:
# Directory names used by the pipeline, relative to the working directory.
# input_dir is scanned for PDFs; data_dir receives the per-PDF JSON files;
# result_dir and html_dir are created by the setup cell.
input_dir, data_dir = "knowledge_base", "data"
result_dir, html_dir = "results", "html"

In [None]:
# Create the data, result and html directories when missing, reporting only
# the ones that actually had to be created.
for output_dir in (data_dir, result_dir, html_dir):
    if os.path.exists(output_dir):
        continue
    os.makedirs(output_dir)
    print(f"[INFO] Created directory: {output_dir}")

In [None]:
# Text extraction from PDF files
# Every PDF in input_dir is written to data_dir/<stem>.json as a mapping of
# "page_<n>" -> {"page": n, "content": text}. Pages whose stripped text is
# empty are omitted, and PDFs that already have a JSON file are skipped.
for filename in os.listdir(input_dir):
    if not filename.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(input_dir, filename)
    json_filename = os.path.splitext(filename)[0] + ".json"
    json_path = os.path.join(data_dir, json_filename)

    # Skip if already converted
    if os.path.exists(json_path):
        print(f"[INFO] Skipping {filename} – already converted.")
        continue

    print(f"[INFO] Extracting: {filename}")
    data = {}

    # Open the PDF as a context manager so its file handle is released even
    # when a page fails to parse (the handle was previously never closed).
    with fitz.open(pdf_path) as doc:
        for page_number in tqdm(range(len(doc)), desc=f"Processing {filename}"):
            page = doc[page_number]
            text = page.get_text().strip()
            if text:
                data[f"page_{page_number + 1}"] = {
                    "page": page_number + 1,
                    "content": text
                }

    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print(f"[INFO] Saved to: {json_path}")

In [None]:
# Define preprocessing function
def clean_text(text):
    """Normalize extracted page text to compact, ASCII-only prose.

    Steps, in order:
      1. replace every run of non-ASCII characters with a single space;
      2. collapse every run of whitespace into one space;
      3. reduce runs of three or more periods to a single period;
      4. remove whitespace that directly precedes a period;
      5. strip leading and trailing whitespace.

    Non-ASCII removal runs first on purpose: it inserts spaces, so doing it
    after whitespace normalization (as before) left double spaces and
    " ." sequences in the output.

    Args:
        text: Raw text of one page.

    Returns:
        The cleaned string.
    """
    # Remove stray (non-ASCII) characters; the gaps are collapsed below.
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text)

    # Fix ellipses or multiple punctuation
    text = re.sub(r'\.{3,}', '.', text)
    text = re.sub(r'\s+\.', '.', text)

    return text.strip()

In [None]:
data_dir = "data"

# Clean the "content" of every page in every JSON file under data_dir and
# write the result back to the same file.
for filename in os.listdir(data_dir):
    if not filename.lower().endswith(".json"):
        continue

    json_path = os.path.join(data_dir, filename)
    print(f"[INFO] Preprocessing: {filename}")

    # Load existing data
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Clean text for each page; entries without "content" are left untouched
    for key in tqdm(data, desc=f"Cleaning {filename}"):
        page_entry = data[key]
        if "content" in page_entry:
            page_entry["content"] = clean_text(page_entry["content"])

    # Overwrite file
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print(f"[INFO] Finished cleaning: {filename}")

In [None]:
# Ollama endpoint and model used by the classification helpers, plus the
# directory holding the per-PDF JSON files.
ollama_url = "http://localhost:11434/api/generate"
model_name = "gemma3:latest"
data_dir = "data"

In [None]:
def classify_with_gemma(text, model=model_name, timeout=300):
    """Ask the Ollama model for a one-word category and 2-3 tags for a page.

    Only the first 1500 characters of the stripped text are sent.

    Args:
        text: Page content to classify.
        model: Ollama model name; defaults to the notebook-level model_name.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled server would block the calling
            worker thread forever.

    Returns:
        The raw "response" string from Ollama (expected to hold
        "Category: ..." and "Tags: ..." lines), or None when the request
        fails, times out, or returns a non-200 status.
    """
    prompt = (
        f"Given the following page content:\n\n\"\"\"\n{text.strip()[:1500]}\n\"\"\"\n\n"
        "Categorize this page broadly in one word, and generate 2 to 3 relevant tags. "
        "Return only this format:\n"
        "Category: <category>\nTags: <comma-separated tags>"
    )

    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False
    }

    try:
        response = requests.post(ollama_url, json=payload, timeout=timeout)
        if response.status_code == 200:
            return response.json()["response"]
        else:
            print(f"[ERROR] Ollama responded with status {response.status_code}")
            return None
    except Exception as e:
        # Best-effort: connection errors, timeouts and malformed payloads all
        # degrade to "no classification" instead of aborting the batch.
        print(f"[ERROR] Ollama request failed: {e}")
        return None

In [None]:
# Classify a single page
def classify_and_update(page_id, entry):
    """Classify one page and store the category and tags on its entry.

    Pages with empty content, or that already carry a "category", are skipped.

    Args:
        page_id: Key of the page inside its document dict.
        entry: The page dict; "category" and "tags" are written to it in
            place when classification succeeds.

    Returns:
        (page_id, entry) when the page was classified, otherwise
        (page_id, None).
    """
    text = entry.get("content", "").strip()
    if not text or "category" in entry:
        return page_id, None

    response = classify_with_gemma(text)
    if not response:
        return page_id, None

    category, tags = "unknown", []
    for line in response.strip().split("\n"):
        # The model sometimes wraps labels or values in markdown emphasis
        # ("**Category:** Food"); drop the asterisks before parsing so those
        # lines are recognised and the stored values stay clean.
        label, _, value = line.replace("*", "").partition(":")
        label = label.strip().lower()
        if label == "category":
            category = value.strip()
        elif label == "tags":
            # Skip empty items produced by trailing or doubled commas.
            tags = [tag.strip() for tag in value.split(",") if tag.strip()]

    entry["category"] = category
    entry["tags"] = tags
    return page_id, entry

In [None]:
# Determine number of threads based on CPU cores
# cpu_count() // 2 evaluates to 0 on a single-core machine, and
# ThreadPoolExecutor rejects max_workers=0, so clamp to at least one core.
physical_cores = max(1, multiprocessing.cpu_count() // 2)
max_threads = min(physical_cores * 2, 32)

print(f"[INFO] Using max_workers = {max_threads}")

In [None]:
# Apply classification to all pages
# Each JSON file is loaded, its pages are classified concurrently, and the
# document is written back to the same path.
for filename in os.listdir(data_dir):
    if not filename.endswith(".json"):
        continue

    path = os.path.join(data_dir, filename)
    print(f"\n[INFO] Adding Gemma metadata to: {filename}")

    with open(path, "r", encoding="utf-8") as f:
        doc = json.load(f)

    # Multithreading for pages in the doc: one task per page, results are
    # consumed in submission order.
    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        futures = [
            executor.submit(classify_and_update, pid, page_entry)
            for pid, page_entry in doc.items()
        ]
        for future in tqdm(futures, desc=f"Classifying {filename}"):
            page_id, result = future.result()
            if result:
                doc[page_id] = result

    # Save updated doc
    with open(path, "w", encoding="utf-8") as f:
        json.dump(doc, f, ensure_ascii=False, indent=2)

    print(f"[INFO] Updated and saved: {filename}")

In [None]:
# Dense Embeddings (BGE-M3)
# The encode options request normalized embedding vectors from the model.
embedding_encode_kwargs = {"normalize_embeddings": True}
embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-m3",
    encode_kwargs=embedding_encode_kwargs,
)

In [None]:
# Chunking
# Separators are tried in the listed order: paragraph breaks, line breaks,
# sentence punctuation, spaces, and finally the empty string.
chunk_separators = ["\n\n", "\n", ".", "?", "!", " ", ""]
splitter = RecursiveCharacterTextSplitter(
    separators=chunk_separators,
    chunk_size=256,
    chunk_overlap=0,
)

In [None]:
# Parallel lists: chunks[i] is the text described by metadatas[i].
chunks = []
metadatas = []
tag_counter = Counter()

# Loop through each cleaned JSON file
for filename in os.listdir(data_dir):
    if not filename.endswith(".json"):
        continue

    file_path = os.path.join(data_dir, filename)
    print(f"[INFO] Chunking file: {filename}")

    with open(file_path, "r", encoding="utf-8") as f:
        doc = json.load(f)

    for key, entry in tqdm(doc.items(), desc=f"Chunking {filename}"):
        text = entry.get("content", "")
        if not text.strip():
            continue

        split_chunks = splitter.split_text(text)

        # Get and clean category
        category = str(entry.get("category", "unknown")).strip()

        # Get and clean tags: a lone value is wrapped in a list, only scalar
        # values are kept, and blanks are dropped after stripping.
        raw_tags = entry.get("tags", [])
        if not isinstance(raw_tags, list):
            raw_tags = [raw_tags]

        stripped_tags = (
            str(tag).strip()
            for tag in raw_tags
            if isinstance(tag, (str, int, float, bool))
        )
        tags = [tag for tag in stripped_tags if tag]
        tag_counter.update(tags)

        for i, chunk in enumerate(split_chunks):
            # Keep only chunks with more than 10 whitespace-separated words.
            if len(chunk.split()) > 10:
                chunks.append(chunk)
                metadatas.append({
                    "source": filename,
                    "page": entry.get("page", key),
                    "chunk_id": f"{key}_chunk_{i}",
                    "filename": filename,
                    "category": category,
                    "tags": ", ".join(tags)
                })

print(f"[INFO] Total chunks: {len(chunks)}")

In [None]:
# Batch insert initialization
batch_size = 256

def embed_batch(batch_texts):
    """Return the embeddings of one batch of chunk texts."""
    return embedding_model.embed_documents(batch_texts)

# Slice chunks and metadatas in lockstep so every batch keeps them aligned.
batches = []
for batch_start in range(0, len(chunks), batch_size):
    batch_stop = batch_start + batch_size
    batches.append((chunks[batch_start:batch_stop], metadatas[batch_start:batch_stop]))

In [None]:
# Parallel embedding and collection
# Batches are appended in completion order; a batch whose embedding raises is
# reported and left out of embedded_batches.
embedded_batches = []
with ThreadPoolExecutor(max_workers=max_threads) as executor:
    futures = {}
    for texts, metas in batches:
        futures[executor.submit(embed_batch, texts)] = (texts, metas)

    for future in tqdm(as_completed(futures), total=len(futures), desc="Embedding"):
        try:
            embeddings = future.result()
        except Exception as e:
            print(f"[ERROR] Failed to embed batch: {e}")
        else:
            texts, metas = futures[future]
            embedded_batches.append((texts, metas, embeddings))

In [None]:
# Clear previous DB
# Any live `vectorstore` object is dropped and garbage-collected before the
# on-disk directory is deleted; failures are reported as warnings only.
if not os.path.exists("./chroma_db"):
    print("[INFO] No existing Chroma DB to remove.")
else:
    try:
        if 'vectorstore' in globals():
            del vectorstore
            gc.collect()
        shutil.rmtree("./chroma_db")
        print("[INFO] Successfully removed previous Chroma DB.")
    except Exception as e:
        print(f"[WARN] Could not fully clean Chroma DB: {e}")


In [None]:
# Initialize Chroma
# The store lives in ./chroma_db, the directory the cleanup cell removes.
chroma_settings = {
    "collection_name": "filipino_culture",
    "persist_directory": "./chroma_db",
    "embedding_function": embedding_model,
}
vectorstore = Chroma(**chroma_settings)

In [None]:
# Helper to generate stable unique IDs per chunk
def get_chunk_id(meta):
    """Build "<filename without .json>_<page>_<chunk_id>" from chunk metadata."""
    file_stem = meta['filename'].replace('.json', '')
    id_parts = (file_stem, meta['page'], meta['chunk_id'])
    return "_".join(str(part) for part in id_parts)

In [None]:
# Index using Chroma's internal .upsert
# The precomputed embeddings are passed in, and upserting by stable id lets
# the cell be re-run for the same chunks.
for texts, metas, embeds in tqdm(embedded_batches, desc="Indexing into Chroma"):
    ids = list(map(get_chunk_id, metas))

    vectorstore._collection.upsert(
        documents=texts,
        metadatas=metas,
        embeddings=embeds,
        ids=ids
    )

print("[INFO] Finished indexing Chroma vector store.")

In [None]:
def load_eval_data(
    file_path: str,
    randomize: bool = False,
    limit: Optional[int] = None,
    seed: Optional[int] = None
) -> List[Dict[str, Any]]:
    """Load evaluation samples from a JSON file.

    Args:
        file_path: Path to a JSON file whose root is a list of samples.
        randomize: Shuffle the samples before applying `limit`.
        limit: Keep only the first `limit` samples (after any shuffle).
        seed: Optional seed for the shuffle. When given, a dedicated
            generator is used so the same subset is produced on every run;
            when omitted, the global `random` state is used as before.

    Returns:
        The (optionally shuffled and truncated) list of samples.

    Raises:
        TypeError: If the JSON root is not a list. Shuffling and slicing
            assume a list, and other shapes would otherwise slip through
            when neither option is used.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if not isinstance(data, list):
        raise TypeError(
            f"Expected a JSON list of samples in {file_path}, got {type(data).__name__}"
        )

    if randomize:
        if seed is None:
            random.shuffle(data)
        else:
            random.Random(seed).shuffle(data)

    if limit is not None:
        data = data[:limit]

    return data

In [None]:
def jaccard_similarity(a: str, b: str) -> float:
    """Jaccard overlap of the lowercase, whitespace-split token sets of a and b.

    Returns 0.0 when either string has no tokens.
    """
    tokens_a, tokens_b = (set(text.lower().split()) for text in (a, b))

    if not (tokens_a and tokens_b):
        return 0.0

    shared = tokens_a & tokens_b
    combined = tokens_a | tokens_b
    return len(shared) / len(combined)

In [None]:
def is_relevant(ground_truth: str, doc_content: str, threshold: float = 50, jaccard_threshold: float = 0.3) -> bool:
    """Decide whether a retrieved chunk matches the expected answer.

    Both strings are lowercased and stripped, then three checks run in
    order; the first one that succeeds makes the chunk relevant:
      1. either string contains the other;
      2. fuzzy partial ratio (either direction) >= `threshold`;
      3. token-level Jaccard similarity >= `jaccard_threshold`.

    Args:
        ground_truth: Expected answer text.
        doc_content: Text of the retrieved chunk.
        threshold: Minimum fuzzy partial ratio to accept.
        jaccard_threshold: Minimum Jaccard similarity to accept.

    Returns:
        True when any check passes; False otherwise, including when either
        string is empty after stripping.
    """
    ground_truth = ground_truth.lower().strip()
    doc_content = doc_content.lower().strip()

    # "" is a substring of every string, so without this guard an empty
    # answer or an empty chunk would always be counted as relevant.
    if not ground_truth or not doc_content:
        return False

    # Exact substring match
    if ground_truth in doc_content or doc_content in ground_truth:
        return True

    # Bi-directional fuzzy match
    similarity_1 = fuzz.partial_ratio(ground_truth, doc_content)
    similarity_2 = fuzz.partial_ratio(doc_content, ground_truth)
    if max(similarity_1, similarity_2) >= threshold:
        return True

    # Jaccard similarity
    return jaccard_similarity(ground_truth, doc_content) >= jaccard_threshold

In [None]:
def classify_query_with_gemma(question: str) -> dict:
    """Ask the Ollama model for a category and tags describing a question.

    Args:
        question: The user question to classify.

    Returns:
        {"category": str, "tags": List[str]}. When the request fails, times
        out, returns a non-200 status, or yields an unusable payload, the
        fallback {"category": "unknown", "tags": []} is returned instead of
        raising, so a single bad request cannot abort a long evaluation run.
    """
    prompt = (
        f"Given the following question:\n\n\"{question}\"\n\n"
        "Classify this question with:\n"
        "Category: <a broad category in one word>\n"
        "Tags: <comma-separated 2 to 3 relevant keywords>"
    )
    payload = {
        "model": model_name,
        "prompt": prompt,
        "stream": False
    }

    try:
        # The timeout keeps a stalled server from blocking the caller forever.
        response = requests.post(ollama_url, json=payload, timeout=300)
        if response.status_code != 200:
            print(f"[ERROR] Failed to classify: {question}")
            return {"category": "unknown", "tags": []}
        output = response.json()["response"]
    except Exception as e:
        print(f"[ERROR] Failed to classify: {question} ({e})")
        return {"category": "unknown", "tags": []}

    category, tags = "unknown", []
    for line in output.strip().split("\n"):
        # Remove markdown emphasis first so "**Category:** Food" is parsed
        # the same way as "Category: **Food**".
        label, _, value = line.replace("*", "").partition(":")
        label = label.strip().lower()
        if label == "category":
            category = value.strip()
        elif label == "tags":
            # Skip empty items produced by trailing or doubled commas.
            tags = [t.strip() for t in value.split(",") if t.strip()]
    return {"category": category, "tags": tags}

In [None]:
def tag_embedding_match(tags: List[str], doc_tags: List[str], threshold: float = 0.6) -> bool:
    """Report whether any query tag is close enough to any document tag.

    Every tag on both sides is embedded, and the highest pairwise cosine
    similarity is compared with `threshold`. Returns False when either tag
    list is empty.
    """
    if not (tags and doc_tags):
        return False

    query_embeds = list(map(embedding_model.embed_query, tags))
    doc_embeds = list(map(embedding_model.embed_query, doc_tags))

    best_similarity = max(
        (
            cosine_similarity([query_vec], [doc_vec])[0][0]
            for query_vec in query_embeds
            for doc_vec in doc_embeds
        ),
        default=0.0,
    )
    return best_similarity >= threshold

In [None]:
def retrieve_by_metadata(query: str,
                         category: str,
                         tags: List[str],
                         top_k: int = 3,
                         verbose=False,
                         category_threshold: float = 0.6,
                         tag_sim_threshold: float = 0.6,
                         candidate_k: int = 50) -> List[Any]:
    """Retrieve chunks for a query and keep those whose metadata matches.

    A similarity search fetches `candidate_k` candidates. Each candidate is
    kept only if (1) it has a category whose embedding is similar enough to
    the query category, (2) at least one of its tags is similar enough to a
    query tag, and (3) the first 50 characters of its text have not been
    seen already. Scanning stops once `top_k` documents are collected.

    Args:
        query: The user question.
        category: Category predicted for the query.
        tags: Tags predicted for the query.
        top_k: Maximum number of documents to return.
        verbose: Print filter statistics when True.
        category_threshold: Minimum cosine similarity between categories.
        tag_sim_threshold: Minimum cosine similarity between tags.
        candidate_k: Number of candidates fetched before filtering
            (previously hard-coded to 50).

    Returns:
        The filtered documents, in the order the vector store returned them.
    """
    category_embedding = embedding_model.embed_query(category)
    candidate_docs = vectorstore.similarity_search(query=query, k=candidate_k)

    filtered_docs = []
    seen = set()

    # Candidates frequently repeat the same category and tag list, so the
    # similarity results are memoised for the duration of this call instead
    # of re-embedding identical strings for every candidate.
    category_sim_cache = {}
    tag_match_cache = {}

    category_pass = 0
    tag_pass = 0
    total_skipped_no_meta = 0

    for doc in candidate_docs:
        doc_meta = doc.metadata or {}
        doc_cat = doc_meta.get("category", "").strip()
        if not doc_cat:
            total_skipped_no_meta += 1
            continue

        if doc_cat not in category_sim_cache:
            doc_cat_embedding = embedding_model.embed_query(doc_cat)
            category_sim_cache[doc_cat] = cosine_similarity(
                [category_embedding], [doc_cat_embedding]
            )[0][0]
        if category_sim_cache[doc_cat] < category_threshold:
            continue
        category_pass += 1

        doc_tags = doc_meta.get("tags", "")
        if isinstance(doc_tags, str):
            doc_tags = [t.strip() for t in doc_tags.split(",") if t.strip()]

        # Lists are not hashable, so they are keyed by their tuple form.
        tag_key = tuple(doc_tags) if isinstance(doc_tags, (list, tuple)) else doc_tags
        if tag_key not in tag_match_cache:
            tag_match_cache[tag_key] = tag_embedding_match(
                tags, doc_tags, threshold=tag_sim_threshold
            )
        if not tag_match_cache[tag_key]:
            continue
        tag_pass += 1

        # Near-duplicate chunks are detected by their first 50 characters.
        snippet = doc.page_content[:50].strip().lower()
        if snippet in seen:
            continue

        seen.add(snippet)
        filtered_docs.append(doc)
        if len(filtered_docs) >= top_k:
            break

    if verbose:
        print(f"\n[INFO] Retrieved: {len(candidate_docs)} Top: candidates for query: \"{query[:128]}...\"")
        print(f"[INFO] Skipped (missing metadata): {total_skipped_no_meta}")
        print(f"[INFO] Passed category filter: {category_pass}")
        print(f"[INFO] Passed tag filter: {tag_pass}")
        print(f"[INFO] Final top-k after deduplication: {len(filtered_docs)}\n")

    return filtered_docs

In [None]:
def _eval_doc_text(doc, doc_content_key):
    """Return a reranked document's text, unwrapping dict payloads."""
    doc_content = getattr(doc, doc_content_key, doc)
    if isinstance(doc_content, dict):
        doc_content = doc_content.get("content", "")
    return doc_content


def _eval_result_row(item, question, ground_truth, category, tags,
                     rank, score, found, doc_content):
    """Build one row of the flat per-document evaluation log."""
    return {
        "item": item,
        "query": question,
        "ground_truth": ground_truth,
        "category": category,
        "tags": ", ".join(tags),
        "rank": rank,
        "score": score,
        "found": found,
        "doc_content": doc_content
    }


def evaluate_retriever(
    eval_data: list,
    reranker,
    k=3,
    fuzzy_threshold=70,
    jaccard_threshold=0.4,
    doc_content_key="page_content"
):
    """Run the classify -> retrieve -> rerank pipeline over an evaluation set.

    For every sample the question is classified, candidates are retrieved by
    metadata, reranked, and each reranked document is checked against the
    ground-truth answer with `is_relevant`. One row per inspected document is
    logged and written to results/<name>.csv and html/<name>.html.

    Args:
        eval_data: Samples with "question" and "answer" keys.
        reranker: Callable `(question, docs, top_n=...)` returning a list of
            `(score, doc)` pairs.
        k: Number of documents retrieved and reranked per question.
        fuzzy_threshold: Fuzzy-match threshold passed to `is_relevant`.
        jaccard_threshold: Jaccard threshold passed to `is_relevant`.
        doc_content_key: Attribute holding the text on each document.

    Returns:
        `(metrics, file_path)`: `metrics` maps "Recall@k" (share of questions
        with at least one relevant document), "Precision@k" (relevant
        documents / reranked documents) and "MRR@k" to floats; `file_path` is
        the output base name without directory or extension.
    """
    hits = 0
    relevant_docs_total = 0
    retrieved_docs_total = 0
    reciprocal_ranks = []
    results_flat = []

    # Both report directories are ensured here so the final writes cannot
    # fail on a missing folder after a long evaluation run.
    os.makedirs("results", exist_ok=True)
    os.makedirs("html", exist_ok=True)
    timestamp = datetime.datetime.now().strftime("%m-%d-%Y_%H-%M-%S")
    file_path = f"result_{timestamp}"
    csv_path = f"results/{file_path}.csv"

    tqdm_params = dict(
        desc="Evaluating",
        dynamic_ncols=True,
        file=sys.stdout,
        leave=True,
        mininterval=900.0   # 15 minutes
    )

    for idx, sample in enumerate(tqdm(eval_data, **tqdm_params)):
        question = sample["question"]
        ground_truth = sample["answer"]
        # Defaults keep the error row well-formed if classification fails.
        category, tags = "unknown", []

        try:
            # Classification runs inside the try block so a failed request is
            # recorded as an error row instead of aborting the evaluation.
            classification = classify_query_with_gemma(question)
            category = classification["category"]
            tags = classification["tags"]

            if not category or category.lower() == "unknown":
                print(f"[WARN] Question {idx+1} got weak category from Gemma: {question[:60]}...")

            initial_docs = retrieve_by_metadata(question, category, tags, top_k=k, verbose=False)
            if not initial_docs:
                reciprocal_ranks.append(0)
                results_flat.append(_eval_result_row(
                    idx + 1, question, ground_truth, category, tags,
                    1, "N/A", False, "[NO DOCUMENTS RETRIEVED]"
                ))
                continue

            reranked = reranker(question, initial_docs, top_n=k)
            if not reranked:
                reciprocal_ranks.append(0)
                results_flat.append(_eval_result_row(
                    idx + 1, question, ground_truth, category, tags,
                    1, "N/A", False, "[NO DOCUMENTS RERANKED]"
                ))
                continue

            found = False
            retrieved_docs_total += len(reranked)

            for rank, (score, doc) in enumerate(reranked):
                doc_content = _eval_doc_text(doc, doc_content_key)

                if is_relevant(ground_truth, doc_content, fuzzy_threshold, jaccard_threshold):
                    relevant_docs_total += 1
                    if not found:
                        hits += 1
                        reciprocal_ranks.append(1 / (rank + 1))
                        found = True

                # `found` is cumulative: it stays True on every row logged
                # after the first relevant document for this question.
                results_flat.append(_eval_result_row(
                    idx + 1, question, ground_truth, category, tags,
                    rank + 1, f"{score:.4f}", found, doc_content[:500]
                ))

            # If nothing relevant was found, log rank 1 again with found=False
            if not found:
                reciprocal_ranks.append(0)
                score, doc = reranked[0]
                doc_content = _eval_doc_text(doc, doc_content_key)
                results_flat.append(_eval_result_row(
                    idx + 1, question, ground_truth, category, tags,
                    1, f"{score:.4f}", False, doc_content[:500]
                ))

        except Exception as e:
            reciprocal_ranks.append(0)
            results_flat.append(_eval_result_row(
                idx + 1, question, ground_truth, category, tags,
                "error", "N/A", False, f"[ERROR] {str(e)}"
            ))

    # Save output
    df = pd.DataFrame(results_flat)

    df.to_csv(csv_path, index=False, encoding='utf-8')
    print(f"\n[INFO] Saved CSV results to: {csv_path}")

    df.to_html(f"html/{file_path}.html")
    print(f"[INFO] Saved HTML results to: html/{file_path}.html")

    total_queries = len(eval_data)
    metrics = {
        f"Recall@{k}": hits / total_queries if total_queries > 0 else 0.0,
        f"Precision@{k}": relevant_docs_total / retrieved_docs_total if retrieved_docs_total > 0 else 0.0,
        f"MRR@{k}": np.mean(reciprocal_ranks) if reciprocal_ranks else 0.0
    }

    return metrics, file_path

In [None]:
# Cross-encoder model used by rerank_with_cross_encoder to score
# [query, passage] pairs.
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-12-v2")

def rerank_with_cross_encoder(query, docs, top_n=3, verbose=False):
    """Rerank documents for a query with the cross-encoder.

    Each document's cross-encoder score is divided by (word count + 1) and
    the documents are sorted by that value, highest first.

    Args:
        query: The user question.
        docs: Documents exposing a `page_content` string.
        top_n: Number of documents to return.
        verbose: Print the rank, score and first 300 characters of each
            returned document.

    Returns:
        Up to `top_n` `(normalized_score, doc)` pairs, best first; an empty
        list when `docs` is empty.
    """
    docs = list(docs)
    # Nothing to score: return early rather than hand the model an empty batch.
    if not docs:
        return []

    pairs = [[query, doc.page_content] for doc in docs]
    scores = cross_encoder.predict(pairs)
    # NOTE(review): dividing by length penalises long passages only while the
    # raw score is positive; for negative scores it would favour them.
    # Confirm the score range of this model before relying on it.
    normalized_scores = [score / (len(doc.page_content.split()) + 1) for score, doc in zip(scores, docs)]
    scored_docs = list(zip(normalized_scores, docs))
    scored_docs.sort(key=lambda x: x[0], reverse=True)

    if verbose:
        for i, (score, doc) in enumerate(scored_docs[:top_n], start=1):
            print(f"\nRank {i} Score: {score:.4f}")
            print(doc.page_content[:300] + "...")
            print("-" * 60)

    return scored_docs[:top_n]

In [None]:
# Load at most the first 750 evaluation samples, in file order.
eval_data = load_eval_data("evaluation.json", randomize=False, limit=750)

# Settings for this run, gathered in one place before the call.
evaluation_settings = dict(
    eval_data=eval_data,
    reranker=rerank_with_cross_encoder,
    k=3,
    fuzzy_threshold=60,
    jaccard_threshold=0.4,
    doc_content_key='page_content',
)
results, file_path = evaluate_retriever(**evaluation_settings)

In [None]:
# Print every metric returned by the evaluation, to four decimals.
print("\nEvaluation Results:")
for metric in results:
    print(f"{metric}: {results[metric]:.4f}")

# View the Results DataFrame

In [None]:
# Display options: no column-width limit, no column-count limit, and no
# wrapping of wide frames.
for option_name, option_value in (
    ('display.max_colwidth', None),
    ('display.max_columns', None),
    ('display.expand_frame_repr', False),
):
    pd.set_option(option_name, option_value)

# Reload the CSV written by the evaluation run and preview its first rows.
df = pd.read_csv(f"results/{file_path}.csv")

df.head()

# SINGLE QUERIES

In [None]:
# Test Output
# Sample question used by the single-query cells below (classification,
# retrieval and reranking).
query = "What is the most famous Filipino dish?"

In [None]:
# Classify with Gemma
# The result always has "category" and "tags" keys, which are unpacked
# together here.
classification = classify_query_with_gemma(query)
category, tags = classification["category"], classification["tags"]

print(f"[INFO] Category: {category}")
print(f"[INFO] Tags: {tags}")

In [None]:
# Retrieve documents by category and tags
initial_docs = retrieve_by_metadata(query, category, tags, top_k=10, verbose=True)

# Always bind `reranked`, so that when nothing passes the filters the
# printing cell neither raises NameError on a fresh kernel nor shows results
# left over from an earlier query.
reranked = []
if not initial_docs:
    print("[WARN] No documents found after filtering.")
else:
    reranked = rerank_with_cross_encoder(query, initial_docs, top_n=3)

In [None]:
# Print output
# One block per reranked document: rank and score, then the first 300
# characters of its stripped text between two separator lines.
separator = "-" * 60
for i, (score, doc) in enumerate(reranked, start=1):
    print(
        f"\nRank {i} | Score: {score:.4f}",
        separator,
        doc.page_content.strip()[:300],
        separator,
        sep="\n",
    )