In [104]:
import pandas as pd
import numpy as np
import wikipedia
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer, DPRQuestionEncoder, DPRQuestionEncoderTokenizer
import torch

In [105]:
# Wikipedia Data Scraping (Tagalog)
wikipedia.set_lang("tl")
try:
    text = wikipedia.page("Kultura ng Pilipinas").content
except wikipedia.exceptions.DisambiguationError as e:
    print(f"Disambiguation error. Options: {e.options}")
except Exception as e:
    print(f"Error: {e}")

In [106]:
# Chunking (Recursive)
splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=100,
    separators=["\n\n", "\n", ".", " ", ""]
)

chunks = splitter.split_text(text)

In [107]:
# Dense Embeddings (BGE-M3)
embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-m3",
    encode_kwargs={"normalize_embeddings": True}
)

In [108]:
# Chroma Vector Store (Dense Retrieval)
vectorstore = Chroma.from_texts(
    texts=chunks,
    embedding=embedding_model,
    persist_directory="./chroma_db",
    collection_name="filipino_culture"
)

In [109]:
# BM25 Retriever
bm25_retriever = BM25Retriever.from_texts(chunks)
bm25_retriever.k = 3

In [110]:
#  DPR Setup (Dense Retriever #2)
ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

q_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokeniz

In [111]:
# Embed all chunks for DPR
def encode_dpr_context(texts):
    inputs = ctx_tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        return ctx_encoder(**inputs).pooler_output.numpy()

In [112]:
# Store DPR embeddings + docs in a simple local structure
dpr_embeddings = encode_dpr_context(chunks)
dpr_docs = chunks

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [113]:
# DPR search function
def dpr_retrieve(query, k=2):
    inputs = q_tokenizer(query, return_tensors="pt", truncation=True)
    with torch.no_grad():
        q_emb = q_encoder(**inputs).pooler_output.numpy()

    similarities = np.dot(dpr_embeddings, q_emb.T).squeeze()
    top_indices = similarities.argsort()[::-1][:k]
    return [dpr_docs[i] for i in top_indices]

In [114]:
from langchain_core.documents import Document
from langchain_core.runnables import Runnable


class DPRRetriever(Runnable):
    def __init__(self, docs, embeddings, query_fn):
        self.docs = docs
        self.embeddings = embeddings
        self.query_fn = query_fn

    def get_relevant_documents(self, query):
        return [Document(page_content=doc) for doc in self.query_fn(query)]

    async def aget_relevant_documents(self, query):
        return self.get_relevant_documents(query)

    def invoke(self, query, config=None):
        return self.get_relevant_documents(query)
    
dpr_retriever = DPRRetriever(dpr_docs, dpr_embeddings, dpr_retrieve)

In [115]:
ensemble_retriever = EnsembleRetriever(
    retrievers=[
        vectorstore.as_retriever(search_kwargs={"k": 3}),
        bm25_retriever,
        dpr_retriever
    ],
    weights=[0.5, 0.2, 0.3]
)

In [119]:
# Test Output
query = "Ano-ano ang mga ulam na dinala ng mga Kastila sa Pilipinas?"
results = ensemble_retriever.get_relevant_documents(query)

for i, result in enumerate(results[:3], start=1):
    print(f"\nResult {i}")
    print("-" * 60)
    print(result.page_content.strip()[:350])  # First 350 characters
    print("-" * 60)


Result 1
------------------------------------------------------------
Kabilang sa mga ibang sikat na ulam na may impluwensyang Timog-silangang Asyano at Kastila ang apritada, asado, chorizo, empanada, mani, paksiw, pandesal, pescado frito (pinritong isda), sisig, torta, kare-kare, kilawen, pinakbet, pinapaitan, at sinigang. Waring di-nakagaganang kainin sa paletang Kanluranin ang mga ilang kinakain ng mga Pilipino tu
------------------------------------------------------------

Result 2
------------------------------------------------------------
. Maaaring pag-uriin ang mga pista bilang mga Misa, prusisyon, parada, dulaan, seremonyang panrelihiyon o pangkultural, pakikipagkalakalan, eksibit, konsiyerto, paringal at iba't ibang laro at paligsahan.
------------------------------------------------------------

Result 3
------------------------------------------------------------
Pakikisama: Ang pakikisama ay ang kaugaliang Pilipino na nagnanais magkaroon ng maganda at mabuting pakikitun