In [None]:
import numpy as np

from typing import List, Dict, Any

from rank_bm25 import BM25Okapi

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings, OllamaLLM
from langchain_community.document_loaders import PyPDFLoader
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document


In [58]:
# Embedding model
embedding_model = OllamaEmbeddings(model="nomic-embed-text")
# LLM model
llm = OllamaLLM(model="llama3")

In [59]:
path = "../data/Understanding_Climate_Change.pdf"

Encode the pdf to vector store and return split document from the step before to create BM25 instance

In [60]:
def replace_t_with_space(list_of_documents):
    """
    Replaces all tab characters ('\t') with spaces in the page content of each document

    Args:
        list_of_documents: A list of document objects, each with a 'page_content' attribute.

    Returns:
        The modified list of documents with tab characters replaced by spaces.
    """

    for doc in list_of_documents:
        doc.page_content = doc.page_content.replace('\t', ' ')  # Replace tabs with spaces
    return list_of_documents
    
def encode_pdf_and_get_split_documents(path, chunk_size=1000, chunk_overlap=200):
    """
    Encodes a PDF book into a vector store using OpenAI embeddings.

    Args:
        path: The path to the PDF file.
        chunk_size: The desired size of each text chunk.
        chunk_overlap: The amount of overlap between consecutive chunks.

    Returns:
        A FAISS vector store containing the encoded book content.
    """

    # Load PDF documents
    loader = PyPDFLoader(path)
    documents = loader.load()

    # Split documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len
    )
    texts = text_splitter.split_documents(documents)
    cleaned_texts = replace_t_with_space(texts)

    # Create embeddings and vector store
    vectorstore = FAISS.from_documents(cleaned_texts, embedding_model)

    return vectorstore, cleaned_texts

In [61]:
vectorstore, cleaned_texts = encode_pdf_and_get_split_documents(path)

Create a bm25 index for retrieving documents by keywords

In [65]:
def create_bm25_index(documents: List[Document]) -> BM25Okapi:
    """
    Create a BM25 index from the given documents.

    BM25 (Best Matching 25) is a ranking function used in information retrieval.
    It's based on the probabilistic retrieval framework and is an improvement over TF-IDF.

    Args:
    documents (List[Document]): List of documents to index.

    Returns:
    BM25Okapi: An index that can be used for BM25 scoring.
    """
    # Tokenize each document by splitting on whitespace
    # This is a simple approach and could be improved with more sophisticated tokenization
    tokenized_docs = [doc.page_content.split() for doc in documents]
    return BM25Okapi(tokenized_docs) 

In [66]:
bm25 = create_bm25_index(cleaned_texts)

## 2. Why Inverse Normalization?

If your vector retriever outputs **distances**:

\[
distance(q, d) = 0 \quad \text{(perfect match)}, \quad \text{larger = worse}
\]

---

### Normalization

We first normalize distances into the range \([0,1]\):

\[
x' = \frac{x - \min(x)}{\max(x) - \min(x)} \in [0,1]
\]

But this creates a problem:

- **Closest docs (good ones)** → 0  
- **Farthest docs (bad ones)** → 1  

That’s the **opposite** of what we want.

---

### Inverse Normalization

To fix this, we flip the normalized values:

\[
x'' = 1 - x' = 1 - \frac{x - \min(x)}{\max(x) - \min(x)}
\]

Now:

- **Closest docs** → 1 (good ✅)  
- **Farthest docs** → 0 (bad ❌)

---

### ✅ Why?

This makes scores consistent with BM25 or cosine similarity,  
where **higher values = better relevance**.


In [93]:
def fusion_retrieval(vectorstore, bm25, query, k =5, alpha = 0.5):
    epsilon = 1e-8

    # Step 1: Get all documents from the vectorstore
    all_docs = vectorstore.similarity_search("", k=vectorstore.index.ntotal)

    # Step 2: Perform BM25 search
    bm25_scores = bm25.get_scores(query.split())

    # Step 3: Perform vector search
    vector_results = vectorstore.similarity_search_with_score(query, k=len(all_docs))

    # Step 4: Normalize scores
    vector_scores = np.array([score for _, score in vector_results])
    vector_scores = 1 - (vector_scores - np.min(vector_scores)) / (np.max(vector_scores) - np.min(vector_scores) + epsilon) # Inverse

    bm25_scores = (bm25_scores - np.min(bm25_scores)) / (np.max(bm25_scores) -  np.min(bm25_scores) + epsilon)

    # Step 5: Combine scores
    combined_scores = alpha * vector_scores + (1 - alpha) * bm25_scores 

    # Step 6: Rank documents
    sorted_indices = np.argsort(combined_scores)[::-1]

    # Step 7: Return top k documents
    return [all_docs[i] for i in sorted_indices[:k]]

In [95]:
# Query
query = "What are the impacts of climate change on the environment?"

# Perform fusion retrieval
top_docs = fusion_retrieval(vectorstore, bm25, query, k=5, alpha=0.5)
docs_content = [doc.page_content for doc in top_docs]
docs_content

['rehabilitation. Engaging local communities in restoration projects ensures sustainability and \nlong-term success. \nEx-Situ Conservation \nEx-situ conservation involves protecting species outside their natural habitats, such as in \nzoos, botanical gardens, and seed banks. These efforts can preserve genetic diversity, support \nbreeding programs, and facilitate reintroduction into the wild. Ex-situ conservation \ncomplements in-situ efforts. \nIntegrating Biodiversity and Climate Action \nNature-Based Solutions \nNature-based solutions leverage natural processes to address climate change while enhancing \nbiodiversity. Examples include reforestation, mangrove restoration, and sustainable \nagriculture. These solutions provide multiple benefits, such as carbon sequestration, flood \nprotection, and habitat creation. \nClimate-Resilient Conservation \nConservation strategies must account for climate change impacts to be effective. This',
 'trust and achieving policy objectives. \nInte