In [4]:
import os
import sys
from dotenv import load_dotenv
from langchain.docstore.document import Document

from typing import List
from rank_bm25 import BM25Okapi
import numpy as np

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_cohere import CohereEmbeddings

load_dotenv()

True

In [5]:
base_embeddings = CohereEmbeddings(
    model="embed-english-light-v3.0"
)

In [3]:
path = "data/Understanding_Climate_Change.pdf"

In [6]:
def replace_t_with_space(list_of_documents):
    """
    Replaces all tab characters ('\t') with spaces in the page content of each document

    Args:
        list_of_documents: A list of document objects, each with a 'page_content' attribute.

    Returns:
        The modified list of documents with tab characters replaced by spaces.
    """

    for doc in list_of_documents:
        doc.page_content = doc.page_content.replace('\t', ' ')  # Replace tabs with spaces
    return list_of_documents

In [7]:
# Load PDF documents
loader = PyPDFLoader(path)
documents = loader.load()

# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, length_function=len
)
texts = text_splitter.split_documents(documents)
cleaned_texts = replace_t_with_space(texts)

# Create embeddings and vector store
vectorstore = FAISS.from_documents(cleaned_texts, base_embeddings)

In [8]:
tokenized_docs = [doc.page_content.split() for doc in cleaned_texts]
bm25 = BM25Okapi(tokenized_docs)

In [9]:
def fusion_retrieval(vectorstore, bm25, query: str, k: int = 3, alpha: float = 0.5) -> List[Document]:
    """
    Perform fusion retrieval combining keyword-based (BM25) and vector-based search.

    Args:
    vectorstore (VectorStore): The vectorstore containing the documents.
    bm25 (BM25Okapi): Pre-computed BM25 index.
    query (str): The query string.
    k (int): The number of documents to retrieve.
    alpha (float): The weight for vector search scores (1-alpha will be the weight for BM25 scores).

    Returns:
    List[Document]: The top k documents based on the combined scores.
    """
    
    epsilon = 1e-8

    # Step 1: Get all documents from the vectorstore
    all_docs = vectorstore.similarity_search("", k=vectorstore.index.ntotal)

    # Step 2: Perform BM25 search
    bm25_scores = bm25.get_scores(query.split())

    # Step 3: Perform vector search
    vector_results = vectorstore.similarity_search_with_score(query, k=len(all_docs))
    
    # Step 4: Normalize scores
    vector_scores = np.array([score for _, score in vector_results])
    vector_scores = 1 - (vector_scores - np.min(vector_scores)) / (np.max(vector_scores) - np.min(vector_scores) + epsilon)

    bm25_scores = (bm25_scores - np.min(bm25_scores)) / (np.max(bm25_scores) -  np.min(bm25_scores) + epsilon)

    # Step 5: Combine scores
    combined_scores = alpha * vector_scores + (1 - alpha) * bm25_scores  

    # Step 6: Rank documents
    sorted_indices = np.argsort(combined_scores)[::-1]
    
    # Step 7: Return top k documents
    return [all_docs[i] for i in sorted_indices[:k]]

In [10]:
# Query
query = "What are the impacts of climate change on the environment?"

# Perform fusion retrieval
top_docs = fusion_retrieval(vectorstore, bm25, query, k=3, alpha=0.5)
docs_content = [doc.page_content for doc in top_docs]

for i, c in enumerate(docs_content):
    print(f"Context {i + 1}:")
    print(c)
    print("\n")

Context 1:
sustainability. 
Policy and Governance 
Effective policies and governance are crucial for driving climate action. This includes setting 
ambitious targets, implementing robust regulations, and ensuring accountability. Participatory 
governance and inclusive decision-making processes enhance policy effectiveness and public 
trust. 
Commitment to Future Generations 
Intergenerational Equity


Context 2:
Ruminant animals, such as cows and sheep, produce methane during digestion. Manure 
management practices also contribute to methane and nitrous oxide emissions. Innovations in 
livestock feeding and waste management can help mitigate these emissions. 
Rice Cultivation 
Flooded rice paddies create anaerobic conditions that lead to methane production. Improved 
water management and rice varieties can help reduce these emissions. Research into 
sustainable rice farming practices is crucial for balancing food security and climate goals. 
Fertilizers 
The use of synthetic fertilizer

In [15]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever

bm25_retriever = BM25Retriever.from_documents(cleaned_texts)
ensemble_retriever = EnsembleRetriever(
    retrievers = [bm25_retriever, vectorstore.as_retriever()],
    weights = [0.5, 0.5]
)
results = ensemble_retriever.invoke(query)
docs_content = [doc.page_content for doc in results]

for i, c in enumerate(docs_content):
    print(f"Context {i + 1}:")
    print(c)
    print("\n")

Context 1:
Tropical rainforests are particularly important for carbon storage. Deforestation in the 
Amazon, Congo Basin, and Southeast Asia has significant impacts on global carbon cycles 
and biodiversity. These regions are often cleared for agriculture, logging, and mining, leading 
to habitat loss and species extinction. 
Boreal Forests 
Boreal forests, found in the northern regions of North America, Europe, and Asia, also play a 
crucial role in sequestering carbon. Logging and land-use changes in these regions contribute 
to climate change. These forests are vital for regulating the Earth's climate and supporting 
indigenous communities and wildlife. 
Agriculture 
Agriculture contributes to climate change through methane emissions from livestock, rice 
paddies, and the use of synthetic fertilizers. Methane is a potent greenhouse gas with a much 
higher heat-trapping capability than CO2, albeit in smaller quantities. 
Livestock Emissions


Context 2:
development of eco-friendly fe