In [None]:
import os
import sys
from dotenv import load_dotenv
from langchain_community.docstore.document import Document

from typing import List
from rank_bm25 import BM25Okapi
import numpy as np


from utils.helper_functions import *
from utils.evaluate_rag import *

# Load environment variables from a .env file
load_dotenv()

# Set the OpenAI API key environment variable
os.environ["OPENAI_API_KEY"] = os.getenv('OPENAI_API_KEY')

In [2]:
path = "data/Climate_Change.pdf"

In [None]:
from utils.helper_functions import replace_t_with_space
def encode_pdf_and_get_split_documents(path, chunk_size=1000, chunk_overlap=200):
    """
    Encodes a PDF file and returns a list of split documents.
    
    Args:
        path (str): Path to the PDF file.
        chunk_size (int): Size of each chunk.
        chunk_overlap (int): Overlap between chunks.
    
    Returns:
        A tuple of (FAISS vector store, cleaned text documents).
    """
    
    from langchain_community.document_loaders import PyPDFLoader
    from langchain_text_splitters import RecursiveCharacterTextSplitter
    from langchain_openai import OpenAIEmbeddings
    from langchain_community.vectorstores import FAISS

    #Load the pdf
    loader = PyPDFLoader(path)
    documents = loader.load()

    #split the document
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len)
    split_documents = text_splitter.split_documents(documents)
    cleaned_text = replace_t_with_space(split_documents)

    #create embadding and vectore store
    embeddings = OpenAIEmbeddings()
    vector_store = FAISS.from_documents(cleaned_text, embeddings)

    return vector_store, cleaned_text
    

In [5]:
vectorstore, cleaned_text = encode_pdf_and_get_split_documents(path)

### Create a bm25 index for retrieving documents by keywords

In [8]:
tokenized_corpus

[['Hello', 'there', 'good', 'man!'],
 ['It', 'is', 'quite', 'windy', 'in', 'London'],
 ['How', 'is', 'the', 'weather', 'today?']]

In [7]:
from rank_bm25 import BM25Okapi
#Initializing
corpus = [
    "Hello there good man!",
    "It is quite windy in London",
    "How is the weather today?"
]

tokenized_corpus = [doc.split(" ") for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)

# Ranking of documents
query = "windy london"
tokenized_query = query.split(" ")

scores = bm25.get_scores(tokenized_query)
print(scores)

doc_top_n = bm25.get_top_n(tokenized_query, corpus, n=1)
print(doc_top_n)

[0.         0.46864736 0.        ]
['It is quite windy in London']


In [16]:
def create_bm25_index(documents: List[Document]) -> BM25Okapi:
    """
    Create a BM25 index from the given documents.

    BM25 (Best Matching 25) is a ranking function used in information retrieval.
    It's based on the probabilistic retrieval framework and is an improvement over TF-IDF.

    Args:
    documents (List[Document]): List of documents to index.

    Returns:
    BM25Okapi: An index that can be used for BM25 scoring.
    """
    # Tokenize each document by splitting on whitespace
    # This is a simple approach and could be improved with more sophisticated tokenization
    tokenized_docs = [
        doc.page_content.split() for doc in documents
    ]
    return BM25Okapi(tokenized_docs)

In [17]:
bm25 = create_bm25_index(cleaned_text) # Create BM25 index from the cleaned texts (chunks)

In [18]:
def fusion_retrieval(vectorstore, bm25, query: str, k: int = 5, alpha: float = 0.5) -> List[Document]:
    """
    Perform fusion retrieval combining keyword-based (BM25) and vector-based search.

    Args:
    vectorstore (VectorStore): The vectorstore containing the documents.
    bm25 (BM25Okapi): Pre-computed BM25 index.
    query (str): The query string.
    k (int): The number of documents to retrieve.
    alpha (float): The weight for vector search scores (1-alpha will be the weight for BM25 scores).

    Returns:
    List[Document]: The top k documents based on the combined scores.
    """
    
    epsilon = 1e-8

    # Step 1: Get all documents from the vectorstore
    all_docs = vectorstore.similarity_search("", k=vectorstore.index.ntotal)

    # Step 2: Perform BM25 search
    bm25_scores = bm25.get_scores(query.split())

    # Step 3: Perform vector search
    vector_results = vectorstore.similarity_search_with_score(query, k=len(all_docs))
    
    # Step 4: Normalize scores
    vector_scores = np.array([score for _, score in vector_results])
    vector_scores = 1 - (vector_scores - np.min(vector_scores)) / (np.max(vector_scores) - np.min(vector_scores) + epsilon)

    bm25_scores = (bm25_scores - np.min(bm25_scores)) / (np.max(bm25_scores) -  np.min(bm25_scores) + epsilon)

    # Step 5: Combine scores
    combined_scores = alpha * vector_scores + (1 - alpha) * bm25_scores  

    # Step 6: Rank documents
    sorted_indices = np.argsort(combined_scores)[::-1]
    
    # Step 7: Return top k documents
    return [all_docs[i] for i in sorted_indices[:k]]

In [19]:
# Query
query = "What are the impacts of climate change on the environment?"

# Perform fusion retrieval
top_docs = fusion_retrieval(vectorstore, bm25, query, k=5, alpha=0.2)
docs_content = [doc.page_content for doc in top_docs]
show_context(docs_content)

Context 1:
for the new market mechanism established in Durban. 
 
COP 19 
19th session of the Conference 
of the Parties(COP 19) to 
the UNFCCC and the 9th session 
of the CMP 9 to the Kyoto 
Protocol was held 
 in Warsaw , Poland in 2013 
• The Warsaw conference agreed a time plan for countries to table 
their contributions to reducing or limiting greenhouse gas emissions 
under the new global climate agreement in 2015.  
• It also agreed ways to accelerate efforts to deepen emission cuts over 
the rest of this decade, and to set up a mechanism to address losses 
and damage caused by climate change in vulnerable developing 
countries. 
• The conference agreed decisions which enhance the implementation 
of a range of measures already agreed, including climate finance, 
REDD+, and transparency of reporting on emissions.


Context 2:
COP 17 
17th session of the Conference 
of the Parties (COP 17) to 
the UNFCCC and the 7th session 
of the CMP 7 to the Kyoto 
Protocol was held in Durban ,

In [22]:
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_classic.retrievers.ensemble import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_classic.chains import RetrievalQA
from langchain_core.prompts import PromptTemplate

#load pdf
loader = PyPDFLoader('data/Climate_change.pdf')
document = loader.load()

#split into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=50
)
splits = text_splitter.split_documents(document)

# dense retriever (sementic)
embedding = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore = Chroma.from_documents(splits, embedding)
dense_retriever = vectorstore.as_retriever(search_kwargs={"k":3})

#sparse retriever (keyword - BM25)
sparse_retriever = BM25Retriever.from_documents(splits)
sparse_retriever.k = 3

alpha: float = 0.5

# Hybrid retriever
ensemble_retriever = EnsembleRetriever(
    retrievers=[dense_retriever, sparse_retriever],
    alpha=alpha,
    weights=[alpha, 1 - alpha]
)

# LLm
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# Create. RAG chain
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=ensemble_retriever,
    return_source_documents=True,
    chain_type="stuff"
)

# Query with parameter
query = "What is the main topic of this document?"
result = rag_chain.invoke({"query": query})

print("Answer:", result["result"])
print("\nSources:")
for doc in result["source_documents"]:
    print(f"- {doc.page_content[:200]}...")



Answer: The main topic of the document is related to climate change, specifically discussing the principles and agreements of the UN Conventions, the differentiation between developed and developing countries in addressing climate change, and the importance of greenhouse gases, particularly carbon dioxide. It also mentions the need for mechanisms to address climate change impacts on vulnerable developing countries.

Sources:
- other provisions of the UN Conventions. 
/square4 The ‘developing versus developed country’ schism needs to be diluted at the earliest and Developed 
Countries should avoid watering down the CBDR prin...
- the Convention, applicable to all Parties. The ADP is to complete its work as early as possible, but no 
later than 2015, in order to adopt this protocol, legal instrument or agreed outcome with legal...
- the earth’s surface. 
The main greenhouse gases include: 
/square4 Water vapour:  It is the most abundant greenhouse gas 
(GHG), however it spends just a sho