<div style="text-align: center;">

<img src="../images/context_enrichment_window.svg" alt="context enrichment window" style="width:70%; height:auto;">
</div>

### Import libraries and environment variables

In [None]:
import os
import sys
from dotenv import load_dotenv
from langchain.docstore.document import Document


# Notebooks run from their own folder, so add the parent directory to the
# import path; the two project-level star imports below rely on it.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from helper_functions import *
from evaluation.evalute_rag import *

# Load environment variables from a .env file
load_dotenv()

# Fail early with a readable message when the key is missing: assigning None
# to os.environ would otherwise raise an opaque "str expected" TypeError.
if not os.getenv('OPENAI_API_KEY'):
    raise EnvironmentError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "before running this notebook."
    )

# Set the OpenAI API key environment variable
os.environ["OPENAI_API_KEY"] = os.getenv('OPENAI_API_KEY')

### Define path to PDF

In [2]:
# Location of the PDF to read, relative to the notebook's working directory
path = "../data/Understanding_Climate_Change.pdf"

### Read PDF to string

In [3]:
# Read the PDF at `path` into `content`, which is split into chunks further down.
# NOTE(review): read_pdf_to_string is not defined in this notebook; presumably it
# comes from the helper_functions star import — verify.
content = read_pdf_to_string(path)

### Function to split text into chunks, storing each chunk's sequential index in its metadata

In [4]:
def split_text_to_chunks_with_indices(text: str, chunk_size: int, chunk_overlap: int) -> List[Document]:
    """
    Split text into fixed-size, overlapping character chunks.

    Each chunk is wrapped in a Document whose metadata records the chunk's
    position in the original order ("index") and a reference to the full
    source text ("text").

    Args:
    text (str): The text to split.
    chunk_size (int): The number of characters per chunk; chunks at the end of the text may be shorter.
    chunk_overlap (int): The number of characters shared by two consecutive chunks.

    Returns:
    List[Document]: The chunks in their original order; empty if text is empty.

    Raises:
    ValueError: If chunk_size is not positive, or if chunk_overlap is not
        smaller than chunk_size (the window would never move forward).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if chunk_overlap >= chunk_size:
        raise ValueError(
            f"chunk_overlap ({chunk_overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    # Each chunk starts this many characters after the previous one.
    step = chunk_size - chunk_overlap

    return [
        Document(
            page_content=text[start:start + chunk_size],
            metadata={"index": index, "text": text},
        )
        for index, start in enumerate(range(0, len(text), step))
    ]

### Split our document accordingly

In [5]:
# Chunking configuration: 200-character chunks, with consecutive chunks
# sharing 100 characters.
chunks_size, chunk_overlap = 200, 100

docs = split_text_to_chunks_with_indices(
    text=content,
    chunk_size=chunks_size,
    chunk_overlap=chunk_overlap,
)

### Create vector store and retriever

In [6]:
# NOTE(review): OpenAIEmbeddings and FAISS are not imported explicitly in this
# notebook; presumably they come from the helper_functions star import — verify.
embeddings = OpenAIEmbeddings()
# Build the vector store from the indexed chunk documents
vectorstore = FAISS.from_documents(docs, embeddings)
# Baseline retriever used for the "regular retrieval" comparison; configured with k=2
chunks_query_retriever = vectorstore.as_retriever(search_kwargs={"k": 2})

### Function to fetch the k<sup>th</sup> chunk (in the original order) from the vector store


In [7]:
def get_chunk_by_index(vectorstore, target_index: int):
    """
    Retrieve a chunk from the vectorstore based on its index in the metadata.

    When the store exposes its documents directly through `docstore` and
    `index_to_docstore_id` (as LangChain's FAISS store does), the stored
    documents are scanned in place, so no query has to be embedded and no
    similarity search is run. Stores without those attributes fall back to a
    similarity search sized to return every document.

    Args:
    vectorstore (VectorStore): The vectorstore containing the chunks.
    target_index (int): The index of the chunk to retrieve.

    Returns:
    Optional[Document]: The retrieved chunk as a Document object, or None if not found.
    """
    docstore = getattr(vectorstore, "docstore", None)
    docstore_ids = getattr(vectorstore, "index_to_docstore_id", None)

    if docstore is not None and docstore_ids is not None:
        # Fast path: read the stored documents directly.
        all_docs = (docstore.search(doc_id) for doc_id in docstore_ids.values())
    else:
        # Fallback: retrieve all documents through a similarity search.
        all_docs = vectorstore.similarity_search("", k=vectorstore.index.ntotal)

    # Search for the document with the matching index
    for doc in all_docs:
        # A docstore lookup may yield a non-document value (e.g. a "not found"
        # message) for an unknown id, so only objects with metadata are considered.
        metadata = getattr(doc, "metadata", None)
        if metadata is not None and 'index' in metadata and metadata['index'] == target_index:
            return doc

    # If we've gone through all documents and haven't found a match, return None
    return None

### Check the function

In [8]:
# Sanity check: fetch the chunk whose metadata index is 0 and print its text
chunk = get_chunk_by_index(vectorstore, 0)
print(chunk.page_content)

Understanding Climate Change 
Chapter 1: Introduction to Climate Change 
Climate change refers to significant, long-term changes in the global climate. The term 
"global climate" encompasses the plane


### Function that retrieves from the vector store based on semantic similarity, then pads each retrieved chunk with its num_neighbors chunks before and after, taking the chunk overlap into account to construct a meaningful wide window around it

In [9]:
def retrieve_with_context_overlap(vectorstore, query: str, k: int = 2, num_neighbors: int = 2, chunk_size: int = 200, chunk_overlap: int = 100) -> List[str]:
    """
    Retrieve chunks based on a query, then fetch neighboring chunks and concatenate them,
    accounting for overlap and correct indexing.

    Consecutive chunks share `chunk_overlap` characters: the first
    `chunk_overlap` characters of a chunk repeat the end of the chunk before
    it. When the window is stitched together, that repeated prefix is dropped
    from every chunk after the first.

    Args:
    vectorstore (VectorStore): The vectorstore containing the chunks.
    query (str): The query to search for relevant chunks.
    k (int): The number of relevant chunks to retrieve.
    num_neighbors (int): The number of chunks to retrieve before and after each relevant chunk.
    chunk_size (int): The size of each chunk when originally split. Kept for
        backward compatibility; stitching only needs chunk_overlap.
    chunk_overlap (int): The overlap between chunks when originally split.

    Returns:
    List[str]: List of concatenated chunk sequences, each centered on a relevant chunk.
    """
    retriever = vectorstore.as_retriever(search_kwargs={"k": k})
    relevant_chunks = retriever.get_relevant_documents(query)

    # Number of leading characters each chunk repeats from its predecessor.
    duplicated_prefix = max(chunk_overlap, 0)

    result_sequences = []

    for chunk in relevant_chunks:
        current_index = chunk.metadata.get('index')
        if current_index is None:
            continue

        # Build the window in document order: preceding neighbors, the hit
        # itself, then following neighbors. Missing neighbors (before the
        # first or after the last chunk) are simply left out.
        window = []
        for neighbor_index in range(current_index - num_neighbors, current_index):
            if neighbor_index < 0:
                continue  # no chunk exists before index 0; skip the lookup
            prev_chunk = get_chunk_by_index(vectorstore, neighbor_index)
            if prev_chunk:
                window.append(prev_chunk)

        window.append(chunk)

        for neighbor_index in range(current_index + 1, current_index + num_neighbors + 1):
            next_chunk = get_chunk_by_index(vectorstore, neighbor_index)
            if next_chunk:
                window.append(next_chunk)

        # Concatenate chunks, dropping the prefix already covered by the previous chunk
        concatenated_text = window[0].page_content
        for neighbor in window[1:]:
            concatenated_text += neighbor.page_content[duplicated_prefix:]

        result_sequences.append(concatenated_text)

    return result_sequences

### Comparing regular retrieval and retrieval with context window

In [None]:
query = "climate change"

# Baseline: the chunks returned by the plain similarity retriever.
context = chunks_query_retriever.get_relevant_documents(query)
context_pages_content = [doc.page_content for doc in context]

print("regular retrieval:\n")
show_context(context_pages_content)

# Same query, with every hit widened by its neighboring chunks. The chunking
# settings used for splitting are passed explicitly: stitching the window
# depends on them, and relying on the function's defaults would silently
# produce wrong windows if the split settings are changed.
sequences = retrieve_with_context_overlap(
    vectorstore,
    query,
    chunk_size=chunks_size,
    chunk_overlap=chunk_overlap,
)
print("retrieval with context overlap:\n")
show_context(sequences)