In [1]:
!pip install faiss-cpu futures langchain-community python-dotenv tqdm

Collecting futures
  Downloading futures-3.0.5.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: futures
[33m  DEPRECATION: Building 'futures' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'futures'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m  Building wheel for futures (setup.py) ... [?25ldone
[?25h  Created wheel for futures: filename=futures-3.0.5-py3-none-any.whl size=14144 sha256=2a7630bc3143caa0cab8b5faabe6cbfe56481b557779d2270c99490e4ac26673
  Stored in directory: /Users/softaims/Library/Caches/pip/wheels/c0/65/47/17d11231c90639e4f76ee5e86b7cbd607b97f11d7677789787
Successfully built futures
In

In [2]:
import getpass
import os
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed

import faiss
from dotenv import load_dotenv
from langchain_community.docstore.in_memory import InMemoryDocstore
from tqdm import tqdm


# Load environment variables from a .env file
load_dotenv()

# Make sure the OpenAI API key is set (comment out if not using OpenAI).
# The key is only requested when it is missing, and it is read with getpass so that
# it is not echoed into the notebook's saved cell output.
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Please enter your OpenAI API key: ")

# Original path append replaced for Colab compatibility
# NOTE(review): the cells below use names that are not imported explicitly in this
# notebook (ChatOpenAI, OpenAIEmbeddings, PromptTemplate, StrOutputParser, FAISS,
# PyPDFLoader, RecursiveCharacterTextSplitter, List, replace_t_with_space,
# retrieve_context_per_question, show_context); presumably these star imports
# provide them - confirm, and prefer explicit imports.
from helper_functions import *
from evaluation.evalute_rag import *

In [3]:
# Input document and model configuration for this notebook.
PATH = "data/Understanding_Climate_Change.pdf"  # PDF that gets indexed
LANGUAGE_MODEL_NAME = "gpt-4o-mini"  # chat model that writes the hypothetical questions
EMBEDDING_MODEL_NAME = "text-embedding-3-small"  # model that embeds those questions

# Chunking parameters handed to encode_pdf.
CHUNK_SIZE, CHUNK_OVERLAP = 1_000, 200

In [4]:
def generate_hypothetical_prompt_embeddings(chunk_text: str):
    """
    Uses the LLM to generate multiple hypothetical questions for a single chunk.
    These questions will be used as 'proxies' for the chunk during retrieval.

    Parameters:
    chunk_text (str): Text contents of the chunk. An object exposing a
        `page_content` attribute is also accepted; in that case only that
        text is sent to the LLM.

    Returns:
    chunk_text: The argument exactly as it was passed in. This is done to make
        the multithreading easier
    hypothetical prompt embeddings (List[List[float]]): One embedding vector per
        generated question; an empty list if the LLM produced no questions
    """
    # If the chunk is a document-like object, formatting it into the prompt directly
    # would insert its str() form rather than only its text, so extract the text first.
    prompt_text = getattr(chunk_text, "page_content", chunk_text)

    llm = ChatOpenAI(temperature=0, model_name=LANGUAGE_MODEL_NAME)
    embedding_model = OpenAIEmbeddings(model=EMBEDDING_MODEL_NAME)

    question_gen_prompt = PromptTemplate.from_template(
        "Analyze the input text and generate essential questions that, when answered, \
        capture the main points of the text. Each question should be one line, \
        without numbering or prefixes.\n\n \
        Text:\n{chunk_text}\n\nQuestions:\n"
    )
    question_chain = question_gen_prompt | llm | StrOutputParser()

    # parse questions from response
    # Notes:
    # - gpt4o likes to split questions by blank lines, so blank/whitespace-only lines
    #   are dropped instead of being embedded as empty "questions"
    # - for production or if using smaller models from ollama, it's beneficial to use regex to parse
    # things like (un)ordered lists
    # r"^\s*[\-\*\•]|\s*\d+\.\s*|\s*[a-zA-Z]\)\s*|\s*\(\d+\)\s*|\s*\([a-zA-Z]\)\s*|\s*\([ivxlcdm]+\)\s*"
    response = question_chain.invoke({"chunk_text": prompt_text})
    questions = [line.strip() for line in response.splitlines() if line.strip()]

    # Nothing to embed: skip the embedding call entirely.
    if not questions:
        return chunk_text, []

    return chunk_text, embedding_model.embed_documents(questions)

In [6]:
def _chunk_text(chunk):
    """Return the text of a chunk given as a plain string or as an object with `page_content`."""
    return getattr(chunk, "page_content", chunk)


def prepare_vector_store(chunks: List[str]):
    """
    Creates and populates a FAISS vector store from a list of text chunks.

    This function processes a list of text chunks in parallel, generating
    hypothetical prompt embeddings for each chunk. Every embedding vector is
    stored in a FAISS index together with the text of the chunk it was
    generated from, so one chunk is inserted once per generated question.

    Parameters:
    chunks (List[str]): The chunks to be embedded and stored. Each item may be
        a plain string or an object exposing its text as `page_content`.

    Returns:
    FAISS: A FAISS vector store containing the embedded text chunks, or None
        if no chunk produced any embedding (for example when `chunks` is empty).
    """

    # Wait with initialization to see vector lengths
    vector_store = None

    with ThreadPoolExecutor() as pool:
        # Use threading to speed up generation of prompt embeddings.
        # Only the chunk's text is submitted, so the question prompt is built from
        # the text itself rather than from the string form of a wrapper object.
        futures = [
            pool.submit(generate_hypothetical_prompt_embeddings, _chunk_text(c))
            for c in chunks
        ]

        # Process embeddings as they complete
        for f in tqdm(as_completed(futures), total=len(futures)):

            # Retrieve the processed chunk and its embeddings; an exception raised
            # in the worker is re-raised here by result().
            chunk, vectors = f.result()

            # A chunk without any generated question has nothing to index. Skipping
            # it also keeps `vectors[0]` below from failing on an empty list.
            if len(vectors) == 0:
                continue

            # Initialize the FAISS vector store on the first chunk that has vectors
            if vector_store is None:
                vector_store = FAISS(
                    embedding_function=OpenAIEmbeddings(model=EMBEDDING_MODEL_NAME),  # Define embedding model
                    index=faiss.IndexFlatL2(len(vectors[0])),  # Define an L2 index for similarity search
                    docstore=InMemoryDocstore(),  # Use in-memory document storage
                    index_to_docstore_id={}  # Maintain index-to-document mapping
                )

            # Pair the chunk's content with each generated embedding vector.
            # Each chunk is inserted multiple times, once for each prompt vector
            text = _chunk_text(chunk)
            chunks_with_embedding_vectors = [(text, vec) for vec in vectors]

            # Add embeddings to the store
            vector_store.add_embeddings(chunks_with_embedding_vectors)

    return vector_store  # Return the populated vector store (None if nothing was indexed)


In [7]:
def encode_pdf(path, chunk_size=1000, chunk_overlap=200):
    """
    Build a vector store from the contents of a PDF file.

    The file is loaded with PyPDFLoader, split into overlapping chunks, passed
    through `replace_t_with_space`, and finally handed to `prepare_vector_store`.

    Args:
        path: Location of the PDF file to load.
        chunk_size: Target size of each chunk, measured with `len`.
        chunk_overlap: Overlap between consecutive chunks, measured with `len`.

    Returns:
        The value returned by `prepare_vector_store` for the cleaned chunks.
    """
    # Read the PDF
    loaded_docs = PyPDFLoader(path).load()

    # Cut the loaded documents into overlapping chunks
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    split_docs = splitter.split_documents(loaded_docs)

    # Clean the chunks, then index them
    return prepare_vector_store(replace_t_with_space(split_docs))

In [8]:
# With HyPE the chunk size can be quite large, because precision is not lost when a
# chunk carries more information. For production, test whether your model generates
# a sufficient number of questions per chunk; this will mostly depend on your
# information density.
chunks_vector_store = encode_pdf(
    PATH,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 97/97 [00:50<00:00,  1.93it/s]


In [9]:
# Wrap the vector store in a retriever that is configured with k=3 for each search.
retriever_search_kwargs = {"k": 3}
chunks_query_retriever = chunks_vector_store.as_retriever(
    search_kwargs=retriever_search_kwargs
)

In [12]:
test_query = "What is the main cause of climate change?"
context = retrieve_context_per_question(test_query, chunks_query_retriever)
# Every chunk is stored once per generated question, so the same chunk text can come
# back several times. dict.fromkeys removes the repeats while keeping the order in
# which the retriever returned them; set() would yield an arbitrary order.
context = list(dict.fromkeys(context))
show_context(context)

Context 1:
Most of these climate changes are attributed to very small variations in Earth's orbit that 
change the amount of solar energy our planet receives. During the Holocene epoch, which 
began at the end of the last ice age, human societies flourished, but the industrial era has seen 
unprecedented changes. 
Modern Observations 
Modern scientific observations indicate a rapid increase in global temperatures, sea levels, 
and extreme weather events. The Intergovernmental Panel on Climate Change (IPCC) has 
documented these changes extensively. Ice core samples, tree rings, and ocean sediments 
provide a historical record that scientists use to understand past climate conditions and 
predict future trends. The evidence overwhelmingly shows that recent changes are primarily 
driven by human activities, particularly the emission of greenhouse gases. 
Chapter 2: Causes of Climate Change 
Greenhouse Gases 
The primary cause of recent climate change is the increase in greenhouse gases i