In [16]:
import os
from dotenv import load_dotenv

from pydantic import BaseModel, Field
from typing import List
from langchain.output_parsers import PydanticOutputParser

# LangChain components
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.schema import Document
import cohere

# Read environment variables from a local .env file before any client is built.
# NOTE(review): presumably this is where the Google and Cohere API keys come from — confirm.
load_dotenv()

True

In [3]:
# Settings for the Gemini chat model, gathered in one place so they are easy to tune:
# zero temperature, no client-side timeout, and at most two retries per call.
chat_model_settings = {
    "model": "gemini-2.5-flash",
    "temperature": 0,
    "timeout": None,
    "max_retries": 2,
}
llm = ChatGoogleGenerativeAI(**chat_model_settings)

# Embedding model that is handed to the vector stores built later in the notebook.
base_embeddings = GoogleGenerativeAIEmbeddings(model="gemini-embedding-001")

In [4]:
# Path (relative to the working directory) of the plain-text annual report.
annual_report_text = 'RAG_TECHNIQUES/data/nike_2023_annual_report.txt'

# The report contains non-ASCII glyphs (e.g. the check-box characters visible in the
# printed chunks), so the encoding is pinned instead of relying on the platform
# default, which differs between operating systems.
loader = TextLoader(annual_report_text, encoding="utf-8")
docs = loader.load()

In [5]:
# Pydantic schema for the structured LLM reply: a single string field, `header`.
# NOTE(review): the class docstring and the Field description are presumably embedded in
# the output parser's format instructions (and therefore sent to the model) — treat them
# as prompt text rather than as ordinary comments when editing.
class Header(BaseModel):
    """The single-sentence summary of a text chunk."""
    header: str = Field(
        description="A single-sentence summary of the main topics and concepts in the text."
    )

# Parser that validates the model reply against the `Header` schema.
header_parser = PydanticOutputParser(pydantic_object=Header)

# Prompt text. `{text}` is supplied on every call; `{format_instructions}` is bound
# once, below, from the parser.
HEADER_PROMPT_TEMPLATE = """You are an expert at summarizing technical documents. Your task is to generate a concise, single-sentence summary of the following text. This summary will be used as a contextual header for smaller text chunks during a vector search. Focus on the main topics and key entities.

Text:
{text}

{format_instructions}
"""

header_format_instructions = header_parser.get_format_instructions()
header_prompt = PromptTemplate(
    template=HEADER_PROMPT_TEMPLATE,
    input_variables=["text"],
    partial_variables={"format_instructions": header_format_instructions},
)

# Pipeline: fill the prompt -> call the chat model -> parse the reply into a `Header`.
header_generation_chain = header_prompt | llm | header_parser

In [6]:
# Chunk sizes and overlaps, counted in tiktoken tokens.
PARENT_CHUNK_TOKENS, PARENT_OVERLAP_TOKENS = 2000, 200
CHILD_CHUNK_TOKENS, CHILD_OVERLAP_TOKENS = 400, 0

# Large "parent" chunks (with overlap) and small "child" chunks (no overlap).
parent_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=PARENT_CHUNK_TOKENS,
    chunk_overlap=PARENT_OVERLAP_TOKENS,
)
child_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=CHILD_CHUNK_TOKENS,
    chunk_overlap=CHILD_OVERLAP_TOKENS,
)

# Both granularities are cut from the full documents independently of each other;
# the children are NOT produced by re-splitting the parents.
parent_chunks = parent_splitter.split_documents(docs)
child_chunks = child_splitter.split_documents(docs)

parent_count, child_count = len(parent_chunks), len(child_chunks)
print(f"Created {parent_count} parent chunks and {child_count} child chunks.")


Created 48 parent chunks and 222 child chunks.


In [7]:
# Build "contextual" chunks: every child chunk gets the LLM-generated one-sentence
# header of the first parent chunk whose text contains it.
contextual_chunks = []
processed_child_indices = set()  # a child is attached to at most one parent
failed_parent_indices = []       # parents whose header generation raised

for i, parent_chunk in enumerate(parent_chunks):
    print(f"Processing parent chunk {i+1}/{len(parent_chunks)}...")

    # Best effort: a failed header call must not abort the whole run, but it is
    # recorded so the loss is visible in the summary below.
    try:
        header = header_generation_chain.invoke({"text": parent_chunk.page_content})
        print(f"  Generated Header: {header.header}")
    except Exception as e:
        print(f"  Error processing parent chunk {i+1}: {e}")
        failed_parent_indices.append(i)
        continue

    # Attach the header to every not-yet-processed child found verbatim in this parent.
    for j, child_chunk in enumerate(child_chunks):
        if j in processed_child_indices:
            continue
        if child_chunk.page_content not in parent_chunk.page_content:
            continue

        new_content = f"CONTEXT: {header.header}\n\n---\n\n{child_chunk.page_content}"

        # Copy the metadata: sharing the dict would let a later mutation of one
        # document (naive or contextual) leak into the other.
        contextual_chunks.append(
            Document(page_content=new_content, metadata=dict(child_chunk.metadata))
        )
        processed_child_indices.add(j)

print(f"\nCreated {len(contextual_chunks)} contextual chunks.")

# Children are matched by substring, so a child that is not contained verbatim in any
# successfully processed parent (e.g. one that straddles a parent boundary) is left
# out. Report this instead of dropping it silently.
unmatched_child_count = len(child_chunks) - len(processed_child_indices)
if failed_parent_indices or unmatched_child_count:
    print(
        f"WARNING: {len(failed_parent_indices)} parent chunk(s) failed header generation; "
        f"{unmatched_child_count} of {len(child_chunks)} child chunk(s) were not "
        "contextualized and are missing from contextual_chunks."
    )

Processing parent chunk 1/48...
  Generated Header: NIKE, Inc.'s Form 10-K annual report for the fiscal year ended May 31, 2023, details the company's global business as the largest seller of athletic footwear, apparel, equipment, and services.
Processing parent chunk 2/48...
  Generated Header: NIKE, Inc. designs, markets, and sells a diverse range of athletic and casual footwear, apparel, equipment, and digital services globally under its NIKE, Jordan, and Converse brands, with 57% of FY23 revenues from international markets, distributed through wholesale and direct-to-consumer channels, emphasizing innovation.
Processing parent chunk 3/48...
  Generated Header: Nike's product development integrates diverse specialists and athlete feedback to innovate technologies, while its global manufacturing relies on a vast network of international contract manufacturers and suppliers, navigating complex international trade policies and a highly competitive market.
Processing parent chunk 4/48..

In [8]:
# Show one child chunk before and after the contextual header is prepended.
example_child_chunk = child_chunks[2]

print("--- EXAMPLE: Before (Naive Chunk) ---")
print(example_child_chunk.page_content)
print("\n" + "="*80 + "\n")
print("--- EXAMPLE: After (Contextual Chunk) ---")

# Positions in contextual_chunks do not line up with child_chunks, so search by content.
# A default is supplied because the example child may never have been contextualized
# (failed header call, or no parent contains it verbatim); without it this cell would
# die with a bare StopIteration.
corresponding_contextual_chunk = next(
    (c for c in contextual_chunks if example_child_chunk.page_content in c.page_content),
    None,
)
if corresponding_contextual_chunk is None:
    print("(No contextual chunk was created for this child chunk.)")
else:
    print(corresponding_contextual_chunk.page_content)

--- EXAMPLE: Before (Naive Chunk) ---
the registered public accounting firm that prepared or issued its audit report.þ
•if securities are registered pursuant to Section 12(b) of the Act, whether the financial statements of the registrant 
included in the filing reflect the correction of an error to previously issued financial statements. ¨
•whether any of those error corrections are restatements that required a recovery analysis of incentive-based 
compensation received by any of the registrant's executive officers during the relevant recovery period pursuant to 
§ 240.10D-1(b). ¨
•whether the registrant is a shell company (as defined in Rule 12b-2 of the Act). ☐ þ
As of November 30, 2022, the aggregate market values of the Registrant's Common Stock held by non-affiliates were:
Class A $ 7,831,564,572 
Class B 136,467,702,472 
$ 144,299,267,044 As of July 12, 2023, the number of shares of the Registrant's Common Stock outstanding were:
Class A  304,897,252 
Class B  1,225,074,356 
 1,5

In [11]:
import time
from typing import List
from langchain_core.documents import Document

def create_vectorstore_in_batches(
    documents: List[Document], 
    embedding_model, 
    batch_size: int = 32, 
    delay_seconds: float = 1.0
):
    """
    Creates a FAISS vector store by processing documents in batches, pausing
    between batches to respect API rate limits.

    Args:
        documents: Documents to embed and index.
        embedding_model: Embedding object passed to ``FAISS.from_documents``.
        batch_size: Number of documents sent per batch; must be at least 1.
        delay_seconds: Pause between two consecutive batches. No pause is made
            after the final batch, since nothing follows it.

    Returns:
        The populated FAISS vector store, or ``None`` when ``documents`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A non-positive size would either make range() fail with an opaque message
    # or silently index nothing.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    total_batches = (len(documents) + batch_size - 1) // batch_size
    vectorstore = None

    for batch_number, start in enumerate(range(0, len(documents), batch_size), start=1):
        batch = documents[start:start + batch_size]

        print(f"  Processing batch {batch_number}/{total_batches}...")

        if vectorstore is None:
            # The first batch creates the store ...
            vectorstore = FAISS.from_documents(documents=batch, embedding=embedding_model)
        else:
            # ... later batches are appended to it.
            vectorstore.add_documents(documents=batch)

        # Throttle only when another request is still to come.
        if batch_number < total_batches:
            print(f"  Waiting for {delay_seconds} second(s) to respect rate limits...")
            time.sleep(delay_seconds)

    return vectorstore

In [13]:
SAFE_BATCH_SIZE = 8
SAFE_DELAY_SECONDS = 8 # A little extra buffer

# Embedding and batching settings used for both stores, so that the two indexes
# differ only in the documents they contain.
shared_build_settings = {
    "embedding_model": base_embeddings,
    "batch_size": SAFE_BATCH_SIZE,
    "delay_seconds": SAFE_DELAY_SECONDS,
}

# Index of the plain child chunks.
print("Creating naive vector store in batches (TPM-aware)...")
vectorstore_naive = create_vectorstore_in_batches(documents=child_chunks, **shared_build_settings)
retriever_naive = vectorstore_naive.as_retriever(search_kwargs={'k': 5})

print("\n" + "="*80 + "\n")

# Index of the header-prefixed child chunks.
print("Creating contextual vector store in batches (TPM-aware)...")
vectorstore_contextual = create_vectorstore_in_batches(documents=contextual_chunks, **shared_build_settings)
retriever_contextual = vectorstore_contextual.as_retriever(search_kwargs={'k': 5})

print("\n✅ Vector stores are ready.")

Creating naive vector store in batches (TPM-aware)...
  Processing batch 1/28...
  Waiting for 8 second(s) to respect rate limits...
  Processing batch 2/28...
  Waiting for 8 second(s) to respect rate limits...
  Processing batch 3/28...
  Waiting for 8 second(s) to respect rate limits...
  Processing batch 4/28...
  Waiting for 8 second(s) to respect rate limits...
  Processing batch 5/28...
  Waiting for 8 second(s) to respect rate limits...
  Processing batch 6/28...
  Waiting for 8 second(s) to respect rate limits...
  Processing batch 7/28...
  Waiting for 8 second(s) to respect rate limits...
  Processing batch 8/28...
  Waiting for 8 second(s) to respect rate limits...
  Processing batch 9/28...
  Waiting for 8 second(s) to respect rate limits...
  Processing batch 10/28...
  Waiting for 8 second(s) to respect rate limits...
  Processing batch 11/28...
  Waiting for 8 second(s) to respect rate limits...
  Processing batch 12/28...
  Waiting for 8 second(s) to respect rate limit

In [19]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain_cohere import CohereRerank

# Cohere reranker that keeps the 3 best documents out of whatever the base retriever returns.
compressor = CohereRerank(model="rerank-english-v3.0", top_n=3)


def _with_rerank(base_retriever):
    """Wrap `base_retriever` so its results are re-scored by the shared `compressor`."""
    return ContextualCompressionRetriever(
        base_compressor=compressor,
        base_retriever=base_retriever,
    )


compression_retriever_naive = _with_rerank(retriever_naive)
compression_retriever_contextual = _with_rerank(retriever_contextual)

In [22]:
query = "Nike's sales statistics"
print(f"\nQUERY: '{query}'\n")


def show_reranked_results(title, retriever, query_text):
    """Print `title`, run `query_text` through `retriever`, and print every hit.

    Args:
        title: Heading printed before the results.
        retriever: Retriever wrapped with the Cohere reranker.
        query_text: The search query.

    Returns:
        The list of reranked documents, in the order returned by the retriever.
    """
    print(title)
    results = retriever.invoke(query_text)
    for rank, doc in enumerate(results, start=1):
        # The CohereRerank compressor automatically adds the 'relevance_score' to the metadata
        score = doc.metadata['relevance_score']
        print(f"\n[Rank {rank}] Relevance Score: {score:.4f}")
        print(f"Content: {doc.page_content}")
    return results


# Same query against both pipelines, so the rankings can be compared side by side.
reranked_naive_results = show_reranked_results(
    "--- 1. Testing Naive Retriever + Cohere Reranker ---",
    compression_retriever_naive,
    query,
)

print("\n" + "="*80 + "\n")

reranked_contextual_results = show_reranked_results(
    "--- 2. Testing Contextual Retriever + Cohere Reranker ---",
    compression_retriever_contextual,
    query,
)


QUERY: 'Nike's sales statistics'

--- 1. Testing Naive Retriever + Cohere Reranker ---

[Rank 1] Relevance Score: 0.9900
Content: NIKE Brand Revenues by:
Sales to Wholesale Customers $ 27,397 $ 25,608  7 %  14 % $ 25,898  -1 %  -1 %
Sales through NIKE Direct  21,308  18,726  14 %  20 %  16,370  14 %  15 %
Global Brand Divisions(2) 58  102  -43 %  -43 %  25  308 %  302 %
TOTAL NIKE BRAND REVENUES $ 48,763 $ 44,436  10 %  16 % $ 42,293  5 %  6 %
NIKE Brand Revenues on a Wholesale Equivalent 
Basis(1):
Sales to Wholesale Customers $ 27,397 $ 25,608  7 %  14 % $ 25,898  -1 %  -1 %
Sales from our Wholesale Operations to NIKE Direct 
Operations  12,730  10,543  21 %  27 %  9,872  7 %  7 %
TOTAL NIKE BRAND WHOLESALE EQUIVALENT 
REVENUES $ 40,127 $ 36,151  11 %  18 % $ 35,770  1 %  1 %
NIKE Brand Wholesale Equivalent Revenues by:(1),(4)
Men's $ 20,733 $ 18,797  10 %  17 % $ 18,391  2 %  3 %
Women's  8,606  8,273  4 %  11 %  8,225  1 %  1 %
NIKE Kids'  5,038  4,874  3 %  10 %  4,882  0 %  0 %
