In [51]:
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from fastembed import TextEmbedding
import chromadb
from sentence_transformers import CrossEncoder
from IPython.display import display, Markdown
import os
import json

os.environ["TOKENIZERS_PARALLELISM"] = "false"
OLLAMA_HOST = 'http://localhost:5050'


In [18]:
openai_api_key = os.environ['OPENAI_API_KEY']
model = ChatOpenAI(
  model="gpt-4o-mini",
  temperature=0,
  seed=42,
  api_key=openai_api_key
)

In [21]:
doc_loader = PyPDFLoader(
  "./data/docs/2023-amazon-annual-letter.pdf",
)
docs = doc_loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
doc_splits = text_splitter.split_documents(docs)

text_embedding = TextEmbedding()
embedding_values = list(text_embedding.embed([doc.page_content for doc in doc_splits]))

client = chromadb.Client()
if "rag" in [collection for collection in client.list_collections()]:
  client.delete_collection("rag")
collection = client.create_collection("rag")
collection.add(
  documents=[doc.page_content for doc in doc_splits],
  metadatas=[doc.metadata for doc in doc_splits],
  ids=[str(i) for i in range(len(doc_splits))],
  embeddings=embedding_values,
)

## Section 6

### Advanced Retrieval & Reranking

Sometimes a single query might not capture all nuances of a user's question. For example, if the query is ambiguous or contains uncommon terminology, some relevant documents might be missed.

**Approach:**  
- Fire multiple queries (e.g., using slight query variations or rewrites).  
- Combine the results and then use a reranker to sort them by relevance.

### Query Decomposition

In [22]:
DECOMPOSE_QUERY_PROMPT = """
You are an expert financial analyst. Given the following complex financial question:
"{query}"
generate {num_variants} distinct and concise search queries that would help retrieve the most relevant information from a financial document.
Output your answer as a JSON array of strings, no markdown.
"""

def generate_query_variants(query, num_variants=3):
  prompt_template = PromptTemplate(
    template=DECOMPOSE_QUERY_PROMPT,
    input_variables=["query", "num_variants"]
  )

  query_chain = prompt_template | model
  try:
    output = query_chain.invoke({"query": query, "num_variants": num_variants}).content
    # print(output)
    # Parse the JSON output
    variants = json.loads(output)
    if not isinstance(variants, list):
      raise ValueError("Output is not a list.")
  except Exception as e:
    print("Error generating query variants:", e)
    variants = [query]  # Fallback to the original query if needed
  return variants

In [23]:
q = 'What are the key financial drivers in Amazon\'s 2023 annual report?'
query_variants = generate_query_variants(query=q, num_variants=3)
query_variants

['Amazon 2023 annual report key financial drivers',
 'Amazon 2023 financial performance analysis',
 'Amazon 2023 revenue growth and profitability factors']

### Multi Query Retrieval

In [24]:
def multi_query_search_variants(query_variants, collection, n_results=3):
  combined_results = {"documents": [], "metadatas": [], "embeddings": []}
  for q in query_variants:
    ret = collection.query(query_texts=[q], n_results=n_results, include=['documents', 'metadatas', 'embeddings'])
    combined_results["documents"].extend(ret['documents'][0])
    combined_results["metadatas"].extend(ret['metadatas'][0])
    if 'embeddings' in ret and ret['embeddings']:
      combined_results["embeddings"].extend(ret['embeddings'][0])
  return combined_results

In [25]:
combined_results = multi_query_search_variants(query_variants, collection, n_results=3)

In [26]:
len(combined_results['documents'])

9

### Reranking

In [27]:
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def rerank_results_with_crossencoder(results, query, n_results = 3):
  pairs = [(query, doc) for doc in results["documents"]]
  scores = reranker.predict(pairs)

  reranked = sorted(zip(results["documents"], results["metadatas"], scores), key=lambda x: x[2], reverse=True)
  top_docs, top_metadatas, top_scores = zip(*reranked[:n_results]) if reranked else ([], [], [])
  return {"documents": list(top_docs), "metadatas": list(top_metadatas), "scores": list(top_scores)}

In [28]:
reranked_results = rerank_results_with_crossencoder(combined_results, q, n_results=6)

In [29]:
reranked_results['documents'][0], reranked_results['scores'][0]

('Report of Independent Registered Public Accounting Firm\nThe Board of Directors and Shareholders\nAmazon.com, Inc.\nOpinion on the Financial Statements\nWe have audited the accompanying consolidated balance sheets of Amazon.com, Inc. (the Company) as of December 31, \n2023 and 2022, the related consolidated statements of operations, comprehensive income (loss), stockholders’ equity, and cash \nflows for each of the three years in the period ended December 31, 2023, and the related notes (collectively referred to as the \n“consolidated financial statements”). In our opinion, the consolidated financial statements present fairly, in all material respects, \nthe financial position of the Company at December 31, 2023 and 2022, and the results of its operations and its cash flows for \neach of the three years in the period ended December 31, 2023, in conformity with U.S. generally accepted accounting \nprinciples.',
 1.1110415)

In [35]:
prompt = PromptTemplate(
  template="""You are a finacial assistant for question-answering tasks.
    Use the following documents to answer the question.
    If you don't know the answer, just say that you don't know.
    Keep the answer concise:
    Question: {question}
    Documents: {documents}
    Answer:
    """,
  input_variables=["question", "documents"],
)

rag_chain = prompt | model

### Let's test the new system

In [36]:
test_questions = [
"""Analyze Amazon's 2023 financial performance by assessing how the growth in its AWS cloud 
services and e-commerce operations contributed to overall revenue, and identify the specific cost-cutting 
measures implemented to counteract rising raw material costs and global supply chain disruptions.
""",
"""
Evaluate Amazon's 2023 performance by quantifying the impact of its AWS infrastructure investments 
and e-commerce expansion on revenue growth, while also detailing the strategic cost reductions in logistics 
and supply chain operations to offset global economic headwinds.
""",
"""
Analyze the drivers behind Amazon's 2023 financial results by examining how increased spending in cloud services 
and digital advertising contributed to overall revenue, and identify the specific operational efficiencies 
implemented to reduce overhead costs amidst market volatility.
""",
"""
Assess Amazon's 2023 annual report by breaking down the revenue contributions from its diversified business 
units—such as AWS, e-commerce, and subscription services—and by pinpointing the targeted cost-containment 
measures executed to combat rising raw material and transportation expenses.
"""
]

In [37]:
test_question = test_questions[1]
print(test_question)


Evaluate Amazon's 2023 performance by quantifying the impact of its AWS infrastructure investments 
and e-commerce expansion on revenue growth, while also detailing the strategic cost reductions in logistics 
and supply chain operations to offset global economic headwinds.



#### Check Response from Basic RAG

In [48]:
basic_rag_results = multi_query_search_variants([test_question], collection, n_results=6)
response_basic = rag_chain.invoke({"question": q, "documents": basic_rag_results['documents']})
display(Markdown(response_basic.content))

The key financial drivers in Amazon's 2023 annual report include:

1. **Revenue Growth**: Total revenue increased by 12% year-over-year, from $514 billion to $575 billion, with notable growth in North America (12%), International (11%), and AWS (13%).

2. **Operating Income Improvement**: Operating income rose significantly by 201% year-over-year, from $12.2 billion to $36.9 billion, with an improved operating margin from 2.4% to 6.4%.

3. **Free Cash Flow**: Free Cash Flow improved dramatically from -$12.8 billion in 2022 to $35.5 billion in 2023.

4. **Fulfillment and Delivery Efficiency**: Enhanced delivery speeds and fulfillment network efficiencies contributed to increased sales and customer satisfaction.

5. **AWS Growth**: AWS sales growth was driven by increased customer usage, despite some pricing changes.

6. **Cost Management**: While fulfillment and technology costs increased, they were partially offset by efficiencies and increased sales.

#### Check Response from Advanced RAG

In [49]:
query_variants = generate_query_variants(query=test_question, num_variants=6)
combined_results = multi_query_search_variants(query_variants, collection, n_results=3)
reranked_results = rerank_results_with_crossencoder(combined_results, test_question, n_results=6)
response_advanced = rag_chain.invoke({"question": test_question, "documents": reranked_results})
display(Markdown(response_advanced.content))

In 2023, Amazon's total revenue grew 12% year-over-year, increasing from $514 billion to $575 billion. AWS revenue specifically rose by 13%, from $80 billion to $91 billion, reflecting increased customer usage despite some pricing changes. The North America segment also saw a 12% revenue increase, while International revenue grew by 11%.

Strategically, Amazon implemented cost reductions in logistics and supply chain operations, which contributed to a significant improvement in operating income, rising 201% from $12.2 billion in 2022 to $36.9 billion in 2023. This was achieved despite increased fulfillment and shipping costs, indicating effective management of operational expenses to counter global economic challenges. Free Cash Flow also improved dramatically, from -$12.8 billion in 2022 to $35.5 billion in 2023.