In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# attach to the same event-loop
import nest_asyncio

nest_asyncio.apply()

import logging
import sys

# Set up the root logger
logger = logging.getLogger()
logger.setLevel(logging.INFO)  # Set logger level to INFO

# Clear out any existing handlers
logger.handlers = []

# Set up the StreamHandler to output to sys.stdout (Colab's output)
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)  # Set handler level to INFO

# Add the handler to the logger
logger.addHandler(handler)

In [4]:
import logging
import sys
import pandas as pd

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

NumExpr defaulting to 2 threads.


In [5]:
%%capture
!pip install llama_index
!pip install transformers
!pip install llama-index-embeddings-huggingface

In [6]:
from llama_index.core.node_parser import SimpleFileNodeParser
from llama_index.readers.file import FlatReader
from pathlib import Path
from llama_index.readers.file import PDFReader
from llama_index.llms.openai import OpenAI
from llama_index.core import VectorStoreIndex
import os

In [7]:
from llama_index.core.evaluation import (
    FaithfulnessEvaluator,
    RelevancyEvaluator,
    CorrectnessEvaluator,
    RetrieverEvaluator,
    generate_question_context_pairs,
    EmbeddingQAFinetuneDataset)

In [8]:
from google.colab import userdata
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')

os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

In [9]:
loader = PDFReader()
documents = loader.load_data(file=Path('/content/drive/MyDrive/Gen AI/RAG/annual_filings/0000021344-23-000011.pdf'))

In [10]:
parser = SimpleFileNodeParser()
nodes = parser.get_nodes_from_documents(documents)

In [11]:
len(nodes)

183

In [12]:
gpt35 = OpenAI(temperature=0, model="gpt-3.5-turbo")

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [None]:
vector_store_index = VectorStoreIndex(nodes, embed_model=embed_model, llm=gpt35, show_progress=False)

In [33]:
qa_list = [
    {"question": "What is the primary business of The Coca-Cola Company?", "answer": "The Coca-Cola Company is a total beverage company, offering a wide range of beverages, including sparkling soft drinks, waters, sports drinks, juices, teas, coffees, and plant-based beverages, sold in over 200 countries."},
    {"question": "What are Coca-Cola's three pillars for growth?", "answer": "The three pillars are: Loved Brands, Done Sustainably, and For a Better Shared Future."},
    {"question": "How many servings of Coca-Cola products are consumed worldwide daily?", "answer": "Coca-Cola products account for approximately 2.2 billion servings out of the 64 billion servings of beverages consumed worldwide daily."},
    {"question": "Which are the top five nonalcoholic sparkling soft drink brands Coca-Cola owns?", "answer": "The top five brands are Coca-Cola, Sprite, Fanta, Coca-Cola Zero Sugar, and Diet Coke/Coca-Cola Light."},
    {"question": "What percentage of Coca-Cola’s global unit case volume in 2022 was attributed to sparkling soft drinks?", "answer": "Sparkling soft drinks represented 69% of Coca-Cola's global unit case volume in 2022."},
    {"question": "What are the main ingredients Coca-Cola uses in its beverages?", "answer": "The primary ingredients include high fructose corn syrup, sucrose, aspartame, sucralose, citric acid, phosphoric acid, caffeine, caramel color, water, and various juice concentrates."},
    {"question": "What major competitors does Coca-Cola face globally?", "answer": "Coca-Cola's primary competitors include PepsiCo, Nestlé, Keurig Dr Pepper, Suntory Beverage & Food, Unilever, Red Bull, and several regional and local brands."},
    {"question": "What are the company’s biggest markets outside the U.S.?", "answer": "The largest markets outside the U.S. in terms of unit case volumes are Mexico, China, Brazil, and India."},
    {"question": "What are some of the bottling partners of Coca-Cola?", "answer": "Major bottling partners include Coca-Cola FEMSA, Coca-Cola Europacific Partners, Coca-Cola HBC AG, Arca Continental, and Swire Coca-Cola Limited."},
    {"question": "What sustainability goals does Coca-Cola aim to achieve by 2030?", "answer": "Coca-Cola aims to have 50% of its global leadership positions held by women and to reflect the U.S. Census' racial and ethnic representation at all job levels in the U.S."}
]


In [16]:
from llama_index.core.retrievers import AutoMergingRetriever

In [17]:
from llama_index.core.retrievers import VectorIndexAutoRetriever

## Sentenc Window Retriever

In [19]:
from llama_index.core.node_parser import SentenceWindowNodeParser
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)

In [20]:
nodes_sentence_window = node_parser.get_nodes_from_documents(documents)

In [None]:
sentence_index = VectorStoreIndex(nodes_sentence_window, embed_model=embed_model, llm=gpt35, show_progress=False)

In [22]:
sentence_query_engine = sentence_index.as_query_engine()

In [23]:
# define storage context
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.storage import StorageContext
# from llama_index import ServiceContext
# from llama_index.llms import OpenAI

docstore = SimpleDocumentStore()

# insert nodes into docstore
docstore.add_documents(nodes)

# define storage context (will include vector store by default too)
storage_context = StorageContext.from_defaults(docstore=docstore)

# service_context = ServiceContext.from_defaults(
#     llm=OpenAI(model="gpt-3.5-turbo")
# )

## Auto Merging Retriever

In [24]:
base_retriever = vector_store_index.as_retriever(similarity_top_k=6)
auto_merging_retriever = AutoMergingRetriever(base_retriever, storage_context, verbose=True)

## Auto Retriever

In [25]:
from llama_index.core.vector_stores.types import MetadataInfo, VectorStoreInfo

In [26]:
# Define the content and metadata of the PDF
vector_store_info = VectorStoreInfo(
    content_info="The Coca-Cola Company 10-K financial report for the fiscal year ending 2022",
    metadata_info=[
        MetadataInfo(
            name="report_section",
            type="str",
            description="Section of the report, one of [Business, Risk Factors, Financial Statements, Legal Proceedings]",
        ),
        MetadataInfo(
            name="fiscal_year",
            type="int",
            description="The fiscal year for which the report is created (e.g., 2022)"
        ),
        MetadataInfo(
            name="geography",
            type="str",
            description="Geographical focus in the report, one of [United States, Latin America, Europe, Asia Pacific]",
        ),
        MetadataInfo(
            name="segment",
            type="str",
            description="Operational segment discussed, one of [North America, EMEA, Latin America, Global Ventures, Bottling Investments]"
        )
    ]
)

In [27]:
auto_retriever = VectorIndexAutoRetriever(
    index=vector_store_index,  # Your pre-built vector index
    vector_store_info=vector_store_info
)

## Query Engines

In [28]:
from llama_index.core.query_engine import RetrieverQueryEngine

In [29]:
auto_merging_query_engine = RetrieverQueryEngine.from_args(auto_merging_retriever)
auto_query_engine = RetrieverQueryEngine.from_args(auto_retriever)

In [30]:
relevancy_evaluator = RelevancyEvaluator(llm=gpt35)
faithfulness_evaluator = FaithfulnessEvaluator(llm=gpt35)
correctness_evaluator = CorrectnessEvaluator(llm=gpt35)

In [31]:
from tqdm import tqdm

In [35]:
qa_list = qa_list[0:5]

## Performance: Sentence Window Retriever

In [None]:
relevancy_correct = 0
faithfulness_correct = 0
correctness_correct = 0

for pair in tqdm(qa_list):
  question = pair['question']
  answer = pair['answer']

  response_vector = sentence_query_engine.query(question)

  relevancy_eval_result = relevancy_evaluator.evaluate_response(query=question, response=response_vector)
  faithfullness_eval_result = faithfulness_evaluator.evaluate_response(response=response_vector)
  # correctness_eval_result = correctness_evaluator.evaluate(
  #   query=question,
  #   response=response_vector.response,
  #   reference=answer,
  # )

  if relevancy_eval_result.passing:
    relevancy_correct += 1
  if faithfullness_eval_result.passing:
    faithfulness_correct += 1
  # if correctness_eval_result.passing:
  #   correctness_correct += 1

relevancy_score = relevancy_correct/len(qa_list)
faithfulness_score = faithfulness_correct/len(qa_list)
# correctness_score = correctness_correct/len(qa_list)

In [37]:
import pandas as pd
df = {'mean_relevancy_score': [relevancy_score], 'mean_faithfulness_score': [faithfulness_score]}
df = pd.DataFrame(df)
df

Unnamed: 0,mean_relevancy_score,mean_faithfulness_score
0,0.8,1.0


## Performance: Auto Merging Retriever

In [None]:
relevancy_correct = 0
faithfulness_correct = 0
correctness_correct = 0

for pair in tqdm(qa_list):
  question = pair['question']
  answer = pair['answer']

  response_vector = auto_merging_query_engine.query(question)

  relevancy_eval_result = relevancy_evaluator.evaluate_response(query=question, response=response_vector)
  faithfullness_eval_result = faithfulness_evaluator.evaluate_response(response=response_vector)
  # correctness_eval_result = correctness_evaluator.evaluate(
  #   query=question,
  #   response=response_vector.response,
  #   reference=answer,
  # )

  if relevancy_eval_result.passing:
    relevancy_correct += 1
  if faithfullness_eval_result.passing:
    faithfulness_correct += 1
  # if correctness_eval_result.passing:
  #   correctness_correct += 1

relevancy_score = relevancy_correct/len(qa_list)
faithfulness_score = faithfulness_correct/len(qa_list)
# correctness_score = correctness_correct/len(qa_list)

In [40]:
import pandas as pd
df = {'mean_relevancy_score': [relevancy_score], 'mean_faithfulness_score': [faithfulness_score]}
df = pd.DataFrame(df)
df

Unnamed: 0,mean_relevancy_score,mean_faithfulness_score
0,1.0,0.6


## Performance: Auto Retriever

In [None]:
relevancy_correct = 0
faithfulness_correct = 0
correctness_correct = 0

for pair in tqdm(qa_list):
  question = pair['question']
  answer = pair['answer']

  response_vector = auto_query_engine.query(question)

  relevancy_eval_result = relevancy_evaluator.evaluate_response(query=question, response=response_vector)
  faithfullness_eval_result = faithfulness_evaluator.evaluate_response(response=response_vector)
  # correctness_eval_result = correctness_evaluator.evaluate(
  #   query=question,
  #   response=response_vector.response,
  #   reference=answer,
  # )

  if relevancy_eval_result.passing:
    relevancy_correct += 1
  if faithfullness_eval_result.passing:
    faithfulness_correct += 1
  # if correctness_eval_result.passing:
  #   correctness_correct += 1

relevancy_score = relevancy_correct/len(qa_list)
faithfulness_score = faithfulness_correct/len(qa_list)
# correctness_score = correctness_correct/len(qa_list)

In [42]:
import pandas as pd
df = {'mean_relevancy_score': [relevancy_score], 'mean_faithfulness_score': [faithfulness_score]}
df = pd.DataFrame(df)
df

Unnamed: 0,mean_relevancy_score,mean_faithfulness_score
0,0.4,0.4


## Sub Question Query Engine

In [43]:
documents_2023 = loader.load_data(file=Path('/content/drive/MyDrive/Gen AI/RAG/annual_filings/0000021344-23-000011.pdf'))
documents_2022 = loader.load_data(file=Path('/content/drive/MyDrive/Gen AI/RAG/annual_filings/0000021344-22-000009.pdf'))

In [None]:
index_2023 = VectorStoreIndex.from_documents(documents_2023)
index_2022 = VectorStoreIndex.from_documents(documents_2022)

In [45]:
engine_2023 = index_2023.as_query_engine(similarity_top_k=3)
engine_2022 = index_2022.as_query_engine(similarity_top_k=3)

In [46]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import SubQuestionQueryEngine

In [47]:
query_engine_tools = [
    QueryEngineTool(
        query_engine=engine_2023,
        metadata=ToolMetadata(name='engine_2023', description='Provides information about Coca-Cola 10-K filings for the year 2023')
    ),
    QueryEngineTool(
        query_engine=engine_2022,
        metadata=ToolMetadata(name='engine_2022', description='Provides information about Coca-Cola 10-K filings for year 2022')
    ),
]

s_engine = SubQuestionQueryEngine.from_defaults(query_engine_tools=query_engine_tools)

In [50]:
response = await s_engine.aquery("Compare Coca-Cola's revenue growth from 2020 to 2021 and 2021 to 2022.")

HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Generated 3 sub questions.
[1;3;38;2;237;90;200m[engine_2022] Q: What was Coca-Cola's revenue in 2020?
[0m[1;3;38;2;90;149;237m[engine_2022] Q: What was Coca-Cola's revenue in 2021?
[0m[1;3;38;2;11;159;203m[engine_2023] Q: What was Coca-Cola's revenue in 2022?
[0mHTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
HTTP Request: POST https://api.

In [51]:
print(response)

Coca-Cola's revenue grew by $5,641 million from 2020 to 2021 and by $4,349 million from 2021 to 2022.


In [52]:
response = await s_engine.aquery("How did Coca-Cola's operating expenses change between 2021 and 2022?")

HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Generated 2 sub questions.
[1;3;38;2;237;90;200m[engine_2022] Q: What were Coca-Cola's operating expenses in 2021?
[0m[1;3;38;2;90;149;237m[engine_2023] Q: What were Coca-Cola's operating expenses in 2022?
[0mHTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
Retrying request to /chat/completions in 20.000000 seconds
Retrying request to /chat/completions in 20.000000 seconds
HTTP Request: POST http

In [53]:
print(response)

Coca-Cola's operating expenses increased by $1,951 million from 2021 to 2022.


In [54]:
response = await s_engine.aquery("What were Coca-Cola’s major geographical market performances in 2021 versus 2022?")

HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Generated 2 sub questions.
[1;3;38;2;237;90;200m[engine_2022] Q: What were Coca-Cola’s major geographical market performances in 2021?
[0m[1;3;38;2;90;149;237m[engine_2023] Q: What were Coca-Cola’s major geographical market performances in 2022?
[0mHTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[1;3;38;2;90;149;237m[engine_2023] A: Coca-Cola's major geographical market performances in 2022 included:
- Europe, Middle East, 

In [55]:
print(response)

In 2021, Coca-Cola experienced significant growth in various geographical markets, with notable increases in unit case volume across regions such as Europe, Middle East & Africa, Latin America, North America, Asia Pacific, Global Ventures, and Bottling Investments. The growth was driven by a variety of categories including sparkling flavors, hydration, sports, coffee, tea, and nutrition. In contrast, in 2022, Coca-Cola's major geographical market performances showed a more mixed picture with varying growth rates across regions. While some regions like Latin America and Global Ventures continued to see strong growth, others experienced a slowdown or decline in certain categories like juice and plant-based beverages in Europe, Middle East & Africa.


## Note
Due to the limitations of the free 5 credits offered by OpenAI, I was unable to load more than two PDFs simultaneously or use large queries to compare performance, as this exceeded the allowed limits. I explored and tested all retrievers and query engines mentioned in the assignment and conducted a performance comparison using OpenAI embeddings. However, further testing with larger queries would require an upgrade beyond the free credits, which I am unable to do at this time.