# Notebook Setup
<a target="_blank" href="https://colab.research.google.com/github/PacktPublishing/Generative-AI-Integration-Patterns-1E/blob/main/04-Real-Time-QA-RAG/Chapter_8_Integration_pattern_Real_Time_retrieval_augmented_generation.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [None]:
#Install dependencies

!pip install --upgrade google-cloud-aiplatform
!pip install --upgrade langchain_community langchain_google_vertexai langchain_chroma unstructured[pdf]

Collecting langchain_community
  Downloading langchain_community-0.2.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain_google_vertexai
  Downloading langchain_google_vertexai-1.0.4-py3-none-any.whl (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.7/57.7 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain_chroma
  Downloading langchain_chroma-0.1.1-py3-none-any.whl (8.5 kB)
Collecting unstructured[pdf]
  Downloading unstructured-0.14.3-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.6-py3-none-any.whl (28 kB)
Collecting langchain<0.3.0,>=0.2.0 (from langchain_community)
  Downloading langchain-0.2.1-py3-none-any.whl (973 kB

In [None]:
# Authenticate the Colab user.
# NOTE(review): presumably these credentials are what the Vertex AI calls in
# later cells rely on — confirm when running outside Colab.
from google.colab import auth as google_auth
google_auth.authenticate_user()

In [None]:
import base64
import json

#VertexAI
import vertexai
from vertexai.generative_models import GenerativeModel, Part, FinishReason
import vertexai.preview.generative_models as generative_models
from google.cloud import aiplatform

# Langchain
from langchain_community.document_loaders import TextLoader, UnstructuredPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from unstructured.cleaners.core import clean_extra_whitespace
from langchain_google_vertexai import VertexAIEmbeddings


#Markdown
from IPython.display import display, Markdown, Latex


In [None]:
# Notebook-wide settings, editable through the Colab form fields.
# NOTE(review): PROJECT looks like the author's own project ID — replace it
# with yours before running.
# PROJECT / LOCATION: passed to aiplatform.init().
PROJECT = "testproject-410220"#@param {type:"string"}
LOCATION = "us-central1"#@param {type:"string"}
# MODEL: name handed to GenerativeModel() inside generate().
MODEL = "gemini-1.5-flash-001"#@param {type:"string"}
# EMBEDDINGS_MODEL: name handed to VertexAIEmbeddings().
EMBEDDINGS_MODEL = "text-embedding-004"#@param {type:"string"}
# MAX_RESULTS: number of chunks (k) requested from each similarity search.
MAX_RESULTS = 4#@param {type:"number"}

# Vector database initialization and ingestion


In [None]:
#@title Dataset Download

# Fetch the transcript PDF and save it under the local name that the loader
# cell reads.
# NOTE(review): the URL points at the Q1 2024 earnings call while the local
# file name says 2023; the name is kept because later cells refer to it.
!wget https://d1io3yog0oux5.cloudfront.net/_c38ec26158c6d5493f3fce02d606a6a1/cocacolacompany/db/764/8109/file/CORRECTED+TRANSCRIPT_+The+Coca-Cola+Co.%28KO-US%29%2C+Q1+2024+Earnings+Call%2C+30-April-2024+8_30+AM+ET.pdf -O coca_cola_earnings_call_2023.pdf

--2024-05-30 18:53:30--  https://d1io3yog0oux5.cloudfront.net/_c38ec26158c6d5493f3fce02d606a6a1/cocacolacompany/db/764/8109/file/CORRECTED+TRANSCRIPT_+The+Coca-Cola+Co.%28KO-US%29%2C+Q1+2024+Earnings+Call%2C+30-April-2024+8_30+AM+ET.pdf
Resolving d1io3yog0oux5.cloudfront.net (d1io3yog0oux5.cloudfront.net)... 13.225.95.176, 13.225.95.4, 13.225.95.203, ...
Connecting to d1io3yog0oux5.cloudfront.net (d1io3yog0oux5.cloudfront.net)|13.225.95.176|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 300972 (294K) [application/pdf]
Saving to: ‘coca_cola_earnings_call_2023.pdf’


2024-05-30 18:53:31 (335 KB/s) - ‘coca_cola_earnings_call_2023.pdf’ saved [300972/300972]



In [None]:
# Read the PDF (post-processing each element with clean_extra_whitespace)
# and cut it into overlapping chunks.
loader = UnstructuredPDFLoader(
    "coca_cola_earnings_call_2023.pdf",
    post_processors=[clean_extra_whitespace],
)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2048,
    chunk_overlap=200,
)
# Despite its name, `pages` holds the chunks produced by the splitter.
pages = loader.load_and_split(text_splitter=text_splitter)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
# Check what is in the chunks: print the second chunk (index 1) as a sample.
print(pages[1])

page_content="All participants will be on listen-only mode until the formal question-and-answer portion of the call. I would like to remind everyone that the purpose of this conference is to talk with investors and, therefore, questions from the media will not be addressed. Media participants should contact Coca-Cola's Media Relations department if they have any questions.\n\nI would now like to introduce Ms. Robin Halpern, Vice President and Head of Investor Relations. Ms. Halpern, you may now begin. ..................................................................................................................................................................................................................................................................... Robin Halpern Vice President & Head-Investor Relations, The Coca-Cola Co.\n\nGood morning, and thank you for joining us. I am here with: James Quincey, our Chairman and Chief Executive Officer; and John Murphy, our President and Ch

In [None]:
# Init the Vertex AI platform with the configured project and location.
aiplatform.init(project=PROJECT, location=LOCATION)
# Embedding function that is handed to Chroma when the chunks are ingested.
embeddings_function = VertexAIEmbeddings(model=EMBEDDINGS_MODEL)

In [None]:
# Create a local instance of ChromaDB
from langchain_chroma import Chroma

# Generate embeddings for the chunks and load them into ChromaDB
db = Chroma.from_documents(pages, embeddings_function)

In [None]:
# Test query: run a similarity search for a sample question, requesting
# MAX_RESULTS chunks.
query = "Who is the call for?"
docs = db.similarity_search(query,k=MAX_RESULTS)

# Print the text of the first result
print(docs[0].page_content)

All participants will be on listen-only mode until the formal question-and-answer portion of the call. I would like to remind everyone that the purpose of this conference is to talk with investors and, therefore, questions from the media will not be addressed. Media participants should contact Coca-Cola's Media Relations department if they have any questions.

I would now like to introduce Ms. Robin Halpern, Vice President and Head of Investor Relations. Ms. Halpern, you may now begin. ..................................................................................................................................................................................................................................................................... Robin Halpern Vice President & Head-Investor Relations, The Coca-Cola Co.

Good morning, and thank you for joining us. I am here with: James Quincey, our Chairman and Chief Executive Officer; and John Murphy, our President and Chief Financial Offi

In [None]:
#@title RAG Logic

# Prompt for the RAG step: the model is instructed to answer the user's
# question using ONLY the retrieved context. The {context} and {query}
# placeholders are filled in with str.format() before the prompt is sent.
# NOTE(review): the persona line describes a financial-services assistant,
# while the indexed document is an earnings-call transcript — confirm this is
# intended.
prompt_template = """
You are a helpful assistant for an online financial services company that allows users to check their balances, invest in certificates of deposit (CDs), and perform other financial transactions.

Your task is to answer questions from your customers, in order to do so follow these rules:

1. Carefully analyze the question you received.
2. Carefully analyze the context provided.
3. Answer the question using ONLY the information provided in the context, NEVER make up information
4. Always think step by step.

<context>
{context}
</context>
User question: {query}
Answer:
"""

# Generation parameters passed to generate_content() by generate().
generation_config = {
    "max_output_tokens": 8192,
    "temperature": 0,
    "top_p": 0.95,
}

# Safety settings passed to generate_content() by generate(); every listed
# harm category uses the BLOCK_ONLY_HIGH threshold.
safety_settings = {
    generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH,
    generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH,
    generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH,
    generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH,
}

def get_context(query, db, number_of_results):
  """Build the context section of the prompt from the retrieved chunks.

  Args:
    query: Text used for the similarity search.
    db: Vector store exposing ``similarity_search(query, k=...)``.
    number_of_results: Number of chunks requested from the search (``k``).

  Returns:
    One string with an entry per retrieved chunk. Each entry names the
    document the chunk came from and is delimited by ``---`` markers. An
    empty string is returned when the search yields nothing.
  """
  docs = db.similarity_search(query, k=number_of_results)
  sections = []
  for doc in docs:
    # Not every document carries a "source" entry; fall back to a placeholder
    # instead of raising KeyError in the middle of answering a question.
    source = (doc.metadata or {}).get("source", "unknown source")
    sections.append(
        f"\n---This information is contained in a document called {source} \n\n {doc.page_content}\n\n---"
    )
  # Join once rather than growing a string inside the loop.
  return "".join(sections)


def generate(prompt):
  """Send a single prompt to the model named by MODEL.

  Uses the module-level ``generation_config`` and ``safety_settings`` and
  requests a non-streaming response.

  Args:
    prompt: Full prompt text; it is sent as the only item of the contents list.

  Returns:
    Whatever ``GenerativeModel.generate_content`` returns for the request.
  """
  llm = GenerativeModel(MODEL)
  request_options = dict(
      generation_config=generation_config,
      safety_settings=safety_settings,
      stream=False,
  )
  return llm.generate_content([prompt], **request_options)



# Entry Point

In [None]:
# Simulate the input that would arrive from a chat interface.

question = "What is this call about?"

# Prompt Preprocessing

In [None]:
# In this step we query the vector database with the question received, and
# then populate the prompt template with both the question and the context.
context = get_context(question, db, MAX_RESULTS)
prompt = prompt_template.format(query=question, context=context)

In [None]:
# Show the fully populated prompt that is submitted to the model in the
# Inference section.
print(prompt)


You are a helpful assistant for an online financial services company that allows users to check their balances, invest in certificates of deposit (CDs), and perform other financial transactions.

Your task is to answer questions from your customers, in order to do so follow these rules:

1. Carefully analyze the question you received.
2. Carefully analyze the context provided.
3. Answer the question using ONLY the information provided in the context, NEVER make up information
4. Always think step by step.

<context>

---This information is contained in a document called coca_cola_earnings_call_2023.pdf 

 1-877-FACTSET www.callstreet.com

18 Copyright © 2001-2024 FactSet CallStreet, LLC

The Coca-Cola Co. (KO) Q1 2024 Earnings Call

Corrected Transcript 30-Apr-2024

Operator: Ladies and gentlemen, this concludes today's conference call. Thank you for participating. You may now disconnect.

Disclaimer The information herein is based on sources we believe to be reliable but is not guara

# Inference

In [None]:
# Submit the full prompt (instructions, retrieved context and question) to the LLM.
result = generate(prompt)

# Result Postprocessing

In [None]:
# Format the answer as Markdown: question and answer headings, followed by the
# retrieved context inside a collapsible <details> element.
# ATX headings need a space after the '#' run; without it CommonMark renderers
# show "###Question:" as literal text instead of a heading.
formatted_result = f"### Question:\n{question}\n\n### Answer:\n{result.text}\n\n<details><summary>Context</summary>{context}</details>"

# Result Presentation

In [None]:
# Render the Markdown-formatted result in the cell output.
display(Markdown(formatted_result))

###Question:
What is this call about?

###Answer:
This call is about The Coca-Cola Company's First Quarter 2024 Earnings Results. 


<details><summary>Context</summary>
---This information is contained in a document called coca_cola_earnings_call_2023.pdf 

 1-877-FACTSET www.callstreet.com

18 Copyright © 2001-2024 FactSet CallStreet, LLC

The Coca-Cola Co. (KO) Q1 2024 Earnings Call

Corrected Transcript 30-Apr-2024

Operator: Ladies and gentlemen, this concludes today's conference call. Thank you for participating. You may now disconnect.

Disclaimer The information herein is based on sources we believe to be reliable but is not guaranteed by us and does not purport to be a complete or error-free statement or summary of the available data. As such, we do not warrant, endorse or guarantee the completeness, accuracy, integrity, or timeliness of the information. You must evaluate, and bear all risks associated with, the use of any information provided hereunder, including any reliance on the accuracy, completeness, safety or usefulness of such information. This information is not intended to be used as the prim ary basis of investment decisions. It should not be construed as advice designed to meet the particular investment needs of any investor. This report is published solely for information purposes, and is not to be construed as financial or other advice or as an offer to sell or the solicitation of an offer to buy any security in any state where such an offer or solicitation would be illegal. Any information expressed herein on this date is subject to change without notice. Any opinions or assertions contained in this i nformation do not represent the opinions or beliefs of FactSet CallStreet, LLC. FactSet CallStreet, LLC, or one or more of its employees, including the writer of this report, may have a position in any of the securities discussed herein.

---
---This information is contained in a document called coca_cola_earnings_call_2023.pdf 

 All participants will be on listen-only mode until the formal question-and-answer portion of the call. I would like to remind everyone that the purpose of this conference is to talk with investors and, therefore, questions from the media will not be addressed. Media participants should contact Coca-Cola's Media Relations department if they have any questions.

I would now like to introduce Ms. Robin Halpern, Vice President and Head of Investor Relations. Ms. Halpern, you may now begin. ..................................................................................................................................................................................................................................................................... Robin Halpern Vice President & Head-Investor Relations, The Coca-Cola Co.

Good morning, and thank you for joining us. I am here with: James Quincey, our Chairman and Chief Executive Officer; and John Murphy, our President and Chief Financial Officer.

We've posted schedules under Financial Information in the Investor section of our company website. These reconcile certain non-GAAP financial measures that may be referred to this morning to results as reported under generally accepted accounting principles. You can also find schedules in the same section of our website that provide an analysis of our growth and operating margins.

This call may contain forward-looking statements, including statements concerning long-term earnings objectives, which should be considered in conjunction with cautionary statements contained in our earnings release and in the company's periodic SEC reports.

Following prepared remarks, we will take your questions. Please limit yourself to one question. Reenter the queue to ask any follow-ups.

---
---This information is contained in a document called coca_cola_earnings_call_2023.pdf 

 30-Apr-2024 The Coca-Cola Co. (KO) Q1 2024 Earnings Call

1-877-FACTSET www.callstreet.com

Corrected Transcript

Total Pages: 19 Copyright © 2001-2024 FactSet CallStreet, LLC

The Coca-Cola Co. (KO) Q1 2024 Earnings Call

Corrected Transcript 30-Apr-2024

CORPORATE PARTICIPANTS

Robin Halpern Vice President & Head-Investor Relations, The Coca-Cola Co.

John Murphy President & Chief Financial Officer, The Coca-Cola Co.

James Quincey Chairman & Chief Executive Officer, The Coca-Cola Co.

.....................................................................................................................................................................................................................................................................

OTHER PARTICIPANTS

Bryan D. Spillane Analyst, BofA Securities, Inc.

Peter Grom Analyst, UBS Securities LLC

Dara Mohsenian Analyst, Morgan Stanley & Co. LLC

William B. Chappell Analyst, Truist Securities, Inc.

Lauren R. Lieberman Analyst, Barclays Capital, Inc.

Carlos Laboy Analyst, HSBC Securities (USA), Inc.

Steve Powers Analyst, Deutsche Bank Securities, Inc.

Robert Moskow Analyst, TD Cowen

Bonnie Herzog Analyst, Goldman Sachs & Co. LLC

Robert Ottenstein Analyst, Evercore ISI

Andrea Teixeira Analyst, JPMorgan Securities LLC

Callum Elliott Analyst, Bernstein Autonomous LLP

Chris Carey Analyst, Wells Fargo Securities LLC

Brett John Cooper Analyst, Consumer Edge Research LLC

Filippo Falorni Analyst, Citigroup Global Markets, Inc.

1-877-FACTSET www.callstreet.com

2 Copyright © 2001-2024 FactSet CallStreet, LLC

The Coca-Cola Co. (KO) Q1 2024 Earnings Call

Corrected Transcript 30-Apr-2024

MANAGEMENT DISCUSSION SECTION

Operator: At this time, I'd like to welcome everyone to The Coca-Cola Company's First Quarter 2024 Earnings Results Conference Call. Today's call is being recorded. If you have any objections, please disconnect at this time.

---
---This information is contained in a document called coca_cola_earnings_call_2023.pdf 

 Operator: Our next question comes from Robert Moskow from TD Cowen. Please go ahead. Your line is open. .....................................................................................................................................................................................................................................................................

Robert Moskow Analyst, TD Cowen

Q

Hi, there. Just want to ask a couple of clarifying questions. James, I think on the last earnings call you were very clear that you viewed the business - .....................................................................................................................................................................................................................................................................

James Quincey Chairman & Chief Executive Officer, The Coca-Cola Co.

A

1-877-FACTSET www.callstreet.com

15 Copyright © 2001-2024 FactSet CallStreet, LLC

The Coca-Cola Co. (KO) Q1 2024 Earnings Call

Corrected Transcript 30-Apr-2024

Can you speak up? We can't hear you. .....................................................................................................................................................................................................................................................................

Robert Moskow Analyst, TD Cowen

Q

My apologies. Can you hear me now? .....................................................................................................................................................................................................................................................................

James Quincey Chairman & Chief Executive Officer, The Coca-Cola Co.

A

Yes. .....................................................................................................................................................................................................................................................................

---</details>

# Demo

In [None]:
#In this case we will use a Gradio interface to interact with the system

#Install Gradio

!pip install --upgrade gradio

Collecting gradio
  Downloading gradio-4.31.5-py3-none-any.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m46.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.3.2.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client==0.16.4 (from gradio)
  Downloading gradio_client-0.16.4-py3-none-any.whl (315 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.9/315.9 kB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.4.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
[?25hCol

In [None]:
import gradio as gr

def answer_question(query, db, number_of_results):
  """Run retrieval and generation for one question and return the answer text.

  Args:
    query: The user's question.
    db: Vector store forwarded to ``get_context``.
    number_of_results: Number of chunks to retrieve as context.

  Returns:
    The ``text`` attribute of the response produced by ``generate``.
  """
  retrieved_context = get_context(query, db, number_of_results)
  full_prompt = prompt_template.format(query=query, context=retrieved_context)
  return generate(full_prompt).text

def chat(message, history):
    """Gradio submit handler: answer `message` and record the exchange.

    Args:
        message: Text the user typed into the textbox.
        history: List of (message, response) pairs shown in the chatbot; the
            new pair is appended to this list in place.

    Returns:
        A tuple of an empty string (written back to the textbox, clearing it)
        and the updated history.
    """
    exchange = (message, answer_question(message, db, MAX_RESULTS))
    history.append(exchange)
    return "", history


# Assemble the demo UI: a title, the chat transcript and a textbox for questions.
with gr.Blocks() as demo:
  gr.Markdown("Fintech Assistant")
  chatbot = gr.Chatbot(show_label=False)
  message = gr.Textbox(placeholder="Enter your question")
  # When the textbox is submitted, chat() receives (textbox value, history)
  # and its two return values are written back to the same two components.
  message.submit(
      fn=chat,
      inputs=[message, chatbot],
      outputs=[message, chatbot],
  )

# With debug=True the cell keeps running so errors and logs stay visible;
# interrupt the cell to stop the server.
demo.launch(debug=True)

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://0bd3d4641e73ef6f9a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://0bd3d4641e73ef6f9a.gradio.live


