# Creating a RAG system with the Gemini API

* Financial Q&A dataset from Kaggle - https://www.kaggle.com/datasets/yousefsaeedian/financial-q-and-a-10k

* API key for Gemini can be obtained from Google AI Studio.

In [1]:
!pip install -qU "google-genai==1.7.0" "chromadb==0.6.3"

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━[0m [32m61.4/67.3 kB[0m [31m24.9 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━[0m [32m61.4/67.3 kB[0m [31m24.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m687.7 kB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.7/144.7 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.1/611.1 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4

In [2]:
#Fetch the Financial Q&A dataset from Kaggle
import kagglehub

#dataset_download returns the local folder that holds the downloaded files
path = kagglehub.dataset_download(
    "yousefsaeedian/financial-q-and-a-10k"
)

print(f"Path to dataset files: {path}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/yousefsaeedian/financial-q-and-a-10k?dataset_version_number=1...


100%|██████████| 736k/736k [00:00<00:00, 29.7MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/yousefsaeedian/financial-q-and-a-10k/versions/1





In [3]:
!ls /root/.cache/kagglehub/datasets/yousefsaeedian/financial-q-and-a-10k/versions/1

Financial-QA-10k.csv


In [4]:
import pandas as pd

#Read the dataset's CSV file from the download folder and preview the first rows
df = pd.read_csv(f"{path}/Financial-QA-10k.csv")
df.head()

Unnamed: 0,question,answer,context,ticker,filing
0,What area did NVIDIA initially focus on before...,NVIDIA initially focused on PC graphics.,"Since our original focus on PC graphics, we ha...",NVDA,2023_10K
1,What are some of the recent applications of GP...,Recent applications of GPU-powered deep learni...,Some of the most recent applications of GPU-po...,NVDA,2023_10K
2,What significant invention did NVIDIA create i...,NVIDIA invented the GPU in 1999.,Our invention of the GPU in 1999 defined moder...,NVDA,2023_10K
3,How does NVIDIA's platform strategy contribute...,NVIDIA's platform strategy brings together har...,"NVIDIA has a platform strategy, bringing toget...",NVDA,2023_10K
4,What does NVIDIA's CUDA programming model enable?,NVIDIA's CUDA programming model opened the par...,With our introduction of the CUDA programming ...,NVDA,2023_10K


In [5]:
#Build the text that will be embedded: the answer followed by its ticker and filing labels
df['train'] = (
    df['answer']
    + ' Ticker: ' + df['ticker']
    + ". Filing: " + df['filing']
)

#Show the combined text of the first row
df.loc[0, 'train']

'NVIDIA initially focused on PC graphics. Ticker: NVDA. Filing: 2023_10K'

In [6]:
#Removing null and duplicate rows, since they can cause errors during embedding generation.
#The frame is rebound rather than mutated with inplace=True, and reset_index makes the row
#labels contiguous again so label lookups (e.g. df['train'][0]) line up with row positions.
df = df.drop_duplicates().dropna().reset_index(drop = True)

In [7]:
#Importing libraries
from google import genai
from google.genai import types
from IPython.display import Markdown

In [8]:
#Reading the API key from the Colab secrets store.
#Note - make sure the key has been added to Google Colab Secrets under the name GEMINI_API_KEY
from google.colab import userdata
GEMINI_API_KEY = userdata.get('GEMINI_API_KEY')

In [9]:
#Create the Gemini client, then print the name of every model that supports embedContent
client = genai.Client(api_key = GEMINI_API_KEY)

for m in filter(lambda info: "embedContent" in info.supported_actions, client.models.list()):
  print(m.name)

models/embedding-001
models/text-embedding-004
models/gemini-embedding-exp-03-07
models/gemini-embedding-exp


In [10]:
#Using the text-embedding-004 model to create embeddings
from chromadb import Documents, EmbeddingFunction, Embeddings
from google.api_core import retry
from google.genai import types

#Defining a helper to retry when the per-minute quota is reached
def is_retriable(e):
  """Return True when `e` is a Gemini APIError whose code is 429 or 503."""
  return isinstance(e, genai.errors.APIError) and e.code in (429,503)

#Creating a class
class GeminiEmbeddings(EmbeddingFunction):
  """Chroma embedding function that embeds text with Gemini's text-embedding-004 model.

  Set `document_mode` to True while adding documents to a collection and to False
  before querying, so the matching task type is sent with each request.
  Relies on the notebook-level `client` object for the API calls.
  """

  #Specifying whether we are working with documents or queries
  document_mode = True

  #Largest number of texts sent in a single embed_content request.
  #NOTE(review): 100 is assumed to be the per-request cap of the Gemini embedding endpoint - confirm against the API reference.
  batch_size = 100

  #The retry decorator wraps a single request, so a quota error only repeats the failing batch
  @retry.Retry(predicate = is_retriable)
  def _embed_batch(self, batch, embedding_task):
    """Embed one batch of texts with the given task type and return their vectors."""
    response = client.models.embed_content(
        model = "models/text-embedding-004",
        contents = batch,
        config = types.EmbedContentConfig(
            task_type = embedding_task,
        ),
    )
    return [e.values for e in response.embeddings]

  def __call__(self, input: Documents) -> Embeddings:
    """Embed `input` and return one embedding per text, in the same order.

    Args:
      input: The texts to embed.

    Returns:
      A list with the embedding values of every text in `input`.
    """
    #Checking the document mode
    embedding_task = "retrieval_document" if self.document_mode else "retrieval_query"

    #Sending the texts in slices of at most batch_size and concatenating the results
    embeddings = []
    for start in range(0, len(input), self.batch_size):
      embeddings.extend(self._embed_batch(input[start:start + self.batch_size], embedding_task))

    #Returning the embeddings of the documents
    return embeddings

In [11]:
#Create a database client for chromadb and populate it with the embeddings from class created above
import chromadb

db_name = 'financial_db'

embed_fn = GeminiEmbeddings() #Embedding function to be used is the class that was defined above
embed_fn.document_mode = True

#Creating a client and a collection with specified name and embedding function
chroma_client = chromadb.Client()
db = chroma_client.get_or_create_collection(name = db_name, embedding_function = embed_fn)

#Adding the documents in the database
#Only the first 100 documents are added, due to API restrictions on creating more embeddings.
#The ids are generated from the selected documents themselves, so both lists always have the
#same length even if the dataframe holds fewer than 100 rows.
gemini_docs = df['train'][:100].to_list()
db.add(documents = gemini_docs, ids = [str(i) for i in range(len(gemini_docs))])

In [12]:
#Verifying that the documents were added: the count should match the number of documents passed to db.add
db.count()

100

In [13]:
#Viewing the 1st document added (the result includes its id and its stored embedding)
db.peek(1)

{'ids': ['0'],
 'embeddings': array([[ 2.76086871e-02, -7.24821817e-04, -3.20609361e-02,
          1.89779364e-02,  2.35532802e-02, -4.18009609e-03,
          2.29011066e-02, -3.41083854e-02,  2.77577643e-03,
         -2.61388887e-02,  6.75756205e-03, -2.60492899e-02,
          8.76048133e-02,  9.71192867e-03,  3.25708166e-02,
         -5.23109846e-02,  1.36812488e-02,  1.86660495e-02,
         -1.36190459e-01,  2.56960765e-02,  1.28181651e-02,
         -2.83163767e-02,  5.12070581e-03, -1.64995249e-02,
         -4.71123755e-02, -4.29382212e-02, -5.81324287e-03,
         -3.41415741e-02, -9.84709337e-03, -6.44191876e-02,
          4.74068932e-02,  6.40123114e-02,  1.41468830e-02,
          4.39585047e-03, -2.88817193e-02, -2.01900490e-02,
          1.31185772e-02,  3.87931801e-02,  3.16681229e-02,
         -6.32507876e-02, -4.13327217e-02, -1.97742227e-02,
         -2.12269314e-02,  1.27656735e-04, -1.84572879e-02,
          3.33814248e-02,  1.89742743e-04,  5.44568896e-02,
         -2

In [14]:
#Leave document mode so the question is embedded with the "retrieval_query" task
embed_fn.document_mode = False

#Ask the collection for the 5 passages most relevant to the question
query = "Explain NVIDIA's CUDA programming model and when did it create the GPU?"
results = db.query(
    query_texts = [query],
    n_results = 5,
)

#A single query was sent, so 'documents' holds exactly one list of passages - unpack it
(all_passages,) = results['documents']

#Render the top-ranked passage
Markdown(all_passages[0])

NVIDIA's CUDA programming model opened the parallel processing capabilities of GPUs for general purpose computing. Ticker: NVDA. Filing: 2023_10K

In [15]:
#The relevant passages can be retrieved for the query, so they are now combined with the
#question into a prompt for Gemini to generate the final result

#Flatten the question onto a single line
query_oneline = " ".join(query.split("\n"))

#Instructions followed by the question
prompt = f"""You are a helpful and informative bot that answers questions using text from the reference passage included below.
Be sure to respond in a complete sentence, being comprehensive, including all relevant background information.
Be sure to break down complicated concepts and strike a friendly and converstional tone. If the passage is irrelevant to the answer, you may ignore it.

QUESTION: {query_oneline}"""

#Append each retrieved passage on its own line, with its line breaks flattened
prompt += "".join(
    "\nPassage: " + passage.replace("\n", " ")
    for passage in all_passages
)

print(prompt)

You are a helpful and informative bot that answers questions using text from the reference passage included below.
Be sure to respond in a complete sentence, being comprehensive, including all relevant background information.
Be sure to break down complicated concepts and strike a friendly and converstional tone. If the passage is irrelevant to the answer, you may ignore it.

QUESTION: Explain NVIDIA's CUDA programming model and when did it create the GPU?
Passage: NVIDIA's CUDA programming model opened the parallel processing capabilities of GPUs for general purpose computing. Ticker: NVDA. Filing: 2023_10K
Passage: NVIDIA invented the GPU in 1999. Ticker: NVDA. Filing: 2023_10K
Passage: The NVIDIA computing platform includes energy-efficient GPUs, data processing units (DPUs), interconnects, systems, the CUDA programming model, and a suite of software libraries, SDKs, application frameworks, and services. Ticker: NVDA. Filing: 2023_10K
Passage: The company provides a complete, end-to

In [16]:
#Send the assembled prompt to Google's gemini-2.0-flash model
answer = client.models.generate_content(model = "gemini-2.0-flash", contents = prompt)

#Render the text of the reply as Markdown
Markdown(answer.text)

Alright, let's break down NVIDIA's CUDA programming model and when they created the GPU!

NVIDIA's CUDA programming model basically unlocked the parallel processing power of GPUs, making them useful for all sorts of general computing tasks. In other words, it allows developers to use the GPU for things beyond just graphics! NVIDIA also invented the GPU itself back in 1999 and it is a key component of NVIDIA's accelerated computing platform, which also includes GPUs, DPUs, interconnects, systems, and a whole bunch of software tools like libraries and SDKs.


### Embeddings - 2nd Method
* Since there are restrictions on the number of API calls for the embedding function, we can use the sentence-transformers library in Python to create our own embeddings.
* These embeddings can then be saved via pickle and reused later, even after the session has ended, to avoid recalculating them.

In [17]:
from sentence_transformers import SentenceTransformer

#Load the all-MiniLM-L6-v2 sentence-transformers model
model = SentenceTransformer('all-MiniLM-L6-v2')

#Encode every document text in batches of 64 while showing a progress bar
embeddings = model.encode(
    df['train'].tolist(),
    batch_size = 64,
    show_progress_bar = True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/110 [00:00<?, ?it/s]

In [18]:
#Write the embeddings to embeddings.pkl in the working directory with pickle
import pickle
with open("embeddings.pkl", mode = "wb") as pkl_file:
  pickle.dump(embeddings, pkl_file)

In [19]:
#Second collection: the precomputed vectors are handed to add() together with the documents
db2 = chroma_client.get_or_create_collection(name = "financial_db2")
db2.add(
    ids = list(map(str, range(len(df)))),
    documents = df['train'].tolist(),
    embeddings = embeddings,
)

In [20]:
#Number of entries stored in the second collection
db2.count()

6990

In [21]:
#Number of rows in the dataframe, for comparison with the collection count above
len(df)

6990

In [22]:
#Search the second collection for the 5 passages closest to the question
query = "When did NVIDIA invent the GPU?"
results = db2.query(
    n_results = 5,
    query_texts = query,
)

#'documents' is a list holding one inner list of passages for the query
all_passages = results['documents']

#Render the top-ranked passage
Markdown(all_passages[0][0])

/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:01<00:00, 81.2MiB/s]


NVIDIA invented the GPU in 1999. Ticker: NVDA. Filing: 2023_10K

In [23]:
#Assemble the Gemini prompt for the second retrieval method

#Flatten the question onto a single line
query_oneline = " ".join(query.split("\n"))

#Instructions followed by the question
prompt = f"""You are a helpful and informative bot that answers questions using text from the reference passage included below.
Be sure to respond in a complete sentence, being comprehensive, including all relevant background information.
Be sure to break down complicated concepts and strike a friendly and converstional tone. If the passage is irrelevant to the answer, you may ignore it.

QUESTION: {query_oneline}"""

#Append each passage retrieved for the query on its own line, with its line breaks flattened
prompt += "".join(
    "\nPassage: " + passage.replace("\n", " ")
    for passage in all_passages[0]
)

print(prompt)

You are a helpful and informative bot that answers questions using text from the reference passage included below.
Be sure to respond in a complete sentence, being comprehensive, including all relevant background information.
Be sure to break down complicated concepts and strike a friendly and converstional tone. If the passage is irrelevant to the answer, you may ignore it.

QUESTION: When did NVIDIA invent the GPU?
Passage: NVIDIA invented the GPU in 1999. Ticker: NVDA. Filing: 2023_10K
Passage: NVIDIA leveraged its GPU architecture to create platforms for scientific computing, AI, data science, AV, robotics, metaverse, and 3D internet applications. Ticker: NVDA. Filing: 2023_10K
Passage: NVIDIA's GPU computing platform enhances productivity and introduces new capabilities for critical workflows in fields such as design and manufacturing, and digital content creation. Ticker: NVDA. Filing: 2023_10K
Passage: NVIDIA's GPUs and software are used for automation in various industries incl

In [24]:
#Creating a function to take user queries and returning the results using Gemini
def query_gemini(query:str, n_results:int = 5):
  """Answer `query` with Gemini, using passages retrieved from the `db2` collection.

  Args:
    query: The user's question. Line breaks are replaced with spaces before prompting.
    n_results: Number of passages to retrieve from the collection (defaults to 5).

  Returns:
    None. The text of the generated answer is printed.
  """
  #Obtaining the search results for user query based on number of results decided by the user
  results = db2.query(query_texts = query, n_results = n_results)
  all_passages = results['documents']

  #Crafting the prompt
  query_oneline = query.replace("\n"," ")

  #Adjacent string literals are used instead of an indented triple-quoted string, so the
  #function's own indentation does not leak into the text sent to the model
  prompt = (
      "You are a helpful and informative bot that answers questions using text from the reference passage included below.\n"
      "Be sure to respond in a complete sentence, being comprehensive, including all relevant background information.\n"
      "Be sure to break down complicated concepts and strike a friendly and converstional tone. If the passage is irrelevant to the answer, you may ignore it.\n"
      "\n"
      f"QUESTION: {query_oneline}"
  )

  #Add the retrieved passages to the prompt (a single query was sent, so only the first list is used)
  for passages in all_passages[0]:
    passage_oneline = passages.replace("\n"," ")
    prompt += f"\nPassage: {passage_oneline}"

  #Generating answer
  answer = client.models.generate_content(
      model = "gemini-2.0-flash",
      contents = prompt
  )

  print(answer.text)

In [25]:
#Example call: retrieve 5 passages for the question and print Gemini's answer
query_gemini("Provide the financial results for Nvidia in 2023", n_results = 5)

Okay, I can help you with that! According to the document, Nvidia's Graphics revenue decreased in fiscal year 2023 compared to 2022 due to lower sales to partners, which was influenced by global economic conditions and COVID-19 related disruptions in China affecting gaming demand. Also, end customer sales for NVIDIA's products in China were negatively impacted by lockdowns during fiscal year 2023, and this impact may continue if lockdowns return.

