In [3]:
import os
from pyprojroot import here
import pandas as pd
import chromadb
import google.generativeai as genai
import warnings
warnings.filterwarnings("ignore")
from dotenv import load_dotenv
print(load_dotenv())

True


In [4]:
google_api_key = os.environ["GOOGLE_API_KEY"]

In [5]:
genai.configure(api_key=google_api_key)
gemini_client = genai.GenerativeModel('gemini-1.5-flash')

chroma_client = chromadb.PersistentClient(path=str(here("data/chroma")))

**Create a collection for data injection**

Throws an error if the table already exists

In [None]:
collection = chroma_client.create_collection(name="ocean")

InternalError: Collection [ocean_2] already exists

In [8]:
file_dir = here("data/for_upload/ocean_2.csv")
df = pd.read_csv(file_dir, nrows=5)

In [16]:
df.head()

Unnamed: 0,Location,Depth,Temperature,Salinity,Pressure,Dissolved Oxygen,Sea Level,Tsunami Risk Level,Conductivity
0,Paradip Coast,2628.04,15.98,31.67,359135.72,7.49,1.085,Medium,32.03698
1,Haldia Coast,6655.49,1.55,31.68,754228.57,5.2,1.316,Low,54.468067
2,Daman Coast,5126.64,10.1,30.37,604248.38,2.95,1.387,Low,51.024729
3,Kochi Coast,4194.62,4.03,37.36,512817.22,6.18,1.42,Low,37.096336
4,Kochi Coast,1100.57,1.9,36.64,209290.92,6.76,0.543,High,34.865852


NOTE: Process in chunks if dataset is big.

In [30]:
import google.generativeai as genai
import os
import pandas as pd
import time

# --------------------------
# 1. Load your dataset
# --------------------------
file_dir = here("data/for_upload/ocean_2.csv")
df = pd.read_csv(file_dir, nrows=5)
# --------------------------
# 2. Configure Gemini API
# --------------------------
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

# --------------------------
# 3. Prepare rows as text
# --------------------------
MAX_CHARS = 8000  # keep text under safe token limit
contents = []

for _, row in df.iterrows():
    output_str = "".join([f"{col}: {row[col]},\n" for col in df.columns])
    contents.append(output_str[:MAX_CHARS])

# --------------------------
# 4. Embed in safe batches
# --------------------------
batch_size = 10   # well under API limit (max = 16)
embeddings = []
docs = []
metadatas = []
ids = []

for i in range(0, len(contents), batch_size):
    batch = contents[i:i+batch_size]

    response = genai.embed_content(
        model="models/embedding-001",
        content=batch,
        task_type="retrieval_document"
    )

    for j, emb in enumerate(response['embeddings']):
        embeddings.append(emb['embedding'])
        docs.append(batch[j])
        metadatas.append({"source": "ocean_2"})
        ids.append(f"id{i+j}")

    # optional: add delay if dataset is large
    time.sleep(0.2)

print(f"✅ Generated {len(embeddings)} embeddings successfully!")


ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
* Quota exceeded for metric: generativelanguage.googleapis.com/embed_content_free_tier_requests, limit: 0
* Quota exceeded for metric: generativelanguage.googleapis.com/embed_content_free_tier_requests, limit: 0
* Quota exceeded for metric: generativelanguage.googleapis.com/embed_content_free_tier_requests, limit: 0
* Quota exceeded for metric: generativelanguage.googleapis.com/embed_content_free_tier_requests, limit: 0 [violations {
  quota_metric: "generativelanguage.googleapis.com/embed_content_free_tier_requests"
  quota_id: "EmbedContentRequestsPerDayPerProjectPerModel-FreeTier"
}
violations {
  quota_metric: "generativelanguage.googleapis.com/embed_content_free_tier_requests"
  quota_id: "EmbedContentRequestsPerDayPerUserPerProjectPerModel-FreeTier"
}
violations {
  quota_metric: "generativelanguage.googleapis.com/embed_content_free_tier_requests"
  quota_id: "EmbedContentRequestsPerMinutePerUserPerProjectPerModel-FreeTier"
}
violations {
  quota_metric: "generativelanguage.googleapis.com/embed_content_free_tier_requests"
  quota_id: "EmbedContentRequestsPerMinutePerProjectPerModel-FreeTier"
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
]

In [None]:
docs

[]

In [None]:
print(metadatas)
print(ids)

[]
[]


In [None]:
embeddings[0][:10]

IndexError: list index out of range

In [None]:
collection.add(
    documents=docs,
    metadatas=metadatas,
    embeddings=embeddings,
    ids=ids
)

ValueError: Expected Embeddings to be non-empty list or numpy array, got [] in add.

Verify the vectorDB creation

In [None]:
print("Number of vectors in vectordb:", collection.count())

Number of vectors in vectordb: 0


### RAG

In [None]:
import google.generativeai as genai

In [None]:
model_name = "gemini-1.5-flash"
google_api_key = os.environ["GOOGLE_API_KEY"]

In [None]:
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
gemini_client = genai.GenerativeModel('gemini-1.5-flash')

**Perform similarity search**

In [None]:
query_texts = "what's the average depth of id"
response = genai.embed_content(
    model="models/embedding-001",
    content=query_texts
)
query_embeddings = response['embedding']

ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
* Quota exceeded for metric: generativelanguage.googleapis.com/embed_content_free_tier_requests, limit: 0
* Quota exceeded for metric: generativelanguage.googleapis.com/embed_content_free_tier_requests, limit: 0
* Quota exceeded for metric: generativelanguage.googleapis.com/embed_content_free_tier_requests, limit: 0
* Quota exceeded for metric: generativelanguage.googleapis.com/embed_content_free_tier_requests, limit: 0 [violations {
  quota_metric: "generativelanguage.googleapis.com/embed_content_free_tier_requests"
  quota_id: "EmbedContentRequestsPerMinutePerProjectPerModel-FreeTier"
}
violations {
  quota_metric: "generativelanguage.googleapis.com/embed_content_free_tier_requests"
  quota_id: "EmbedContentRequestsPerMinutePerUserPerProjectPerModel-FreeTier"
}
violations {
  quota_metric: "generativelanguage.googleapis.com/embed_content_free_tier_requests"
  quota_id: "EmbedContentRequestsPerDayPerUserPerProjectPerModel-FreeTier"
}
violations {
  quota_metric: "generativelanguage.googleapis.com/embed_content_free_tier_requests"
  quota_id: "EmbedContentRequestsPerDayPerProjectPerModel-FreeTier"
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
]

**Load the chromaDB collection for vector search**

In [None]:
vectordb = chroma_client.get_collection(name="ocean_1")
vectordb.count()

0

In [None]:
results = vectordb.query(
    query_embeddings = query_embeddings,
    n_results=1 #top_k
)

results

{'ids': [[]],
 'embeddings': None,
 'documents': [[]],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[]],
 'distances': [[]]}

Pass the results to an LLM

In [None]:
system_role = "You will recieve the user's question along with the search results of that question over a database. Give the user the proper answer."
prompt = f"User's question: {query_texts} \n\n Search results:\n {results}"

messages = [
    {"role": "system", "content": str(
        system_role
        )},
    {"role": "user", "content": prompt}
]

In [None]:
import google.generativeai as genai
import os

# Configure the API key
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

# Create the model
model = genai.GenerativeModel('gemini-1.5-flash')  # or 'gemini-pro-vision' for multimodal

# Generate response
response = model.generate_content(messages)

KeyError: "Unable to determine the intended type of the `dict`. For `Content`, a 'parts' key is expected. For `Part`, either an 'inline_data' or a 'text' key is expected. For `Blob`, both 'mime_type' and 'data' keys are expected. However, the provided dictionary has the following keys: ['role', 'content']"

In [None]:
response.choices[0].message.content

AttributeError: 'dict' object has no attribute 'choices'

**Fact check**

In [None]:
df

Unnamed: 0,id,type,cast_id,latitude,longitude,date,time,depth,temperature,salinity,pressure,dissolved_oxygen,region
0,CTD_Southern_Indian_Ocean_1_20240317_1,CTD_Cast,CTD_Southern_Indian_Ocean_1_20240317,-29.4225,80.3244,2024-03-17,02:11:34,0.0,28.534,34.93,0.99,6.97,Southern Indian Ocean
1,CTD_Southern_Indian_Ocean_1_20240317_2,CTD_Cast,CTD_Southern_Indian_Ocean_1_20240317,-29.4225,80.3244,2024-03-17,02:11:34,92.4,19.654,34.476,10.22,5.78,Southern Indian Ocean
2,CTD_Southern_Indian_Ocean_1_20240317_3,CTD_Cast,CTD_Southern_Indian_Ocean_1_20240317,-29.4225,80.3244,2024-03-17,02:11:34,297.1,9.895,34.671,30.7,4.45,Southern Indian Ocean
3,CTD_Southern_Indian_Ocean_1_20240317_4,CTD_Cast,CTD_Southern_Indian_Ocean_1_20240317,-29.4225,80.3244,2024-03-17,02:11:34,284.0,9.079,34.414,29.36,4.7,Southern Indian Ocean
4,CTD_Southern_Indian_Ocean_1_20240317_5,CTD_Cast,CTD_Southern_Indian_Ocean_1_20240317,-29.4225,80.3244,2024-03-17,02:11:34,484.0,7.459,34.615,49.41,2.82,Southern Indian Ocean
