In [1]:
import os
from pyprojroot import here
import pandas as pd
import chromadb
import google.generativeai as genai
import warnings
warnings.filterwarnings("ignore")
from dotenv import load_dotenv
print(load_dotenv())

True


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
google_api_key = os.environ["GOOGLE_API_KEY"]

In [3]:
genai.configure(api_key=google_api_key)
gemini_client = genai.GenerativeModel('gemini-1.5-flash')

chroma_client = chromadb.PersistentClient(path=str(here("data/chroma")))

**Create a collection for data injection**

Throws an error if the table already exists

In [4]:
collection = chroma_client.create_collection(name="ocean_1")

InternalError: Collection [ocean_1] already exists

In [5]:
file_dir = here("data/for_upload/ocean_1.csv")
df = pd.read_csv(file_dir, nrows=5)

In [6]:
df

Unnamed: 0,id,type,cast_id,latitude,longitude,date,time,depth,temperature,salinity,pressure,dissolved_oxygen,region
0,CTD_Southern_Indian_Ocean_1_20240317_1,CTD_Cast,CTD_Southern_Indian_Ocean_1_20240317,-29.4225,80.3244,2024-03-17,02:11:34,0.0,28.534,34.93,0.99,6.97,Southern Indian Ocean
1,CTD_Southern_Indian_Ocean_1_20240317_2,CTD_Cast,CTD_Southern_Indian_Ocean_1_20240317,-29.4225,80.3244,2024-03-17,02:11:34,92.4,19.654,34.476,10.22,5.78,Southern Indian Ocean
2,CTD_Southern_Indian_Ocean_1_20240317_3,CTD_Cast,CTD_Southern_Indian_Ocean_1_20240317,-29.4225,80.3244,2024-03-17,02:11:34,297.1,9.895,34.671,30.7,4.45,Southern Indian Ocean
3,CTD_Southern_Indian_Ocean_1_20240317_4,CTD_Cast,CTD_Southern_Indian_Ocean_1_20240317,-29.4225,80.3244,2024-03-17,02:11:34,284.0,9.079,34.414,29.36,4.7,Southern Indian Ocean
4,CTD_Southern_Indian_Ocean_1_20240317_5,CTD_Cast,CTD_Southern_Indian_Ocean_1_20240317,-29.4225,80.3244,2024-03-17,02:11:34,484.0,7.459,34.615,49.41,2.82,Southern Indian Ocean


NOTE: Process in chunks if dataset is big.

In [16]:
import google.generativeai as genai
import os

# Configure the API key
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

docs = []
metadatas = []
ids = []
embeddings = []

for index, row in df.iterrows():
    output_str = ""
    # Treat each row as a separate chunk
    for col in df.columns:
        output_str += f"{col}: {row[col]},\n"
    
    # Use Google's embedding model
    response = genai.embed_content(
        model="models/embedding-001",  # Google's text embedding model
        content=output_str,
        task_type="retrieval_document"  # or "retrieval_query", "semantic_similarity", etc.
    )
    
    embeddings.append(response['embedding'])
    docs.append(output_str)
    metadatas.append({"source": "ocean_1"})
    ids.append(f"id{index}")

In [17]:
docs

['id: CTD_Southern_Indian_Ocean_1_20240317_1,\ntype: CTD_Cast,\ncast_id: CTD_Southern_Indian_Ocean_1_20240317,\nlatitude: -29.4225,\nlongitude: 80.3244,\ndate: 2024-03-17,\ntime: 02:11:34,\ndepth: 0.0,\ntemperature: 28.534,\nsalinity: 34.93,\npressure: 0.99,\ndissolved_oxygen: 6.97,\nregion: Southern Indian Ocean,\n',
 'id: CTD_Southern_Indian_Ocean_1_20240317_2,\ntype: CTD_Cast,\ncast_id: CTD_Southern_Indian_Ocean_1_20240317,\nlatitude: -29.4225,\nlongitude: 80.3244,\ndate: 2024-03-17,\ntime: 02:11:34,\ndepth: 92.4,\ntemperature: 19.654,\nsalinity: 34.476,\npressure: 10.22,\ndissolved_oxygen: 5.78,\nregion: Southern Indian Ocean,\n',
 'id: CTD_Southern_Indian_Ocean_1_20240317_3,\ntype: CTD_Cast,\ncast_id: CTD_Southern_Indian_Ocean_1_20240317,\nlatitude: -29.4225,\nlongitude: 80.3244,\ndate: 2024-03-17,\ntime: 02:11:34,\ndepth: 297.1,\ntemperature: 9.895,\nsalinity: 34.671,\npressure: 30.7,\ndissolved_oxygen: 4.45,\nregion: Southern Indian Ocean,\n',
 'id: CTD_Southern_Indian_Ocean_1_2

In [18]:
print(metadatas)
print(ids)

[{'source': 'ocean_1'}, {'source': 'ocean_1'}, {'source': 'ocean_1'}, {'source': 'ocean_1'}, {'source': 'ocean_1'}]
['id0', 'id1', 'id2', 'id3', 'id4']


In [19]:
embeddings[0][:10]

[-0.0053378525,
 -0.0152130965,
 -0.06014165,
 0.0003721821,
 0.03362041,
 0.031531416,
 0.0503816,
 -0.016563421,
 0.029114323,
 0.027425164]

In [21]:
collection.add(
    documents=docs,
    metadatas=metadatas,
    embeddings=embeddings,
    ids=ids
)

NameError: name 'collection' is not defined

Verify the vectorDB creation

In [22]:
print("Number of vectors in vectordb:", collection.count())

NameError: name 'collection' is not defined

### RAG

In [8]:
import google.generativeai as genai

In [10]:
model_name = "gemini-1.5-flash"
google_api_key = os.environ["GOOGLE_API_KEY"]

In [11]:
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
gemini_client = genai.GenerativeModel('gemini-1.5-flash')

**Perform similarity search**

In [12]:
query_texts = "what's the average depth of id"
response = genai.embed_content(
    model="models/embedding-001",
    content=query_texts
)
query_embeddings = response['embedding']

**Load the chromaDB collection for vector search**

In [23]:
vectordb = chroma_client.get_collection(name="ocean_1")
vectordb.count()

0

In [16]:
results = vectordb.query(
    query_embeddings = query_embeddings,
    n_results=1 #top_k
)

results

{'ids': [[]],
 'embeddings': None,
 'documents': [[]],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[]],
 'distances': [[]]}

Pass the results to an LLM

In [17]:
system_role = "You will recieve the user's question along with the search results of that question over a database. Give the user the proper answer."
prompt = f"User's question: {query_texts} \n\n Search results:\n {results}"

messages = [
    {"role": "system", "content": str(
        system_role
        )},
    {"role": "user", "content": prompt}
]

In [18]:
import google.generativeai as genai
import os

# Configure the API key
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

# Create the model
model = genai.GenerativeModel('gemini-1.5-flash')  # or 'gemini-pro-vision' for multimodal

# Generate response
response = model.generate_content(messages)

KeyError: "Unable to determine the intended type of the `dict`. For `Content`, a 'parts' key is expected. For `Part`, either an 'inline_data' or a 'text' key is expected. For `Blob`, both 'mime_type' and 'data' keys are expected. However, the provided dictionary has the following keys: ['role', 'content']"

In [19]:
response.choices[0].message.content

AttributeError: 'dict' object has no attribute 'choices'

**Fact check**

In [20]:
df

Unnamed: 0,id,type,cast_id,latitude,longitude,date,time,depth,temperature,salinity,pressure,dissolved_oxygen,region
0,CTD_Southern_Indian_Ocean_1_20240317_1,CTD_Cast,CTD_Southern_Indian_Ocean_1_20240317,-29.4225,80.3244,2024-03-17,02:11:34,0.0,28.534,34.93,0.99,6.97,Southern Indian Ocean
1,CTD_Southern_Indian_Ocean_1_20240317_2,CTD_Cast,CTD_Southern_Indian_Ocean_1_20240317,-29.4225,80.3244,2024-03-17,02:11:34,92.4,19.654,34.476,10.22,5.78,Southern Indian Ocean
2,CTD_Southern_Indian_Ocean_1_20240317_3,CTD_Cast,CTD_Southern_Indian_Ocean_1_20240317,-29.4225,80.3244,2024-03-17,02:11:34,297.1,9.895,34.671,30.7,4.45,Southern Indian Ocean
3,CTD_Southern_Indian_Ocean_1_20240317_4,CTD_Cast,CTD_Southern_Indian_Ocean_1_20240317,-29.4225,80.3244,2024-03-17,02:11:34,284.0,9.079,34.414,29.36,4.7,Southern Indian Ocean
4,CTD_Southern_Indian_Ocean_1_20240317_5,CTD_Cast,CTD_Southern_Indian_Ocean_1_20240317,-29.4225,80.3244,2024-03-17,02:11:34,484.0,7.459,34.615,49.41,2.82,Southern Indian Ocean
