In [2]:
# Install required packages
!pip install -U langchain langchain-google-genai pandas langchain_chroma langchain-community chromadb

Collecting langchain-google-genai
  Using cached langchain_google_genai-2.1.10-py3-none-any.whl.metadata (7.2 kB)
Collecting pandas
  Using cached pandas-2.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
Collecting langchain_chroma
  Using cached langchain_chroma-0.2.5-py3-none-any.whl.metadata (1.1 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.29-py3-none-any.whl.metadata (2.9 kB)
Collecting chromadb
  Using cached chromadb-1.0.20-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.3 kB)
Collecting filetype<2.0.0,>=1.2.0 (from langchain-google-genai)
  Using cached filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting google-ai-generativelanguage<0.7.0,>=0.6.18 (from langchain-google-genai)
  Using cached google_ai_generativelanguage-0.6.18-py3-none-any.whl.metadata (9.8 kB)
Collecting langchain-core<1.0.0,>=0.3.72 (from langchain)
  Using cached langchain_core-0.3.75-py3-none-any.whl.metadata (5.7 

In [None]:
# Command used to clear the whole Chroma DB (uncomment to force a rebuild)
#!rm -rf chroma_db

In [7]:
import os
from getpass import getpass

# Never hardcode (or commit) a real key. Keep a key that is already configured
# in the environment, and only prompt -- without echoing -- when none is set.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("Google API key: ")


In [8]:
import os

import numpy as np
import pandas as pd
from geopy.distance import geodesic
from geopy.exc import GeopyError
from geopy.geocoders import Nominatim
from langchain.schema import Document
from langchain_community.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings


class SimpleRagChain:
    """Answer location queries from the raw dataset, falling back to a retriever.

    ``invoke`` resolves a query in three steps:

    1. ``"lat, lon"`` text is parsed and matched against the dataset rows.
    2. Any other text is geocoded with Nominatim and matched the same way.
    3. If neither yields a position, the retriever's documents are returned.
    """

    def __init__(self, retriever, df):
        """Store the retriever and the raw dataset frame.

        Args:
            retriever: Object exposing ``invoke(query)`` that returns documents
                with a ``page_content`` attribute.
            df: DataFrame with ``lat``/``lon`` columns plus the measurement
                columns reported by ``find_nearest``.
        """
        self.retriever = retriever
        self.df = df  # keep raw data for lat/lon matching

    @staticmethod
    def _parse_coordinates(text):
        """Return ``(lat, lon)`` floats parsed from ``"lat, lon"`` text, else ``None``."""
        if "," not in text:
            return None
        try:
            lat, lon = map(float, text.split(","))
        except ValueError:
            # Not two numbers (e.g. "Paris, France"): treat as a place name.
            return None
        # geopy is expected to reject latitudes outside [-90, 90] and
        # non-finite values, so such input is handed to the geocoder instead.
        # NaN fails both comparisons.
        if not (abs(lat) <= 90.0 and abs(lon) < float("inf")):
            return None
        return lat, lon

    def invoke(self, inputs):
        """Resolve a query to nearby dataset rows or retrieved context.

        Args:
            inputs: Mapping whose ``"input"`` entry holds the query text.

        Returns:
            ``{"answer": [...]}`` with the nearest rows when a position could be
            determined, otherwise ``{"answer": str}`` with the retrieved
            documents' text joined by newlines.
        """
        user_input = inputs.get("input", "")

        # Case 1: User entered lat/lon. Only the parsing is best-effort; errors
        # raised while searching the dataset are real failures and propagate.
        coordinates = self._parse_coordinates(user_input)
        if coordinates is not None:
            return self.find_nearest(*coordinates)

        # Case 2: User entered a place name. A geocoder failure (timeout,
        # service unavailable, ...) must not prevent the fallback below.
        try:
            location = Nominatim(user_agent="rag_location_app").geocode(user_input)
        except GeopyError:
            location = None
        if location:
            return self.find_nearest(location.latitude, location.longitude)

        # Case 3: Fallback to vector similarity search. ``invoke`` replaces the
        # deprecated ``get_relevant_documents``.
        docs = self.retriever.invoke(user_input)
        context = "\n".join(doc.page_content for doc in docs)
        return {"answer": context}

    def find_nearest(self, lat, lon, top_k=3):
        """Find the ``top_k`` dataset rows closest to ``(lat, lon)``.

        Distances are geodesic kilometres computed from each row's ``lat`` and
        ``lon``. They are held in a local Series so ``self.df`` is not modified.

        Args:
            lat: Query latitude.
            lon: Query longitude.
            top_k: Maximum number of rows to return.

        Returns:
            ``{"answer": [row_dict, ...]}`` ordered nearest first; each dict
            carries the row's values plus ``distance_km`` formatted to two
            decimals. The list is empty when the dataset has no rows.
        """
        origin = (lat, lon)
        # Float conversion is only for the distance math; the returned values
        # stay as stored in the frame.
        distances_km = pd.Series(
            [
                geodesic(origin, (float(row_lat), float(row_lon))).km
                for row_lat, row_lon in zip(self.df["lat"], self.df["lon"])
            ],
            dtype=float,
        )

        results = []
        # The Series has a default positional index, so the labels returned by
        # nsmallest are row positions in self.df.
        for position, distance in distances_km.nsmallest(top_k).items():
            row = self.df.iloc[position]
            results.append({
                "lat": str(row["lat"]),
                "lon": str(row["lon"]),
                "year": row["year"],
                "month": row["month"],
                "sst": row["sst"],
                "poc": row["poc"],
                "pic": row["pic"],
                "aot_862": row["aot_862"],
                "chlor_a": row["chlor_a"],
                "Kd_490": row["Kd_490"],
                "distance_km": f"{distance:.2f}"
            })
        return {"answer": results}


# Columns copied verbatim into each document's metadata.
_DOCUMENT_FIELDS = (
    "lat", "lon", "year", "month", "sst", "poc", "pic",
    "aot_862", "chlor_a", "Kd_490",
)


def _row_to_document(row):
    """Turn one dataset row (column name -> string value) into a Document."""
    return Document(
        page_content=(
            f"Location: ({row['lat']}, {row['lon']}), "
            f"Year: {row['year']}, Month: {row['month']}, "
            f"SST: {row['sst']}°C, POC: {row['poc']}, PIC: {row['pic']}, "
            f"AOT_862: {row['aot_862']}, Chlor_a: {row['chlor_a']}, "
            f"Kd_490: {row['Kd_490']}"
        ),
        metadata={field: row[field] for field in _DOCUMENT_FIELDS},
    )


def build_vectorstore(parquet_path, persist_directory="./chroma_db", batch_size=1000):
    """Build (or reload) the Chroma vector store for a Parquet dataset.

    When ``persist_directory`` already exists and is non-empty, the store is
    opened as-is and nothing is embedded. Otherwise every row of the Parquet
    file is converted to a document and inserted in batches of at most
    ``batch_size`` rows.

    Args:
        parquet_path: Path of the Parquet file holding the dataset.
        persist_directory: Directory where Chroma keeps its data.
        batch_size: Maximum number of rows embedded per insert call.

    Returns:
        A ``SimpleRagChain`` wrapping a retriever (``k=5``) and the dataset
        with every column cast to string.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

    # Decide before constructing Chroma, which may create the directory itself.
    # NOTE(review): a store left partial by an interrupted build is treated as
    # complete here; delete the directory to force a rebuild.
    store_exists = bool(
        os.path.exists(persist_directory) and os.listdir(persist_directory)
    )
    if store_exists:
        print("📂 Loading existing vector store...")
    else:
        print("🔄 Creating new vector store...")

    # Keep raw values as strings so results echo the stored text exactly.
    df = pd.read_parquet(parquet_path).astype(str)
    vectorstore = Chroma(persist_directory=persist_directory,
                         embedding_function=embeddings)

    if not store_exists:
        # Slice by position rather than np.array_split(df, ...): that relies on
        # the deprecated DataFrame.swapaxes, and yields an undersized extra
        # batch on exact multiples plus one empty batch for an empty frame.
        starts = range(0, len(df), batch_size)
        for batch_number, start in enumerate(starts, start=1):
            batch = df.iloc[start:start + batch_size]
            vectorstore.add_documents(
                [_row_to_document(row) for row in batch.to_dict("records")]
            )
            # Newer Chroma wrappers persist automatically and may not expose
            # persist(); call it only where it exists.
            persist = getattr(vectorstore, "persist", None)
            if callable(persist):
                persist()
            print(f"✅ Batch {batch_number} stored ({len(batch)} rows)")

    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    return SimpleRagChain(retriever, df)


In [11]:
# Create the vector store from the parquet file, or load the existing one when
# the persist directory (default ./chroma_db) is already populated
rag_chain = build_vectorstore("Gold_Data/cleaned_merged_gold.parquet")

📂 Loading existing vector store...


In [12]:
# Natural-language query: a place name is geocoded, then the nearest dataset
# rows are returned with their distance in km
print(rag_chain.invoke({"input": "Mumbai"}))

{'answer': [{'lat': '17.97916', 'lon': '130.02084', 'year': '2025', 'month': '8', 'sst': '28.615', 'poc': '33.799904', 'pic': '0.00029999777', 'aot_862': '0.1113', 'chlor_a': '0.033684835', 'Kd_490': '0.030599998', 'distance_km': '6008.13'}, {'lat': '18.020826', 'lon': '130.0625', 'year': '2025', 'month': '8', 'sst': '28.859999', 'poc': '31.199905', 'pic': '0.00018999778', 'aot_862': '0.0927', 'chlor_a': '0.03573832', 'Kd_490': '0.028199999', 'distance_km': '6011.60'}, {'lat': '17.60416', 'lon': '129.97917', 'year': '2025', 'month': '8', 'sst': '29.24', 'poc': '31.399904', 'pic': '0.00010199778', 'aot_862': '0.14469999', 'chlor_a': '0.045011766', 'Kd_490': '0.0276', 'distance_km': '6011.70'}]}


In [13]:
# Coordinate query: "lat, lon" text is parsed and matched directly against the dataset
print(rag_chain.invoke({"input": "-24.104176, 151.8125"}))

{'answer': [{'lat': '-24.104176', 'lon': '151.8125', 'year': '2025', 'month': '8', 'sst': '20.22', 'poc': '118.59991', 'pic': '0.0009879977', 'aot_862': '0.0758', 'chlor_a': '0.4571914', 'Kd_490': '0.0674', 'distance_km': '0.00'}, {'lat': '-24.104176', 'lon': '151.85417', 'year': '2025', 'month': '8', 'sst': '20.455', 'poc': '97.59991', 'pic': '0.0006819978', 'aot_862': '0.062299997', 'chlor_a': '0.35746726', 'Kd_490': '0.0576', 'distance_km': '4.24'}, {'lat': '-24.06251', 'lon': '151.8125', 'year': '2025', 'month': '8', 'sst': '20.279999', 'poc': '93.799904', 'pic': '0.0006339978', 'aot_862': '0.0603', 'chlor_a': '0.32980758', 'Kd_490': '0.0552', 'distance_km': '4.61'}]}
