In [16]:
import os
from dotenv import load_dotenv
load_dotenv()

import logging
from typing import TypedDict, List

import pandas as pd
import folium

# LangChain / LLM / Embedding
from langchain.schema import Document
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import BaseMessage, HumanMessage, AIMessage, SystemMessage
from pydantic import BaseModel, Field

# Pinecone / LangChain-Pinecone
import pinecone
from langchain_pinecone import PineconeVectorStore

# LangGraph
from langgraph.graph import StateGraph, END
from langgraph.checkpoint.memory import MemorySaver



In [17]:
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", api_key=os.getenv("GEMINI_API_KEY"))
embedding_function = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=os.getenv("GEMINI_API_KEY"))
chat_for_classify = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    api_key=os.getenv("GEMINI_API_KEY")
    )

In [None]:
import pandas as pd
import os
import gc
import time
import logging
from dotenv import load_dotenv

from pinecone import pinecone, ServerlessSpec
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_pinecone import PineconeVectorStore
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# ---------------------------
# Logging setup
# ---------------------------
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# ---------------------------
# Load environment variables
# ---------------------------
load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX", "ingres-index")

if not PINECONE_API_KEY:
    raise RuntimeError("Please set PINECONE_API_KEY in your environment or .env file")

# ---------------------------
# 1. Load Data from CSVs
# ---------------------------
try:
    df1 = pd.read_csv("data/dwlr_realtime_data.csv")
    df2 = pd.read_csv("data/groundwater_quality_data.csv")
    df3 = pd.read_csv("data/groundwater_trends_2015_2023.csv")
    df4 = pd.read_csv("data/ingres_assessment_units_2023.csv")
    logging.info("Successfully loaded CSV files.")
except FileNotFoundError as e:
    logging.error(f"Error loading CSV file: {e}. Please ensure 'data/' folder exists with CSV files.")
    exit()

# ---------------------------
# 2. Convert DataFrames → Text Documents
# ---------------------------
def df_to_text(df, name):
    docs = []
    for _, row in df.iterrows():
        text = f"{name} Record: " + " | ".join([f"{col}: {row[col]}" for col in df.columns if pd.notna(row[col])])
        docs.append(text)
    return docs

docs1 = df_to_text(df1, "Realtime")
docs2 = df_to_text(df2, "Quality")
docs3 = df_to_text(df3, "Trend")
docs4 = df_to_text(df4, "Assessment")
all_docs = docs1 + docs2 + docs3 + docs4
gc.collect()

# ---------------------------
# 3. Chunk Documents
# ---------------------------
logging.info("Chunking documents...")
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
documents = splitter.create_documents(all_docs)
logging.info(f"Created {len(documents)} chunks.")

# ---------------------------
# 4. Initialize Embeddings
# ---------------------------
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise RuntimeError("Please set GEMINI_API_KEY in your environment or .env file")

embedding_function = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001", 
    google_api_key=GEMINI_API_KEY
)

# Test embedding to get dimension
test_vector = embedding_function.embed_query("hello world")
DIMENSION = len(test_vector)
logging.info(f"Embedding dimension: {DIMENSION}")

# ---------------------------
# 5. Initialize Pinecone client
# ---------------------------
pc = Pinecone(api_key=PINECONE_API_KEY)

# Get existing indexes
existing_indexes = [idx.name for idx in pc.list_indexes()]

if PINECONE_INDEX_NAME not in existing_indexes:
    logging.info(f"Creating Pinecone index '{PINECONE_INDEX_NAME}' with dim={DIMENSION}...")
    pc.create_index(
        name=PINECONE_INDEX_NAME,
        dimension=DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
    # Wait for index to be ready
    while True:
        idx_list = pc.list_indexes()
        if any(idx.name == PINECONE_INDEX_NAME for idx in idx_list):
            break
        logging.info("Waiting for index to be ready...")
        time.sleep(2)
    logging.info(f"Index '{PINECONE_INDEX_NAME}' is ready.")
else:
    logging.info(f"Index '{PINECONE_INDEX_NAME}' already exists. Connecting...")

# ---------------------------
# 6. Create VectorStore & Upsert
# ---------------------------
logging.info("Upserting documents into Pinecone...")
vectorstore = PineconeVectorStore.from_documents(
    documents=documents,
    embedding=embedding_function,
    index_name=PINECONE_INDEX_NAME
)
logging.info("Documents successfully upserted.")

# ---------------------------
# 7. Create Retriever
# ---------------------------
retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 4})
logging.info("Retriever is ready.")

print("\n--- Process Complete ---")
print(f"Pinecone index '{PINECONE_INDEX_NAME}' contains your embeddings.")
print("The 'retriever' object is ready for use in your RAG pipeline.")


ImportError: cannot import name 'Pinecone' from 'pinecone' (unknown location)

In [None]:
import os
import pandas as pd
from dotenv import load_dotenv
from Pinecone import pinecone, ServerlessSpec
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# -----------------------------
# Load API keys
# -----------------------------
load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

# -----------------------------
# Setup embeddings
# -----------------------------
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=GEMINI_API_KEY
)

# -----------------------------
# Setup Pinecone
# -----------------------------
pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "groundwater-index"

# Create index if not exists
if index_name not in [i.name for i in pc.list_indexes()]:
    pc.create_index(
        name=index_name,
        dimension=768,  # Gemini embedding dimension
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

index = pc.Index(index_name)

# -----------------------------
# Load CSVs and prepare text
# -----------------------------
folder = "data"
files = [
    "dwlr_realtime_data.csv",
    "groundwater_quality_data.csv",
    "groundwater_trends_2015_2023.csv",
    "ingres_assessment_units_2023.csv"
]

all_texts = []
for file in files:
    df = pd.read_csv(os.path.join(folder, file))
    for i, row in df.iterrows():
        text = " | ".join([f"{col}: {row[col]}" for col in df.columns])
        all_texts.append((f"{file}_{i}", text, {"source": file}))

# -----------------------------
# Generate embeddings & upsert
# -----------------------------
vectors = []
for _id, text, meta in all_texts:
    vector = embeddings.embed_query(text)
    vectors.append((_id, vector, meta))

# Batch insert (Pinecone prefers batches)
index.upsert(vectors[:100])  # you can loop in chunks if too large

print(f"Inserted {len(vectors)} vectors into Pinecone!")


ModuleNotFoundError: No module named 'Pinecone'

In [None]:
class AgentState(TypedDict):
    messages: List[BaseMessage]
    documents: List[Document]
    on_topic: str
    rephrased_question: str
    proceed_to_generate: bool
    rephrase_count: int
    question: HumanMessage
    needs_gis: bool
    gis_data: dict
    map_html: str

In [None]:
class GradeQuestion(BaseModel):
    score: str = Field(description='Is user asking INGRES data? Yes/No')

class GISRequest(BaseModel):
    needs_gis: str = Field(description='Need GIS map? Yes/No')
    location_data: str = Field(description='Extracted location info')

class GradeDocument(BaseModel):
    score: str = Field(description='Relevant? Yes/No')

In [None]:
def question_rewriter(state: AgentState):
    state.update({"documents":[], "on_topic":"", "rephrased_question":"", "proceed_to_generate":False,
                  "rephrase_count":0, "needs_gis":False, "gis_data":{}, "map_html":""})
    state.setdefault("messages", [])
    if state["question"] not in state["messages"]:
        state["messages"].append(state["question"])
    if len(state["messages"]) > 1:
        conv = state["messages"][:-1]
        q = state["question"].content
        msgs = [SystemMessage(content="Rephrase for INGRES retrieval.")] + conv + [HumanMessage(content=q)]
        try:
            new_q = ChatOpenAI(model='gpt-4o-mini').invoke(ChatPromptTemplate.from_messages(msgs).format()).content.strip()
        except Exception:
            new_q = q
        state["rephrased_question"] = new_q
    else:
        state["rephrased_question"] = state["question"].content
    return state

def question_classifier(state: AgentState):
    msgs = [SystemMessage(content="Is this about INGRES groundwater data? Yes/No"), HumanMessage(content=state["rephrased_question"])]
    try:
        result = ChatOpenAI(model='gpt-4o-mini').with_structured_output(GradeQuestion).invoke(ChatPromptTemplate.from_messages(msgs).format())
        state["on_topic"] = result.score.strip()
    except Exception:
        q = state.get('rephrased_question','').lower()
        state["on_topic"] = 'yes' if any(k in q for k in ['groundwater', 'ingres', 'ground water', 'water table', 'gw']) else 'no'
    return state

def on_topic_router(state: AgentState):
    return 'retrieve' if state['on_topic'].lower() == 'yes' else 'off_topic_response'

def retrieve(state: AgentState):
    try:
        hits = retriever.get_relevant_documents(state['rephrased_question'])
        state['documents'] = hits
    except Exception as e:
        logger.exception('Retriever failed: %s', e)
        state['documents'] = []
    return state

def retrieval_grader(state: AgentState):
    relevant = []
    for doc in state.get('documents',[]):
        try:
            msgs = [SystemMessage(content='Relevant to INGRES query?'), HumanMessage(content=f"{state['rephrased_question']}\n\n{doc.page_content}")]
            r = ChatOpenAI(model='gpt-4o-mini').with_structured_output(GradeDocument).invoke(ChatPromptTemplate.from_messages(msgs).format())
            if r.score.strip().lower() == 'yes':
                relevant.append(doc)
        except Exception:
            if any(tok in doc.page_content.lower() for tok in ['ground','ph','tds','district','year']):
                relevant.append(doc)
    state['documents'] = relevant
    state['proceed_to_generate'] = bool(relevant)
    return state

def proceed_router(state: AgentState):
    if state.get('proceed_to_generate'):
        return 'generate_answer'
    return 'cannot_answer' if state.get('rephrase_count',0) >= 2 else 'refine_question'

def refine_question(state: AgentState):
    if state.get('rephrase_count',0) >= 2:
        return state
    msgs = [SystemMessage(content='Refine INGRES query slightly'), HumanMessage(content=state['rephrased_question'])]
    try:
        new_q = ChatOpenAI(model='gpt-4o-mini').invoke(ChatPromptTemplate.from_messages(msgs).format()).content.strip()
    except Exception:
        new_q = state['rephrased_question'] + ' (please be more specific)'
    state['rephrased_question'] = new_q
    state['rephrase_count'] = state.get('rephrase_count',0) + 1
    return state

def generate_answer(state: AgentState):
    context_text = '\n\n---\n\n'.join([d.page_content for d in state.get('documents',[])])
    history = '\n'.join([m.content for m in state.get('messages',[])])
    try:
        res = ChatOpenAI(model='gpt-4o-mini').invoke({'history': history, 'context': context_text, 'question': state['rephrased_question']})
        content = res.content.strip()
    except Exception as e:
        logger.exception('RAG chain failed: %s', e)
        content = 'I found the following relevant excerpts:\n' + '\n---\n'.join([d.page_content[:400] for d in state.get('documents',[])])
    state.setdefault('messages',[]).append(AIMessage(content=content))
    return state

def gis_classifier(state: AgentState):
    last = state['messages'][-1].content if state.get('messages') else ''
    msgs = [SystemMessage(content='Need GIS map? Yes/No'), HumanMessage(content=f"{state['rephrased_question']}\n\n{last}")]
    try:
        r = ChatOpenAI(model='gpt-4o-mini').with_structured_output(GISRequest).invoke(ChatPromptTemplate.from_messages(msgs).format())
        state['needs_gis'] = r.needs_gis.strip().lower() == 'yes'
        state['gis_data'] = {'location': r.location_data.strip()} if state['needs_gis'] else {}
    except Exception:
        q = (state.get('rephrased_question','') + ' ' + last).lower()
        state['needs_gis'] = any(w in q for w in ['map','location','district','lat','lon','show me','plot'])
        state['gis_data'] = {'location': state.get('rephrased_question','')} if state['needs_gis'] else {}
    return state

def generate_gis_map(state: AgentState):
    if not state.get('needs_gis'):
        return state
    pts = [
        {"lat":15.3, "lon":75.7, "val":200},
        {"lat":19.0, "lon":77.0, "val":250}
    ]
    m = folium.Map(location=[22,79], zoom_start=5)
    for p in pts:
        folium.CircleMarker([p['lat'], p['lon']], radius=5 + p['val']*0.01, popup=str(p['val'])).add_to(m)
    state['map_html'] = m._repr_html_()
    last = state['messages'][-1]
    state['messages'][-1] = AIMessage(content=last.content + '\n\n[GIS Map Attached]')
    return state

def cannot_answer(state: AgentState):
    state.setdefault('messages',[]).append(AIMessage(content="I'm sorry, I can't find that."))
    return state

def off_topic_response(state: AgentState):
    state.setdefault('messages',[]).append(AIMessage(content="I'm sorry, I cannot answer this."))
    return state

# ---------------------------
# Build LangGraph workflow
# ---------------------------
cp = MemorySaver()
wf = StateGraph(AgentState)

nodes = [
    'question_rewriter','question_classifier','off_topic_response',
    'retrieve','retrieval_grader','generate_answer',
    'refine_question','cannot_answer',
    'gis_classifier','generate_gis_map'
]
for n in nodes:
    wf.add_node(n, globals()[n])

wf.set_entry_point('question_rewriter')
wf.add_edge('question_rewriter','question_classifier')
wf.add_conditional_edges('question_classifier', on_topic_router, {'retrieve':'retrieve','off_topic_response':'off_topic_response'})
wf.add_edge('retrieve','retrieval_grader')
wf.add_conditional_edges('retrieval_grader', proceed_router, {'generate_answer':'generate_answer','refine_question':'refine_question','cannot_answer':'cannot_answer'})
wf.add_edge('refine_question','retrieve')
wf.add_edge('generate_answer','gis_classifier')
wf.add_conditional_edges('gis_classifier', lambda s: 'generate_gis_map' if s.get('needs_gis') else END, {'generate_gis_map':'generate_gis_map', END:END})
wf.add_edge('generate_gis_map', END)
wf.add_edge('cannot_answer', END)
wf.add_edge('off_topic_response', END)

graph = wf.compile(checkpointer=cp)


In [None]:
# To use this code, run your full script first to compile the graph.
# Then you can run the following lines in your environment.

# Create an interactive loop to chat with the graph
# For a single session, you can use a fixed thread_id.
# For a real application, you would generate a unique ID per user.
thread_id = "my-ingres-chatbot-session-1"

while True:
    user_query = input("You: ")
    if user_query.lower() in ["exit", "quit", "q"]:
        print("Goodbye!")
        break

    try:
        # Pass the 'configurable' dictionary with the thread_id
        final_state = graph.invoke(
            {"question": HumanMessage(content=user_query)},
            config={"configurable": {"thread_id": thread_id}}
        )
        
        last_message = final_state['messages'][-1]
        print(f"Chatbot: {last_message.content}")
        
        if final_state.get('map_html'):
            print("\n[GIS map generated]\n")
            with open("gis_map.html", "w") as f:
                f.write(final_state['map_html'])
            print("Map saved as gis_map.html. Open it in your browser to view.")

    except Exception as e:
        print(f"An error occurred: {e}")
        break