In [None]:
from langgraph.graph import StateGraph, END
from typing import TypedDict, List
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.documents import Document 
import pandas as pd
import duckdb
import re
from sqlglot import parse_one
import json
import os

# ===== IMPORTANT: Use new Chroma package =====
# The Chroma integration is imported from the separate `langchain-chroma`
# package (install with: pip install langchain-chroma).
try:
    from langchain_chroma import Chroma
    print("‚úÖ Using langchain-chroma (recommended)")
except ImportError as import_err:
    # Nothing is installed automatically here: explain the manual steps and
    # stop, because the rest of the script cannot run without Chroma.
    print("‚ö†Ô∏è langchain-chroma not found.")
    print("Please run: pip install langchain-chroma")
    print("Then restart the kernel and run again.")
    # Chain explicitly so the original import failure stays visible as the cause.
    raise ImportError("Please install langchain-chroma: pip install langchain-chroma") from import_err

# ===== CONFIGURATION =====
# Locations of the source CSV, the DuckDB database file and the Chroma
# persistence folder, plus the Chroma collection name.
csv_path = r"D:\RAG Task\Client_Shipment_Orders.csv"
db_path = r"D:\RAG Task\orders.duckdb"
persist_directory = r"D:\RAG Task\chroma_db"  # dedicated folder, not shared with DuckDB
collection_name = "shipment_orders"

# Whitelist of column names; the SQL validator rejects any double-quoted
# identifier that is not listed here.
VALID_COLUMNS = [
    "Order ID",
    "Client Name",
    "Email",
    "Contact Number",
    "Origin",
    "Destination",
    "Product Name",
    "Category",
    "Material",
    "Color",
    "Quantity",
    "Unit Price (‚Çπ)",
    "Total Price (‚Çπ)",
    "Order Date",
    "Delivery Date",
    "Status",
]

# ===== STEP 1: DATABASE SETUP =====
print("üìä Setting up DuckDB database...")
# Build the `orders` table straight from the CSV. Because of IF NOT EXISTS the
# statement does nothing when the table is already present in the database file.
_create_orders_sql = f"""
        CREATE TABLE IF NOT EXISTS orders AS
        SELECT * FROM read_csv_auto('{csv_path}');
    """
with duckdb.connect(db_path) as con:
    con.execute(_create_orders_sql)
print("‚úÖ Database ready!")

# ===== STEP 2: VECTOR STORE SETUP (with proper caching) =====
def setup_vector_store():
    """Return ``(vector_store, embeddings)``, reusing the persisted store when possible.

    When ``chroma.sqlite3`` exists under ``persist_directory`` and the
    collection reports at least one document, that store is returned unchanged.
    In every other case (no file, empty collection, or an error while loading)
    each CSV row is converted into a labelled text chunk and a new persisted
    Chroma collection is built from those chunks.
    """
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

    # The folder must exist before we look for the Chroma database file in it.
    os.makedirs(persist_directory, exist_ok=True)
    sqlite_file = os.path.join(persist_directory, "chroma.sqlite3")

    if not os.path.exists(sqlite_file):
        print("üìù No existing vector store found, creating new one...")
    else:
        try:
            print("üîç Loading existing vector store...")
            vector_store = Chroma(
                persist_directory=persist_directory,
                embedding_function=embeddings,
                collection_name=collection_name
            )

            # A store without documents is treated the same as a missing one.
            collection_count = vector_store._collection.count()
            if collection_count > 0:
                print(f"‚úÖ Loaded existing vector store with {collection_count} documents")
                return vector_store, embeddings
            print("‚ö†Ô∏è Vector store is empty, will create new one")
        except Exception as e:
            # Any load failure falls through to a rebuild below.
            print(f"‚ö†Ô∏è Error loading vector store: {e}")
            print("Creating new vector store...")

    # ----- Build the store from the CSV -----
    print("üìÑ Loading CSV and generating chunks...")
    df = pd.read_csv(csv_path)
    total_rows = len(df)
    print(f"   Loaded {total_rows} rows")

    # One chunk per row: a "Row ID" line followed by one "column: value" line each.
    chunks = []
    for index, row in df.iterrows():
        fields = [f"Row ID: {index}"]
        fields.extend(f"{col}: {row[col]}" for col in df.columns)
        chunks.append("\n".join(fields).strip())

        # Overwrite a single console line with progress every 100 rows.
        if (index + 1) % 100 == 0:
            print(f"   Processed {index + 1}/{total_rows} rows...", end='\r')

    print(f"\n‚úÖ Generated {len(chunks)} labelled chunks")

    documents = [Document(page_content=chunk) for chunk in chunks]

    print("üîÑ Creating embeddings and vector store (this may take a few minutes)...")
    vector_store = Chroma.from_documents(
        documents=documents,
        embedding=embeddings,
        persist_directory=persist_directory,
        collection_name=collection_name,
    )
    print("‚úÖ Vector store created and persisted!")

    return vector_store, embeddings

# Build (or load) the vector store once for the whole session.
vector_store, embeddings = setup_vector_store()

# ===== STEP 3: SETUP RETRIEVERS =====
print("üîß Setting up retrievers...")

print("   Loading documents for BM25...")
# The keyword index is built from at most 1000 rows to keep memory use bounded.
sample_size = min(1000, vector_store._collection.count())
with duckdb.connect(db_path) as con:
    df_sample = con.execute(f"SELECT * FROM orders LIMIT {sample_size}").fetchdf()

# One space-joined "column: value" string per sampled row.
# NOTE(review): this layout differs from the vector-store chunks (newline-joined
# with a leading "Row ID" line) — confirm how EnsembleRetriever de-duplicates
# hits that describe the same row.
chunks_for_bm25 = [
    " ".join([f"{col}: {row[col]}" for col in df_sample.columns])
    for _, row in df_sample.iterrows()
]
documents_for_bm25 = [Document(page_content=text) for text in chunks_for_bm25]

# Dense retriever returns 10 hits, keyword retriever 5; results are blended 60/40.
vector_retriever = vector_store.as_retriever(search_kwargs={"k": 10})
keyword_retriever = BM25Retriever.from_documents(documents_for_bm25)
keyword_retriever.k = 5

hybrid_retriever = EnsembleRetriever(
    retrievers=[vector_retriever, keyword_retriever],
    weights=[0.6, 0.4]
)

# The intermediate objects are no longer needed once the retrievers exist.
del df_sample, chunks_for_bm25, documents_for_bm25

print("‚úÖ Retrievers ready!")

# ===== LLM SETUP =====
# Single chat model shared by every node below; temperature=0 is passed to the
# model constructor.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# ===== SQL SYSTEM PROMPT =====
# Prepended to the user's question by duckdb_node when asking the model for SQL.
# The text is sent to the model verbatim, so edits here change runtime behaviour.
sql_system_prompt = """
You are a SQL expert helping to query a DuckDB table named `orders`.

TABLE INFORMATION:
Table: orders  
Columns: Order ID, Client Name, Email, Contact Number, Origin, Destination,
Product Name, Category, Material, Color, Quantity, Unit Price (‚Çπ), Total Price (‚Çπ),
Order Date, Delivery Date, Status

INSTRUCTIONS:
1. Generate SQL queries ONLY for structured/numeric filters
2. Use exact column names with quotes: "Client Name", "Total Price (‚Çπ)"
3. Return ONLY the SQL query - no markdown, comments, or explanations
4. SAFETY: Only SELECT queries allowed (no INSERT, UPDATE, DELETE, DROP, etc.)
5. Use LOWER() for case-insensitive text matching
6. For "highest" queries, use subqueries to avoid grouping errors

Example for "highest purchase":
SELECT "Client Name", "Total Price (‚Çπ)"
FROM orders
WHERE "Total Price (‚Çπ)" = (SELECT MAX("Total Price (‚Çπ)") FROM orders);
"""

# ===== STATE GRAPH =====
class GraphState(TypedDict):
    """State schema handed to ``StateGraph``; each node receives and returns it."""
    # The user's question; the only key supplied when the graph is invoked.
    question: str
    # Label written by intent_node and used to pick the next node.
    intent: str
    # Declared but not written by any node visible in this file.
    context: List[str]
    # Final reply text, written by whichever node handles the question.
    answer: str

def intent_node(state: dict):
    """Classify the question and store a routing label in ``state["intent"]``.

    The model is asked for one of five labels (numeric, semantic, hybrid,
    greet, ignore). Its reply is normalised before being stored: the first of
    those labels that appears as a whole word in the reply is used, and a reply
    containing none of them falls back to ``"ignore"``. Without this, replies
    such as ``"numeric."`` or a quoted label would not match any graph route.

    Args:
        state: Graph state; must contain ``"question"``.

    Returns:
        The same ``state`` object with ``"intent"`` set to one of the five labels.
    """
    question = state["question"]
    intent_prompt = f"""
    Classify this query about an Orders dataset as ONE word:
    - numeric: count, sum, statistics, filters
    - semantic: descriptive, interpretive questions
    - hybrid: both numeric and semantic
    - greet: greetings
    - ignore: unrelated
    
    Question: {question}
    Answer with ONE word only:
    """
    raw_label = llm.invoke(intent_prompt).content.strip().lower()

    # Tolerate punctuation, quotes or extra words around the label.
    found = re.search(r"\b(numeric|semantic|hybrid|greet|ignore)\b", raw_label)
    intent = found.group(1) if found else "ignore"

    print(f"üéØ Intent: {intent}")
    state["intent"] = intent
    return state

def greet_node(state: dict):
    """Store the fixed greeting reply under ``"answer"`` and return the same state."""
    state.update(answer="Hello üëã! How can I assist you with the order data today?")
    return state

def ignore_node(state: dict):
    """Store the fixed off-topic reply under ``"answer"`` and return the same state."""
    state.update(answer="I'm designed to answer questions about orders. Please ask something related.")
    return state

def retriever_node(state: dict):
    """Answer the question from retrieved rows and store the reply in ``state["answer"]``.

    The hybrid retriever is queried with the question, the text of the first
    five hits becomes the context, and the model is asked for a brief answer.
    Any exception is reported in the answer text and printed, not raised.
    """
    question = state["question"]
    try:
        hits = hybrid_retriever.invoke(question)
        # Only the first five passages are passed on, to keep the prompt short.
        passages = [doc.page_content for doc in hits[:5]]
        context = "\n".join(passages)
        reply = llm.invoke(f"Context:\n{context}\n\nQuestion: {question}\nAnswer briefly:")
        state["answer"] = reply.content.strip()
    except Exception as e:
        state["answer"] = f"Error in retrieval: {e}"
        print(f"‚ùå Retriever error: {e}")
    return state

def sql_validator_node(state: dict):
    """Check the generated SQL and record the outcome in ``state["validation_error"]``.

    Three checks run in order; the first failure sets an error message and
    returns immediately:

    1. the query must start with ``select`` (case-insensitive);
    2. none of the forbidden keywords may appear as a whole word;
    3. every double-quoted identifier must be listed in ``VALID_COLUMNS``.

    Keywords are matched on word boundaries so that harmless text such as
    ``'updated'`` or ``created_at`` is not mistaken for UPDATE / CREATE.

    Args:
        state: Dict holding the SQL text under ``"sql_query"`` (a missing or
            ``None`` value is treated as an empty query).

    Returns:
        The same ``state`` with ``"validation_error"`` set to a message, or to
        ``None`` when all checks pass.
    """
    sql_query = (state.get("sql_query") or "").strip()
    lowered = sql_query.lower()

    if not lowered.startswith("select"):
        state["validation_error"] = "‚ùå Only SELECT queries allowed."
        return state

    forbidden = ["insert", "update", "delete", "drop", "alter", "truncate", "create"]
    # Whole-word match: a plain substring test also fires inside longer words.
    forbidden_pattern = r"\b(?:" + "|".join(forbidden) + r")\b"
    if re.search(forbidden_pattern, lowered):
        state["validation_error"] = "‚ùå Unsafe SQL operation detected."
        return state

    # Anything in double quotes is treated as a column reference.
    for match in re.findall(r'"(.*?)"', sql_query):
        if match not in VALID_COLUMNS:
            state["validation_error"] = f"‚ùå Invalid column: '{match}'."
            return state

    state["validation_error"] = None
    return state

def duckdb_node(state: dict):
    """Answer a structured question by generating, validating and running SQL.

    Steps: ask the model for SQL, strip markdown fences and backticks, check
    the syntax with ``parse_one``, run ``sql_validator_node``, execute the query
    against the DuckDB file, then ask the model to explain the result table.
    The connection is opened read-only so that generated SQL cannot modify the
    database even if it slips past the validator.

    Args:
        state: Graph state; must contain ``"question"``.

    Returns:
        The same ``state`` with ``"sql_query"`` set to the generated SQL and
        ``"answer"`` set to the explanation, a "no records" message, or an
        error description. Exceptions are reported in ``"answer"``, not raised.
    """
    query = state["question"]
    try:
        sql_prompt = f"{sql_system_prompt}\nUser question: {query}\nSQL:"
        sql_query = llm.invoke(sql_prompt).content.strip()
        # Remove markdown code fences / backticks the model may add.
        sql_query = sql_query.replace("```sql", "").replace("```", "").replace("`", "").strip()

        print(f"üß† Generated SQL: {sql_query}")
        state["sql_query"] = sql_query

        # Validate syntax
        try:
            parse_one(sql_query)
        except Exception as e:
            state["answer"] = f"‚ö†Ô∏è SQL syntax error: {e}"
            return state

        # Validate safety
        validation_state = sql_validator_node(state)
        if validation_state.get("validation_error"):
            state["answer"] = validation_state["validation_error"]
            return state

        # Execute on a read-only connection as a second line of defence
        # behind the keyword checks.
        with duckdb.connect(db_path, read_only=True) as con:
            result_df = con.execute(sql_query).fetchdf()

        if result_df.empty:
            state["answer"] = "No matching records found."
            return state

        result_text = result_df.to_string(index=False)
        summary_prompt = f"User asked: {query}\n\nSQL result:\n{result_text}\n\nProvide a clear, natural explanation:"
        answer = llm.invoke(summary_prompt).content.strip()
        state["answer"] = answer

    except Exception as e:
        state["answer"] = f"Error executing query: {str(e)}"
        print(f"‚ùå DuckDB error: {e}")

    return state

def hybrid_node(state: dict):
    """Answer a mixed question by combining a SQL answer and a retrieval answer.

    The model first splits the question into a numeric part, a semantic part
    and a ``dependent`` flag (returned as JSON). The numeric part is answered
    by ``duckdb_node``; the semantic part is answered by ``retriever_node``
    only when the parts are independent. A final model call merges both
    insights into one reply.

    If the split reply is not a JSON object, the whole question is used for
    both parts instead of aborting, and null/non-string parts are tolerated.

    Args:
        state: Graph state; must contain ``"question"``.

    Returns:
        The same ``state`` with ``"answer"`` set. Exceptions are reported in
        ``"answer"`` and printed, not raised.
    """
    try:
        question = state["question"]
        print(f"üîÄ Processing hybrid query: {question}")

        split_prompt = f"""Split this query into numeric and semantic parts.
        Return ONLY valid JSON:
        {{"numeric": "numeric question part", "semantic": "semantic question part", "dependent": true/false}}
        
        Question: {question}
        """

        split_result = llm.invoke(split_prompt).content.strip()
        split_result = split_result.replace("```json", "").replace("```", "").strip()

        try:
            parsed = json.loads(split_result)
        except json.JSONDecodeError:
            parsed = None
        if not isinstance(parsed, dict):
            # Unusable split: let both paths see the full question rather than fail.
            print("   Could not parse split result; using the full question for both parts.")
            parsed = {"numeric": question, "semantic": question, "dependent": False}

        # `or ""` guards against JSON null; str() guards against non-string values.
        numeric_part = str(parsed.get("numeric") or "").strip()
        semantic_part = str(parsed.get("semantic") or "").strip()
        dependent = parsed.get("dependent", False)

        print(f"   Numeric: {numeric_part}")
        print(f"   Semantic: {semantic_part}")
        print(f"   Dependent: {dependent}")

        # Get numeric answer
        numeric_answer = ""
        if numeric_part:
            temp = {"question": numeric_part, "intent": "", "context": [], "answer": ""}
            numeric_answer = duckdb_node(temp).get("answer", "")

        # Get semantic answer (if independent)
        # NOTE(review): when `dependent` is truthy the semantic part is never
        # answered separately — confirm that this is intended.
        semantic_answer = ""
        if semantic_part and not dependent:
            temp = {"question": semantic_part, "intent": "", "context": [], "answer": ""}
            semantic_answer = retriever_node(temp).get("answer", "")

        # Combine results
        combine_prompt = f"""Original question: {question}
        
        Numeric insight: {numeric_answer or 'None'}
        Semantic insight: {semantic_answer or 'None'}
        
        Provide a unified, natural answer that addresses the original question:
        """

        final_answer = llm.invoke(combine_prompt).content.strip()
        state["answer"] = final_answer

    except Exception as e:
        state["answer"] = f"Error in hybrid processing: {str(e)}"
        print(f"‚ùå Hybrid error: {e}")

    return state

# ===== BUILD GRAPH =====
print("üèóÔ∏è Building state graph...")
graph = StateGraph(GraphState)
graph.add_node("intent", intent_node)
graph.add_node("greet", greet_node)
graph.add_node("ignore", ignore_node)
graph.add_node("duckdb", duckdb_node)
graph.add_node("retriever", retriever_node)
graph.add_node("hybrid", hybrid_node)

# Intent label -> node that handles it.
_INTENT_ROUTES = {
    "greet": "greet",
    "ignore": "ignore",
    "numeric": "duckdb",
    "semantic": "retriever",
    "hybrid": "hybrid",
}


def _route_by_intent(state):
    """Return the routing key for ``state["intent"]``.

    The classifier's output is free text, so a label outside the route table is
    sent to "ignore" instead of being returned and failing the route lookup.
    """
    intent = state.get("intent")
    return intent if intent in _INTENT_ROUTES else "ignore"


graph.set_entry_point("intent")
graph.add_conditional_edges("intent", _route_by_intent, _INTENT_ROUTES)

# Every handler node ends the run.
graph.add_edge("greet", END)
graph.add_edge("ignore", END)
graph.add_edge("duckdb", END)
graph.add_edge("retriever", END)
graph.add_edge("hybrid", END)

app = graph.compile()
print("‚úÖ Graph compiled successfully!")

# ===== MAIN LOOP =====
# Interactive console session: read a question, run it through the graph and
# print the answer until the user types exit/quit or interrupts the input.
if __name__ == "__main__":
    print("\n" + "="*60)
    print("üöÄ Smart Query Assistant Ready!")
    print("="*60)
    print("Type 'exit' or 'quit' to end the session.\n")

    while True:
        try:
            user_input = input("You: ").strip()
            if user_input.lower() in ["exit", "quit"]:
                print("Assistant: Goodbye üëã")
                break

            # Blank lines are skipped without calling the graph.
            if not user_input:
                continue

            result = app.invoke({"question": user_input})
            print(f"\nAssistant: {result['answer']}\n")

        except (KeyboardInterrupt, EOFError):
            # EOFError (closed stdin / Ctrl-D) must end the session too;
            # otherwise the generic handler below would catch it on every
            # iteration and the loop would never terminate.
            print("\n\nAssistant: Goodbye üëã")
            break
        except Exception as e:
            # Keep the session alive after a failed question.
            print(f"\n‚ùå Error: {e}\n")

In [None]:
from langgraph.graph import StateGraph, END
from typing import TypedDict, List
from langchain_community.vectorstores import FAISS
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.documents import Document 
import pandas as pd
import duckdb
import re
from openai import OpenAI
import os
from sqlglot import parse_one
import json

# ‚úÖ Step 1: Define paths
csv_path = r"D:\RAG Task\Client_Shipment_Orders.csv"
db_path = r"D:\RAG Task\orders.duckdb"
faiss_index_path = r"D:\RAG Task\faiss_index"

# Step 2: build the `orders` table from the CSV. IF NOT EXISTS means the
# statement has no effect once the table is present; to pick up a changed CSV
# the existing rows would have to be deleted and re-inserted from the file.
_create_orders_sql = f"""
        CREATE TABLE IF NOT EXISTS orders AS
        SELECT * FROM read_csv_auto('{csv_path}');
    """
with duckdb.connect(db_path) as con:
    con.execute(_create_orders_sql)

# Step 3: pull the whole table into a DataFrame for local use.
with duckdb.connect(db_path) as con:
    df = con.execute("SELECT * FROM orders").fetchdf()

# Step 4: normalise text columns (cast to str, trim, title-case).
# NOTE(review): `df` is rebound further down by
# `df, labelled_chunks = generate_labelled_chunks(csv_path)`, so this cleaned
# frame does not appear to be used afterwards — confirm whether it is still needed.
text_columns = df.select_dtypes(include=["object"]).columns
for col in text_columns:
    as_text = df[col].astype(str)
    df[col] = as_text.str.strip().str.title()

# System prompt prepended to the user's question by duckdb_node when asking the
# model for SQL. The text is sent verbatim, so edits here change runtime behaviour.
# NOTE(review): the ```sql fence opened in rule 8 is never closed inside the
# prompt, and rule 5 asks for output without markdown — confirm this is intended.
sql_system_prompt = """
You are a SQL expert helping to query a DuckDB table named `orders`.

----------------------------------
TABLE INFORMATION
----------------------------------
Table name: orders  
Columns and their meanings:
- Order ID: Unique identifier for each order (text)
- Client Name: Name of the customer who placed the order (text)
- Email: Email address of the client (text)
- Contact Number: Client's contact phone number (text)
- Origin: Source location of the shipment (text)
- Destination: Delivery location of the shipment (text)
- Product Name: Name of the purchased product (text)
- Category: Product category (e.g., Furniture, Decor, Appliances)
- Material: Material type of the product (e.g., Wood, Glass, Metal)
- Color: Color of the product (text)
- Quantity: Number of units ordered (integer)
- Unit Price (‚Çπ): Price per unit in INR (numeric)
- Total Price (‚Çπ): Total order price in INR (numeric)
- Order Date: Date when the order was placed (date)
- Delivery Date: Date when the order was delivered (date)
- Status: Order status (e.g., Delivered, Pending, Cancelled)

----------------------------------
SAMPLE DATA
----------------------------------
ORD0001 | Kara Mata | chelsea75@yahoo.com | 038.830.3017x8206 | Port Mariamouth | Cohenmouth | Wall Art | Decor | Glass | Grey | 15 | 29878 | 448170 | 2025-05-13 | 2025-06-02 | Cancelled  
ORD0002 | Jesse Williams | ccasey@barrett.info | (426)505-2355 | Tamaraview | Lake Rickyport | Bed | Furniture | Glass | Brown | 30 | 1507 | 45210 | 2025-10-04 | 2025-11-03 | Cancelled  

----------------------------------
INSTRUCTIONS
----------------------------------
1. Generate SQL queries **only** for structured or numeric filters.
   Examples:
   - Total sales, sum, count, average, quantity, or price-based questions  
   - Filtering by columns such as Status, Category, Material, or Color  
   - Date-based filters (e.g., orders after 2025-05-01)

2. **Do NOT** generate queries based on subjective or descriptive logic
   such as reasons for cancellation, customer feedback, or preferences.
   These are handled separately by a semantic retriever system.

3. Use the correct table name `orders` and column names **exactly as shown**.
   Preserve proper case and special characters (e.g., `"Total Price (‚Çπ)"`).

4. Never hallucinate columns, tables, or calculations that do not exist.

5. Return **only** the SQL query ‚Äî no markdown, comments, or explanations.

6. **SAFETY RULES ‚Äî STRICTLY ENFORCED**
   - Never modify or delete data.
   - Do not use or suggest `UPDATE`, `DELETE`, `INSERT`, `DROP`, `TRUNCATE`, or `ALTER`.
   - Do not create or alter schemas, indexes, or tables.
   - Only allow read-only operations:  
     `SELECT`, `WHERE`, `GROUP BY`, `ORDER BY`, `LIMIT`, and aggregate functions (`COUNT`, `SUM`, `AVG`, `MIN`, `MAX`).

7. **Case Handling:**  
   When matching text values (like product or status), use `LOWER()` to make comparisons case-insensitive.  
   Example:  
   `WHERE LOWER("Product Name") = LOWER('Toilet Bowl')`

8. **Special Handling ‚Äî Highest or Maximum Queries:**  
   If the user asks questions like  
   *"Who made the highest purchase?"*,  
   *"Which client has the largest total?"*, or  
   *"Top buyer / maximum purchase amount"*,  
   use this pattern to avoid grouping errors:
   ```sql
   SELECT "Client Name", "Total Price (‚Çπ)"
   FROM orders
   WHERE "Total Price (‚Çπ)" = (
       SELECT MAX("Total Price (‚Çπ)") FROM orders
   );
"""

#  ROW-WISE LABELLED CHUNK GENERATION
def generate_labelled_chunks(csv_path):
    """Read the CSV and describe every row as a labelled block of text.

    Each chunk starts with a ``Row ID: <index>`` line followed by one
    ``<column>: <value>`` line per column; surrounding whitespace is stripped.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        A ``(df, chunks)`` tuple: the loaded DataFrame and the list of chunk
        strings, one per row in row order.
    """
    frame = pd.read_csv(csv_path)

    def describe(row_id, record):
        # Header line first, then every column in the frame's column order.
        lines = [f"Row ID: {row_id}"]
        lines.extend(f"{col}: {record[col]}" for col in frame.columns)
        return "\n".join(lines).strip()

    chunks = [describe(row_id, record) for row_id, record in frame.iterrows()]
    return frame, chunks


# One labelled text chunk (and one Document) per CSV row.
df, labelled_chunks = generate_labelled_chunks(csv_path)
documents = [Document(page_content=text) for text in labelled_chunks]

print(f"‚úÖ Generated {len(labelled_chunks)} labelled chunks for embeddings.")

# Embedding model used both to build and to query the FAISS index.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Reuse the saved FAISS index when the folder exists; otherwise build and save one.
if not os.path.exists(faiss_index_path):
    print("‚úÖ Creating new FAISS index...")
    vector_store = FAISS.from_documents(documents, embeddings)
    # The target folder has to exist before saving.
    os.makedirs(faiss_index_path, exist_ok=True)
    vector_store.save_local(faiss_index_path)
    print("‚úÖ FAISS index created and saved!")
else:
    print("‚úÖ Loading existing FAISS index...")
    try:
        # NOTE(review): allow_dangerous_deserialization=True is an explicit
        # opt-in — confirm the index folder is only ever written by this script.
        vector_store = FAISS.load_local(
            faiss_index_path,
            embeddings,
            allow_dangerous_deserialization=True
        )
        print("‚úÖ FAISS index loaded successfully!")
    except Exception as e:
        # A failed load is recovered by rebuilding the index from scratch.
        print(f"‚ö†Ô∏è Error loading FAISS index: {e}")
        print("‚úÖ Creating new FAISS index...")
        vector_store = FAISS.from_documents(documents, embeddings)
        vector_store.save_local(faiss_index_path)
        print("‚úÖ New FAISS index created and saved!")

# Dense retriever returns 10 hits, keyword retriever 5; results are blended 60/40.
vector_retriever = vector_store.as_retriever(search_kwargs={"k": 10})
keyword_retriever = BM25Retriever.from_documents(documents)
keyword_retriever.k = 5

hybrid_retriever = EnsembleRetriever(
    retrievers=[vector_retriever, keyword_retriever],
    weights=[0.6, 0.4]
)

class GraphState(TypedDict):
    """Dictionary-shaped state that the node functions read from and write to."""
    # The user's question; every node reads it.
    question: str
    # Label written by intent_node.
    intent: str
    # Declared but not written by any node visible in this file.
    context: List[str]
    # Final reply text, written by the node that handles the question.
    answer: str


def intent_node(state: dict):
    """Classify the question and store a label in ``state["intent"]``.

    The model is asked for one of five labels (numeric, semantic, hybrid,
    greet, ignore). Its reply is normalised before being stored: the first of
    those labels that appears as a whole word in the reply is used, and a reply
    containing none of them falls back to ``"ignore"``. This keeps quoted or
    punctuated replies (the prompt itself shows the labels in quotes) from
    producing a label that no handler recognises.

    Args:
        state: Graph state; must contain ``"question"``.

    Returns:
        The same ``state`` object with ``"intent"`` set to one of the five labels.
    """
    question = state["question"]

    intent_prompt = f"""
    You are an intent classifier for user questions over an **Orders dataset**.
    The table contains the following columns:
    Order ID, Client Name, Email, Contact Number, Origin, Destination,
    Product Name, Category, Material, Color, Quantity, Unit Price (‚Çπ),
    Total Price (‚Çπ), Order Date, Delivery Date, Status.

    Classify the intent of the question as one of the following:

    1. "numeric" ‚Üí if the query involves structured, measurable, or count-based data.
       Examples:
       - "How many orders are pending?"
       - "What is the total sales amount?"
       - "Show the average unit price."
       - "Count the number of clients."
       - "List orders where quantity > 10.
       - "Who made the highest purchase?"
       - "names of customers who ordered bed "

    2. "semantic" ‚Üí if the query involves descriptive or text-based attributes
       such true semantic questions, i.e., ones that are descriptive, interpretive, or text-based, not solvable with SQL filters or numbers.
T       These rely on understanding meaning, patterns, or unstructured context rather than column values.
       Examples:
      - Which customers look like regular buyers of furniture?
      - Which products are most suitable for modern homes?
      - What type of products are popular in Port Mariamouth?
      
    3. "hybrid" ‚Üí if the query mixes both numeric and descriptive components.
       Examples:
       - What is the total count of clients who bought curtains and Which destination cities frequently receive d√©cor orders?? 

    4. "greet" ‚Üí greetings or conversational openers.
       Examples:
       - "Hi", "Hello", "Good morning", "Hey there"

    5. "ignore" ‚Üí unrelated or irrelevant to order data.
       Examples:
       - "Tell me a joke", "What's the time?", "Who is the CEO?"

    Question: {question}

    Return only one word:
    numeric, semantic, hybrid, greet, or ignore.
    """

    raw_label = llm.invoke(intent_prompt).content.strip().lower()

    # Tolerate punctuation, quotes or extra words around the label.
    found = re.search(r"\b(numeric|semantic|hybrid|greet|ignore)\b", raw_label)
    intent = found.group(1) if found else "ignore"

    print(f"üéØ Detected Intent: {intent}")
    state["intent"] = intent
    return state


def greet_node(state: dict):
    """Store the fixed greeting reply under ``"answer"`` and return the same state."""
    state.update(answer="Hello üëã! How can I assist you with the order data today?")
    return state


def ignore_node(state: dict):
    """Store the fixed off-topic reply under ``"answer"`` and return the same state."""
    state.update(
        answer="I'm designed to answer questions about the order dataset. Please ask something related."
    )
    return state


# Chat model shared by all node functions in this cell. It is defined after
# intent_node, which is fine because the nodes only look it up when called.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

def retriever_node(state: dict):
    """Answer the question from retrieved rows and store the reply in ``state["answer"]``.

    The hybrid retriever is queried with the question, the text of every hit is
    joined into the context, and the model is asked for a brief answer. Any
    exception is reported in the answer text, not raised.
    """
    question = state["question"]
    try:
        hits = hybrid_retriever.invoke(question)
        passages = [doc.page_content for doc in hits]
        context = "\n".join(passages)
        reply = llm.invoke(f"Context:\n{context}\n\nQuestion: {question}\nAnswer briefly:")
        state["answer"] = reply.content.strip()
    except Exception as e:
        state["answer"] = f"Error using retriever: {e}"
    return state


# Whitelist of column names; sql_validator_node rejects any double-quoted
# identifier that is not listed here.
VALID_COLUMNS = [
    "Order ID",
    "Client Name",
    "Email",
    "Contact Number",
    "Origin",
    "Destination",
    "Product Name",
    "Category",
    "Material",
    "Color",
    "Quantity",
    "Unit Price (‚Çπ)",
    "Total Price (‚Çπ)",
    "Order Date",
    "Delivery Date",
    "Status",
]

def sql_validator_node(state: dict):
    """Check the generated SQL and record the outcome in ``state["validation_error"]``.

    Three checks run in order; the first failure sets an error message and
    returns immediately:

    1. the query must start with ``select`` (case-insensitive);
    2. none of the forbidden keywords may appear as a whole word;
    3. every double-quoted identifier must be listed in ``VALID_COLUMNS``.

    Keywords are matched on word boundaries so that harmless text such as
    ``'updated'`` or ``created_at`` is not mistaken for UPDATE / CREATE.

    Args:
        state: Dict holding the SQL text under ``"sql_query"`` (a missing or
            ``None`` value is treated as an empty query).

    Returns:
        The same ``state`` with ``"validation_error"`` set to a message, or to
        ``None`` when all checks pass.
    """
    sql_query = (state.get("sql_query") or "").strip()
    lowered = sql_query.lower()
    print(f"üß© Validating SQL query: {sql_query}")

    # 1. Only SELECT statements are accepted.
    if not lowered.startswith("select"):
        state["validation_error"] = "‚ùå Only SELECT queries are allowed."
        return state

    # 2. Block data-modifying / schema-changing keywords (whole words only;
    #    a plain substring test also fires inside longer words).
    forbidden_keywords = ["insert", "update", "delete", "drop", "alter", "truncate", "create"]
    forbidden_pattern = r"\b(?:" + "|".join(forbidden_keywords) + r")\b"
    if re.search(forbidden_pattern, lowered):
        state["validation_error"] = (
            f"‚ùå Unsafe SQL operation detected. "
            f"Keywords like {', '.join(forbidden_keywords)} are not allowed."
        )
        return state

    # 3. Anything in double quotes is treated as a column reference and must
    #    be a known column (guards against hallucinated fields).
    for match in re.findall(r'"(.*?)"', sql_query):
        if match not in VALID_COLUMNS:
            state["validation_error"] = f"‚ùå Invalid column name used: '{match}'."
            return state

    # 4. All checks passed.
    state["validation_error"] = None
    print("‚úÖ SQL validation passed.")
    return state

def duckdb_node(state: dict):
    """Answer a structured question by generating, validating and running SQL.

    Steps: ask the model for SQL, strip markdown fences / backticks / a leading
    "SQL:" label, check the syntax with ``parse_one``, run
    ``sql_validator_node``, execute the query against the DuckDB file, then ask
    the model to explain the result table. The connection is opened read-only
    so that generated SQL cannot modify the database even if it slips past the
    validator.

    Args:
        state: Graph state; must contain ``"question"``.

    Returns:
        The same ``state`` with ``"sql_query"`` set to the generated SQL and
        ``"answer"`` set to the explanation, a "no records" message, or an
        error description. Exceptions are reported in ``"answer"``, not raised.
    """
    query = state["question"]

    try:
        # Step 1: ask the model to generate SQL.
        sql_prompt = f"{sql_system_prompt}\nUser question: {query}\nSQL:"
        sql_query = llm.invoke(sql_prompt).content.strip()

        # Step 2: remove formatting the model may wrap around the query.
        sql_query = (
            sql_query.replace("```sql", "")
                     .replace("```", "")
                     .replace("`", "")
                     .replace("SQL:", "")
                     .strip()
        )

        print(f"\nüß† Generated SQL query:\n{sql_query}")
        state["sql_query"] = sql_query

        # Step 3: syntax validation using SQLGlot.
        try:
            parse_one(sql_query)
            print("‚úÖ SQLGlot syntax check passed.")
        except Exception as parse_err:
            state["answer"] = f"‚ö†Ô∏è SQL syntax error detected: {parse_err}"
            return state

        # Step 4: custom safety validation (SELECT-only, keywords, columns).
        validation_state = sql_validator_node(state)
        if validation_state.get("validation_error"):
            state["answer"] = validation_state["validation_error"]
            return state

        # Step 5: execute on a read-only connection as a second line of
        # defence behind the keyword checks.
        try:
            with duckdb.connect(db_path, read_only=True) as con:
                result_df = con.execute(sql_query).fetchdf()

        except Exception as exec_err:
            state["answer"] = f"‚ö†Ô∏è SQL execution failed: {exec_err}"
            return state

        if result_df.empty:
            state["answer"] = "No matching records found."
            return state

        # Step 6: convert results to plain text.
        result_text = result_df.to_string(index=False)

        # Step 7: generate a human-readable summary.
        summary_prompt = f"""
        The user asked: {query}
        The SQL result is:
        {result_text}

        Write a natural, clear explanation of these results.
        Avoid skipping rows or making assumptions.
        """
        answer = llm.invoke(summary_prompt).content.strip()
        state["answer"] = answer

    except Exception as e:
        state["answer"] = f"Error executing SQL: {str(e)}"

    return state


def hybrid_node(state: dict):
    """Answer a hybrid (numeric + semantic) question and store the result in ``state["answer"]``.

    Pipeline:
      1. Ask the LLM to split ``state["question"]`` into a numeric and a
         semantic sub-question, plus a "dependent" flag.
      2. Ask the LLM to re-classify whether the two sub-questions are dependent
         (only when both sub-questions are present; the flag is irrelevant
         otherwise).
      3. Run the numeric sub-question through ``duckdb_node``.
      4. Run the semantic sub-question through ``retriever_node``.
      5. For independent queries with both results available, ask the LLM to
         check consistency and regenerate the semantic answer if it disagrees.
      6. Ask the LLM to merge both results into one final answer.

    Args:
        state: Graph state; must contain the key ``"question"``.

    Returns:
        The same ``state`` object with ``"answer"`` set. Any exception raised
        inside the node is caught and reported through ``state["answer"]``
        instead of propagating.
    """
    try:
        question = state["question"]
        print(f"\nüîÄ [Hybrid Node] Received question ‚Üí {question}")

        # STEP 1 - Query splitting
        split_prompt = f"""
        You are a **query-splitting assistant** for a hybrid SQL-semantic system.

        Analyze the following user query and output valid JSON with:
        - "numeric": SQL-based measurable part (count, sum, max, etc.)
        - "semantic": descriptive/contextual part
        - "dependent": true if semantic depends on numeric; false if both are independent.

        Examples:
        1Ô∏è‚É£ Dependent ‚Üí "Who made the highest purchase and how much was it?"
        {{
            "numeric": "Find the highest purchase amount and corresponding client name.",
            "semantic": "Who made that purchase and what was the total price?",
            "dependent": true
        }}

        2Ô∏è‚É£ Independent ‚Üí "What is the total revenue and list all product categories?"
        {{
            "numeric": "What is the total revenue?",
            "semantic": "List all product categories.",
            "dependent": false
        }}

        ---
        Question: {question}
        """
        split_result = llm.invoke(split_prompt).content.strip()
        print("üß© Raw Split Result:", split_result)

        parsed = _parse_llm_json(split_result)
        if parsed is None:
            numeric_part, semantic_part, dependent_from_split = "", "", False
            print("‚ö†Ô∏è Invalid JSON in split, skipping split-based dependency flag.")
        else:
            # The LLM may emit null / non-string values; normalise instead of
            # letting ``.strip()`` blow up and abort the whole node.
            numeric_part = _as_text(parsed.get("numeric"))
            semantic_part = _as_text(parsed.get("semantic"))
            dependent_from_split = _coerce_flag(parsed.get("dependent"), False)

        print(f"‚úÖ Parsed numeric part: {numeric_part}")
        print(f"‚úÖ Parsed semantic part: {semantic_part}")
        print(f"üîó Dependency from split: {dependent_from_split}")

        # STEP 2 - LLM-guided dependency classifier
        dependency_prompt = f"""
        You are a query dependency classifier.

        Determine if the following sub-questions are logically dependent:
        - Numeric sub-question: {numeric_part}
        - Semantic sub-question: {semantic_part}

        Return JSON:
        {{
            "dependent": true/false,
            "reason": "brief explanation"
        }}

        Example:
        {{
            "dependent": true,
            "reason": "The semantic sub-question refers to the same entity as the numeric result."
        }}
        """
        dependent = dependent_from_split
        if numeric_part and semantic_part:
            dependency_result = llm.invoke(dependency_prompt).content.strip()
            print("\nüß† Dependency Classifier Output:", dependency_result)

            dep_parsed = _parse_llm_json(dependency_result)
            if dep_parsed is None:
                print("‚ö†Ô∏è Could not parse dependency JSON ‚Äî using split result fallback.")
            else:
                dependent = _coerce_flag(dep_parsed.get("dependent"), dependent_from_split)
        # else: with a sub-question missing, the flag only feeds the validation
        # step below, which needs both answers anyway - skip the extra LLM call.

        print(f"‚úÖ Final dependency decision: {dependent}")

        # STEP 3 - Numeric answer (SQL via DuckDB)
        numeric_answer = ""
        if numeric_part:
            temp_state = {"question": numeric_part, "intent": "", "context": [], "answer": ""}
            numeric_state = duckdb_node(temp_state)
            numeric_answer = numeric_state.get("answer", "")
            print("\nüßÆ NUMERIC RESULT (SQL):")
            print(numeric_answer)
        else:
            print("‚ö†Ô∏è No numeric part found.")

        # STEP 4 - Semantic answer (retriever)
        semantic_answer = ""
        if semantic_part:
            temp_state = {"question": semantic_part, "intent": "", "context": [], "answer": ""}
            semantic_state = retriever_node(temp_state)
            semantic_answer = semantic_state.get("answer", "")
            print("\nüí¨ SEMANTIC RESULT (Retriever):")
            print(semantic_answer)
        else:
            print("‚ö†Ô∏è No semantic part found.")

        # STEP 5 - Validation (only for independent queries with both results)
        if dependent:
            print("‚úÖ Dependent query detected ‚Äî skipping independent validation step.")
        elif not (numeric_answer and semantic_answer):
            # Previously reported as "dependent", which was misleading.
            print("‚ö†Ô∏è Numeric or semantic result missing, skipping independent validation step.")
        else:
            validation_prompt = f"""
            Validate factual consistency between two independent results.

            Numeric result:
            {numeric_answer}

            Semantic result:
            {semantic_answer}

            If both address unrelated aspects (e.g., total vs list), mark as consistent.

            Reply only in JSON:
            {{
                "is_consistent": true/false,
                "issues": "briefly describe inconsistencies"
            }}
            """
            validation = llm.invoke(validation_prompt).content.strip()
            print("\nüîç Validation result:", validation)

            # Unparseable validator output is treated as "consistent" (best effort).
            validation_json = _parse_llm_json(validation)
            consistent = (
                True
                if validation_json is None
                else _coerce_flag(validation_json.get("is_consistent"), True)
            )

            if not consistent:
                print("‚ö†Ô∏è Inconsistency detected ‚Äî regenerating semantic answer with numeric context.")
                retry_prompt = f"""
                User question: {question}

                Numeric (SQL) result:
                {numeric_answer}

                Previous semantic result:
                {semantic_answer}

                Generate a consistent unified answer that aligns with the numeric facts.
                """
                semantic_answer = llm.invoke(retry_prompt).content.strip()
                print("\n‚ôªÔ∏è RETRIED SEMANTIC ANSWER:")
                print(semantic_answer)

        # STEP 6 - Combine results
        print("\nüß† Combining numeric & semantic results...")
        combine_prompt = f"""
        The user originally asked: {question}

        Numeric insight:
        {numeric_answer or "None"}

        Semantic insight:
        {semantic_answer or "None"}

        Combine both into a single, concise, factual answer.
        If dependent, mention both the entity and its numeric value clearly.
        """
        combined_response = llm.invoke(combine_prompt)
        final_answer = getattr(combined_response, "content", str(combined_response)).strip()

        state["answer"] = final_answer

    except Exception as e:
        print(f"‚ùå Error in hybrid_node: {str(e)}")
        state["answer"] = f"Error in hybrid_node: {str(e)}"

    return state


def _parse_llm_json(raw):
    """Strip Markdown code fences from an LLM reply and parse it as a JSON object.

    Returns the parsed dict, or None when the text is not valid JSON or the
    top-level JSON value is not an object.
    """
    cleaned = raw.replace("```json", "").replace("```", "").strip()
    try:
        parsed = json.loads(cleaned)
    except json.JSONDecodeError:
        return None
    return parsed if isinstance(parsed, dict) else None


def _as_text(value):
    """Return ``value`` as a stripped string; None becomes the empty string."""
    if value is None:
        return ""
    if isinstance(value, str):
        return value.strip()
    return str(value).strip()


def _coerce_flag(value, default):
    """Interpret an LLM-provided boolean, accepting "true"/"false" style strings.

    Anything that is neither a bool nor a recognised string yields ``default``.
    """
    if isinstance(value, bool):
        return value
    if isinstance(value, str):
        lowered = value.strip().lower()
        if lowered in ("true", "yes"):
            return True
        if lowered in ("false", "no"):
            return False
    return default


# Assemble the workflow: an "intent" entry node fans out to exactly one
# handler node, and every handler terminates the run.
graph = StateGraph(GraphState)

# Register each node under the name used by the routing table below.
for _node_name, _node_fn in (
    ("intent", intent_node),
    ("greet", greet_node),
    ("ignore", ignore_node),
    ("duckdb", duckdb_node),
    ("retriever", retriever_node),
    ("hybrid", hybrid_node),
):
    graph.add_node(_node_name, _node_fn)

graph.set_entry_point("intent")

# Route on the state's "intent" value; keys are intent labels, values are node names.
graph.add_conditional_edges(
    "intent",
    lambda state: state["intent"],
    dict(
        greet="greet",
        ignore="ignore",
        numeric="duckdb",
        semantic="retriever",
        hybrid="hybrid",
    ),
)

# Every handler node is terminal.
for _terminal in ("greet", "ignore", "duckdb", "retriever", "hybrid"):
    graph.add_edge(_terminal, END)

app = graph.compile()

if __name__ == "__main__":
    # Interactive console loop: read a question, run it through the compiled
    # graph, print the answer. "exit"/"quit", Ctrl-C or end-of-input stop it.
    print("\nüöÄ Smart Query Assistant ready! Type 'exit' to quit.\n")

    while True:
        try:
            user_input = input("You: ").strip()
            print(f"You: {user_input}")
            if not user_input:
                # Nothing to answer; avoid spending LLM calls on a blank line.
                continue
            if user_input.lower() in ["exit", "quit"]:
                print("Assistant: Goodbye üëã")
                break

            result = app.invoke({"question": user_input})
            print(f"Assistant: {result['answer']}\n")

        except (KeyboardInterrupt, EOFError):
            # EOFError must end the loop: once stdin is closed, input() raises
            # it on every call, so the generic handler below would spin forever.
            print("\nAssistant: Goodbye üëã")
            break
        except Exception as e:
            # Keep the session alive on per-question failures.
            print(f"‚ùå Error: {str(e)}")
            print("Please try again with a different question.\n")

✅ Generated 50 labelled chunks for embeddings.
✅ Loading existing FAISS index...
✅ FAISS index loaded successfully!

🚀 Smart Query Assistant ready! Type 'exit' to quit.

You: orders that were delivered on october month and list the product names
🎯 Detected Intent: hybrid

🔀 [Hybrid Node] Received question → orders that were delivered on october month and list the product names
🧩 Raw Split Result: {
    "numeric": "Find orders that were delivered in October.",
    "semantic": "List the product names associated with those orders.",
    "dependent": false
}
✅ Parsed numeric part: Find orders that were delivered in October.
✅ Parsed semantic part: List the product names associated with those orders.
🔗 Dependency from split: False

🧠 Dependency Classifier Output: {
    "dependent": true,
    "reason": "The semantic sub-question requires the results of the numeric sub-question to identify which product names to list, as it specifically pertains to the orders del