In [None]:
from langgraph.graph import StateGraph, END
from typing import TypedDict, List
from langchain_community.vectorstores import Chroma
from langchain.retrievers import BM25Retriever,EnsembleRetriever
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.documents import Document 
import pandas as pd
import duckdb
import re
import duckdb
import pandas as pd
from openai import OpenAI
from langchain.chat_models import ChatOpenAI
# df = duckdb.read_csv(csv_path).df()

# Path of the source CSV, relative to the current working directory.
csv_path = "Client_Shipment_Orders.csv"

# Load the whole CSV into a pandas DataFrame.
df = pd.read_csv(csv_path)

# Normalize every object-dtype column: cast to str, Title-Case, then trim
# surrounding whitespace.
# NOTE(review): this applies to all object columns (IDs and e-mail addresses
# included) and astype(str) also turns missing values into text — confirm
# that is intended.
# NOTE(review): `df` is rebound further down by
# `df, labelled_chunks = generate_labelled_chunks(csv_path)`, so this
# normalized frame is not the one later code sees — confirm.
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].astype(str).str.title().str.strip()

# In-memory DuckDB connection; the statement materializes `df` as table
# `orders` (it relies on DuckDB resolving the DataFrame by its variable name).
# NOTE(review): duckdb_node opens its own connection, so `con` looks unused in
# the visible code — confirm.
con = duckdb.connect()
con.execute("CREATE OR REPLACE TABLE orders AS SELECT * FROM df")

# Directory intended for the Chroma store.
# NOTE(review): reassigned to "D:\RAG Task" (space instead of underscore)
# before the vector store is built.
persist_directory = r"D:\RAG_Task"  



# System prompt for the text-to-SQL step: describes the `orders` schema, shows
# two sample rows, and lists the generation and safety rules the model is asked
# to follow. duckdb_node prepends this text to the user question.
# NOTE(review): the ```sql fence opened in rule 8 is never closed before the end
# of the string, and rule 5 asks for output without markdown — confirm intent.
sql_system_prompt = """
You are a SQL expert helping to query a DuckDB table named `orders`.

----------------------------------
TABLE INFORMATION
----------------------------------
Table name: orders  
Columns and their meanings:
- Order ID: Unique identifier for each order (text)
- Client Name: Name of the customer who placed the order (text)
- Email: Email address of the client (text)
- Contact Number: Client's contact phone number (text)
- Origin: Source location of the shipment (text)
- Destination: Delivery location of the shipment (text)
- Product Name: Name of the purchased product (text)
- Category: Product category (e.g., Furniture, Decor, Appliances)
- Material: Material type of the product (e.g., Wood, Glass, Metal)
- Color: Color of the product (text)
- Quantity: Number of units ordered (integer)
- Unit Price (‚Çπ): Price per unit in INR (numeric)
- Total Price (‚Çπ): Total order price in INR (numeric)
- Order Date: Date when the order was placed (date)
- Delivery Date: Date when the order was delivered (date)
- Status: Order status (e.g., Delivered, Pending, Cancelled)

----------------------------------
SAMPLE DATA
----------------------------------
ORD0001 | Kara Mata | chelsea75@yahoo.com | 038.830.3017x8206 | Port Mariamouth | Cohenmouth | Wall Art | Decor | Glass | Grey | 15 | 29878 | 448170 | 2025-05-13 | 2025-06-02 | Cancelled  
ORD0002 | Jesse Williams | ccasey@barrett.info | (426)505-2355 | Tamaraview | Lake Rickyport | Bed | Furniture | Glass | Brown | 30 | 1507 | 45210 | 2025-10-04 | 2025-11-03 | Cancelled  

----------------------------------
INSTRUCTIONS
----------------------------------
1. Generate SQL queries **only** for structured or numeric filters.
   Examples:
   - Total sales, sum, count, average, quantity, or price-based questions  
   - Filtering by columns such as Status, Category, Material, or Color  
   - Date-based filters (e.g., orders after 2025-05-01)

2. **Do NOT** generate queries based on subjective or descriptive logic
   such as reasons for cancellation, customer feedback, or preferences.
   These are handled separately by a semantic retriever system.

3. Use the correct table name `orders` and column names **exactly as shown**.
   Preserve proper case and special characters (e.g., `"Total Price (‚Çπ)"`).

4. Never hallucinate columns, tables, or calculations that do not exist.

5. Return **only** the SQL query ‚Äî no markdown, comments, or explanations.

6. **SAFETY RULES ‚Äî STRICTLY ENFORCED**
   - Never modify or delete data.
   - Do not use or suggest `UPDATE`, `DELETE`, `INSERT`, `DROP`, `TRUNCATE`, or `ALTER`.
   - Do not create or alter schemas, indexes, or tables.
   - Only allow read-only operations:  
     `SELECT`, `WHERE`, `GROUP BY`, `ORDER BY`, `LIMIT`, and aggregate functions (`COUNT`, `SUM`, `AVG`, `MIN`, `MAX`).

7. **Case Handling:**  
   When matching text values (like product or status), use `LOWER()` to make comparisons case-insensitive.  
   Example:  
   `WHERE LOWER("Product Name") = LOWER('Toilet Bowl')`

8. **Special Handling ‚Äî Highest or Maximum Queries:**  
   If the user asks questions like  
   *‚ÄúWho made the highest purchase?‚Äù*,  
   *‚ÄúWhich client has the largest total?‚Äù*, or  
   *‚ÄúTop buyer / maximum purchase amount‚Äù*,  
   use this pattern to avoid grouping errors:
   ```sql
   SELECT "Client Name", "Total Price (‚Çπ)"
   FROM orders
   WHERE "Total Price (‚Çπ)" = (
       SELECT MAX("Total Price (‚Çπ)") FROM orders
   );
"""

#  ROW-WISE LABELLED CHUNK GENERATION

def generate_labelled_chunks(csv_path):
    """Read a CSV and render each row as a block of ``column: value`` lines.

    Args:
        csv_path: Path of the CSV file, passed to ``pd.read_csv``.

    Returns:
        A ``(frame, chunks)`` tuple: the DataFrame as read, and one string per
        row that starts with ``Row ID: <index>`` followed by one
        ``<column>: <value>`` line per column.
    """
    frame = pd.read_csv(csv_path)
    column_names = list(frame.columns)

    chunks = [
        "\n".join(
            [f"Row ID: {row_id}"]
            + [f"{name}: {record[name]}" for name in column_names]
        ).strip()
        for row_id, record in frame.iterrows()
    ]
    return frame, chunks


# Build the row-level text chunks. This also rebinds the module-level `df` to
# the DataFrame returned by the helper (freshly read from the CSV).
df, labelled_chunks = generate_labelled_chunks(csv_path)

# One LangChain Document per CSV row.
documents = [Document(page_content=chunk) for chunk in labelled_chunks]

print(f"‚úÖ Generated {len(labelled_chunks)} labelled chunks for embeddings.")


embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
# Raw string: "\R" is not a valid escape sequence, so the non-raw literal
# triggered an invalid-escape warning. The resulting path is unchanged.
persist_directory = r"D:\RAG Task"
collection_name = "shipment_orders"

# NOTE(review): from_documents embeds and inserts all documents on every run;
# with a persistent directory this presumably accumulates duplicates — confirm.
vector_store = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
    persist_directory=persist_directory,
    collection_name=collection_name,
)


# Dense retriever (top 10) and keyword retriever (top 5) over the same documents.
vector_retriever = vector_store.as_retriever(search_kwargs={"k": 10})
keyword_retriever = BM25Retriever.from_documents(documents)
keyword_retriever.k = 5

# Weighted ensemble: 0.6 for the vector retriever, 0.4 for BM25.
hybrid_retriever = EnsembleRetriever(
    retrievers=[vector_retriever, keyword_retriever],
    weights=[0.6, 0.4],
)

class GraphState(TypedDict):
    """State dictionary handed to every node of the graph."""

    # User question; read by the intent, duckdb, retriever and hybrid nodes.
    question: str
    # Label written by intent_node and used to pick the next node.
    intent: str
    # NOTE(review): no node visible in this file reads or writes this key.
    context: List[str]
    # Reply text set by whichever node handles the question.
    answer: str


def intent_node(state: dict):
    """Classify the user's question with the LLM and store the label in the state.

    Args:
        state: Graph state; ``state["question"]`` is read.

    Returns:
        The same ``state`` dict with ``state["intent"]`` set to one of
        ``numeric``, ``semantic``, ``hybrid``, ``greet`` or ``ignore``.
    """
    question = state["question"]

    intent_prompt = f"""
    You are an intent classifier for user questions over an **Orders dataset**.
    The table contains the following columns:
    Order ID, Client Name, Email, Contact Number, Origin, Destination,
    Product Name, Category, Material, Color, Quantity, Unit Price (‚Çπ),
    Total Price (‚Çπ), Order Date, Delivery Date, Status.

    Classify the intent of the question as one of the following:

    1. "numeric" ‚Üí if the query involves structured, measurable, or count-based data.
       Examples:
       - "How many orders are pending?"
       - "What is the total sales amount?"
       - "Show the average unit price."
       - "Count the number of clients."
       - "List orders where quantity > 10.
       - "Who made the highest purchase?"
       - "names of customers who ordered bed "

    2. "semantic" ‚Üí if the query involves descriptive or text-based attributes
       such true semantic questions, i.e., ones that are descriptive, interpretive, or text-based, not solvable with SQL filters or numbers.
T       These rely on understanding meaning, patterns, or unstructured context rather than column values.
       Examples:
      - Which customers look like regular buyers of furniture?
      - Which products are most suitable for modern homes?
      - What type of products are popular in Port Mariamouth?
      
    3. "hybrid" ‚Üí if the query mixes both numeric and descriptive components.
       Examples:
       - What is the total count of clients who bought curtains and Which destination cities frequently receive d√©cor orders?? 

    4. "greet" ‚Üí greetings or conversational openers.
       Examples:
       - "Hi", "Hello", "Good morning", "Hey there"

    5. "ignore" ‚Üí unrelated or irrelevant to order data.
       Examples:
       - "Tell me a joke", "What's the time?", "Who is the CEO?"

    Question: {question}

    Return only one word:
    numeric, semantic, hybrid, greet, or ignore.
    """

    raw_reply = llm.invoke(intent_prompt).content.strip().lower()

    # The reply is used as a routing key, so it must be exactly one of the known
    # labels. Pick the first known label out of the reply (tolerates quotes,
    # trailing punctuation or extra words) and fall back to "ignore" otherwise.
    found = re.search(r"\b(numeric|semantic|hybrid|greet|ignore)\b", raw_reply)
    intent = found.group(1) if found else "ignore"

    print(f"üéØ Detected Intent: {intent}")
    state["intent"] = intent
    return state


def greet_node(state: dict):
    """Put the fixed greeting into ``state["answer"]`` and return the same dict."""
    greeting = "Hello üëã! How can I assist you with the order data today?"
    state.update(answer=greeting)
    return state


def ignore_node(state: dict):
    """Put the fixed out-of-scope reply into ``state["answer"]`` and return the same dict."""
    out_of_scope_reply = (
        "I'm designed to answer questions about the order dataset. "
        "Please ask something related."
    )
    state.update(answer=out_of_scope_reply)
    return state



# Chat model shared by all nodes (they reference the module-level name `llm`);
# created with temperature=0.
# NOTE(review): ChatOpenAI is imported twice at the top of the cell
# (langchain_openai, then langchain.chat_models); the later import is the one
# bound here.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

def retriever_node(state: dict):
    """Answer the question from retrieved row chunks.

    Retrieves documents for ``state["question"]`` with ``hybrid_retriever``,
    joins their text into a context block, asks the LLM for a brief answer and
    stores it in ``state["answer"]``. Any exception raised while doing so is
    reported as text in ``state["answer"]`` instead of propagating.

    Returns:
        The same ``state`` dict.
    """
    user_question = state["question"]
    try:
        hits = hybrid_retriever.invoke(user_question)
        joined_context = "\n".join(hit.page_content for hit in hits)
        reply = llm.invoke(
            f"Context:\n{joined_context}\n\nQuestion: {user_question}\nAnswer briefly:"
        )
        state["answer"] = reply.content.strip()
    except Exception as err:
        state["answer"] = f"Error using retriever: {err}"
    return state


# Column names that generated SQL may reference inside double quotes;
# sql_validator_node rejects a query that double-quotes any other name.
VALID_COLUMNS = [
    "Order ID", "Client Name", "Email", "Contact Number",
    "Origin", "Destination", "Product Name", "Category",
    "Material", "Color", "Quantity", "Unit Price (‚Çπ)",
    "Total Price (‚Çπ)", "Order Date", "Delivery Date", "Status"
]
from sqlglot import parse_one
import re

def sql_validator_node(state: dict):
    """Check that the generated SQL is a read-only query over known columns.

    Reads ``state["sql_query"]`` and sets ``state["validation_error"]`` to an
    error message, or to ``None`` when every check passes.

    Checks, in order:
      1. The query must start with ``SELECT``.
      2. No data-modifying keyword may appear as a whole word.
      3. Every double-quoted identifier must be a known column or an alias
         introduced in the same query with ``AS "..."``.

    Returns:
        The same ``state`` dict.
    """
    # A missing or None query is treated as empty and rejected by check 1.
    sql_query = (state.get("sql_query") or "").strip()
    print(f"üß© Validating SQL query: {sql_query}")

    # 1. Ensure it's a SELECT query
    if not sql_query.lower().startswith("select"):
        state["validation_error"] = "‚ùå Only SELECT queries are allowed."
        return state

    # 2. Block dangerous operations. Match whole words only, so that values or
    #    names merely containing a keyword ('Updated', created_at) are accepted.
    forbidden_keywords = ["insert", "update", "delete", "drop", "alter", "truncate", "create"]
    keyword_pattern = r"\b(?:" + "|".join(forbidden_keywords) + r")\b"
    if re.search(keyword_pattern, sql_query.lower()):
        state["validation_error"] = (
            f"‚ùå Unsafe SQL operation detected. "
            f"Keywords like {', '.join(forbidden_keywords)} are not allowed."
        )
        return state

    # 3. Validate columns used in SQL (prevent hallucinated fields). Output
    #    aliases declared with AS "..." are not table columns, so they are
    #    allowed wherever they appear in the query.
    aliases = set(re.findall(r'\bas\s+"([^"]*)"', sql_query, flags=re.IGNORECASE))
    for match in re.findall(r'"(.*?)"', sql_query):
        if match in aliases or match in VALID_COLUMNS:
            continue
        state["validation_error"] = f"‚ùå Invalid column name used: '{match}'."
        return state

    # 4. Passed all checks
    state["validation_error"] = None
    print("‚úÖ SQL validation passed.")
    return state

def duckdb_node(state: dict):
    """Answer a structured question by generating, validating and running SQL.

    Steps: ask the LLM for a SQL query, strip markdown residue, syntax-check it
    with sqlglot using the DuckDB dialect, run ``sql_validator_node``, execute
    the query against the module-level ``df`` registered as ``orders`` in a
    temporary DuckDB connection, then ask the LLM to summarize the result.

    Args:
        state: Graph state; ``state["question"]`` is read.

    Returns:
        The same ``state`` dict with ``state["answer"]`` set to the summary or
        to an error / "no records" message. ``state["sql_query"]`` holds the
        cleaned query once it has been generated.
    """
    query = state["question"]

    try:
        # Step 1: Ask LLM to generate SQL
        sql_prompt = f"{sql_system_prompt}\nUser question: {query}\nSQL:"
        sql_query = llm.invoke(sql_prompt).content.strip()

        # Step 2: Clean LLM formatting (code fences, stray backticks, echoed "SQL:")
        sql_query = (
            sql_query.replace("```sql", "")
                     .replace("```", "")
                     .replace("`", "")
                     .replace("SQL:", "")
                     .strip()
        )

        print(f"\nüß† Generated SQL query:\n{sql_query}")
        state["sql_query"] = sql_query

        # Step 3: Syntax validation using SQLGlot. The query is executed by
        # DuckDB, so parse it with the DuckDB dialect rather than the generic one.
        try:
            parse_one(sql_query, read="duckdb")
            print("‚úÖ SQLGlot syntax check passed.")
        except Exception as parse_err:
            state["answer"] = f"‚ö†Ô∏è SQL syntax error detected: {parse_err}"
            return state

        # Step 4: Custom SQL safety validation
        validation_state = sql_validator_node(state)
        if validation_state.get("validation_error"):
            state["answer"] = validation_state["validation_error"]
            return state

        # Step 5: Execute inside a throwaway in-memory DuckDB connection
        try:
            with duckdb.connect() as temp_con:
                temp_con.register("orders", df)
                result_df = temp_con.execute(sql_query).fetchdf()
        except Exception as exec_err:
            state["answer"] = f"‚ö†Ô∏è SQL execution failed: {exec_err}"
            return state

        if result_df.empty:
            state["answer"] = "No matching records found."
            return state

        # Step 6: Convert results to plain text
        result_text = result_df.to_string(index=False)

        # Step 7: Generate human-readable summary
        summary_prompt = f"""
        The user asked: {query}
        The SQL result is:
        {result_text}

        Write a natural, clear explanation of these results.
        Avoid skipping rows or making assumptions.
        """
        answer = llm.invoke(summary_prompt).content.strip()
        state["answer"] = answer

    except Exception as e:
        # Boundary handler: report any unexpected failure as the answer text.
        state["answer"] = f"Error executing SQL: {str(e)}"

    return state

def _parse_json_reply(raw: str) -> dict:
    """Strip markdown code fences from an LLM reply and decode it as a JSON object.

    Raises:
        ValueError: if the text is not valid JSON (``json.JSONDecodeError`` is a
            ``ValueError`` subclass) or decodes to something other than an object.
    """
    import json

    cleaned = raw.replace("```json", "").replace("```", "").strip()
    parsed = json.loads(cleaned)
    if not isinstance(parsed, dict):
        raise ValueError("expected a JSON object")
    return parsed


def hybrid_node(state: dict):
    """Handle a question that mixes a numeric and a descriptive part.

    The LLM splits the question into a "numeric" and a "semantic" sub-question
    and flags whether they are dependent. The numeric part is answered by
    ``duckdb_node`` and the semantic part by ``retriever_node``. For dependent
    questions the two answers are checked for consistency by the LLM and the
    semantic answer is regenerated once if they disagree. Finally the LLM merges
    both answers into ``state["answer"]``.

    Args:
        state: Graph state; ``state["question"]`` is read.

    Returns:
        The same ``state`` dict; on any unexpected error ``state["answer"]``
        holds an error message instead.
    """
    try:
        question = state["question"]
        print(f"\nüîÄ [Hybrid Node] Received question ‚Üí {question}")

        # 1. Strict split prompt with dependency detection
        split_prompt = f"""
            You are a **strict query-splitting assistant** for a hybrid SQL-semantic reasoning system.

            Your task is to analyze a user's natural question and split it into two precise sub-questions:
            - "numeric": a part that involves quantitative or SQL-based reasoning (e.g., count, sum, max, min, average, total, etc.)
            - "semantic": a descriptive or entity-based part that involves understanding names, descriptions, or meanings.
            Also, specify whether these two parts are **dependent** (the semantic part relies on the numeric answer)
            or **independent** (they can be answered separately).

            Be extremely careful:
            - DO NOT merge both questions into one.
            - DO NOT make assumptions or hallucinate missing data.
            - Your output MUST always be a valid JSON object with exactly three keys: "numeric", "semantic", and "dependent".
            - Always detect if the question indirectly involves SQL-style numeric reasoning even if phrased differently (e.g., ‚Äúbiggest‚Äù, ‚Äúmost‚Äù, ‚Äúhighest‚Äù, ‚Äútop‚Äù, etc.).

            ---
            ### ‚öôÔ∏è SQL Integration Note
            If the question is **dependent** (e.g., "Who made the highest purchase and how much was it?"),
            this will later be used by a SQL generator. So:
            - Ensure the "numeric" question extracts the quantitative aspect (like ‚Äúhighest purchase amount‚Äù).
            - Ensure the "semantic" question clearly describes the related entity (like ‚Äúwho made that purchase‚Äù).
            - The SQL generator will then produce a **single combined SQL query** that includes both numeric and descriptive fields 
            (e.g., client name + total price) to answer both parts together.

            ---
            ### ‚úÖ Examples

            **Example 1 ‚Äî Dependent query**
            User question: "Who made the highest purchase and how much was the amount?"
            Output:
            {{
            "numeric": "What is the highest purchase amount?",
            "semantic": "Who made that highest purchase?",
            "dependent": true
            }}

            **Example 2 ‚Äî Dependent query**
            User question: "Which product had the highest sales and what was the total revenue for it?"
            Output:
            {{
            "numeric": "Which product had the highest sales?",
            "semantic": "What was the total revenue for that product?",
            "dependent": true
            }}

            **Example 3 ‚Äî Independent query**
            User question: "How many orders were placed, and who are the top 5 customers?"
            Output:
            {{
            "numeric": "How many orders were placed?",
            "semantic": "Who are the top 5 customers?",
            "dependent": false
            }}

            **Example 4 ‚Äî Independent query**
            User question: "What is the total revenue and list all product categories?"
            Output:
            {{
            "numeric": "What is the total revenue?",
            "semantic": "List all product categories.",
            "dependent": false
            }}

            ---
            Now process this question and return **valid JSON only** (no explanations):

            Question: {question}
            """


        split_result = llm.invoke(split_prompt).content.strip()
        print("üß© Raw Split Result:", split_result)

        try:
            parsed = _parse_json_reply(split_result)
            # `or ""` + str(): tolerate null or non-string values in the reply.
            numeric_part = str(parsed.get("numeric") or "").strip()
            semantic_part = str(parsed.get("semantic") or "").strip()
            dependent = parsed.get("dependent", False)
        except ValueError:
            numeric_part, semantic_part, dependent = "", "", False
            print("‚ö†Ô∏è Invalid JSON in split, skipping.")

        print(f"‚úÖ Parsed numeric part: {numeric_part}")
        print(f"‚úÖ Parsed semantic part: {semantic_part}")
        print(f"üîó Dependency detected: {dependent}")

        # 2. Get numeric answer from DuckDB
        numeric_answer = ""
        if numeric_part:
            temp_state = {"question": numeric_part, "intent": "", "context": [], "answer": ""}
            numeric_state = duckdb_node(temp_state)
            numeric_answer = numeric_state.get("answer", "")
            print("\nüßÆ NUMERIC RESULT (from SQL):")
            print(numeric_answer)
        else:
            print("‚ö†Ô∏è No numeric part found.")

        # 3. Get semantic answer from Retriever
        semantic_answer = ""
        if semantic_part:
            temp_state = {"question": semantic_part, "intent": "", "context": [], "answer": ""}
            semantic_state = retriever_node(temp_state)
            semantic_answer = semantic_state.get("answer", "")
            print("\nüí¨ SEMANTIC RESULT (from Retriever):")
            print(semantic_answer)
        else:
            print("‚ö†Ô∏è No semantic part found.")

        # 4. Validation: only if dependent
        if dependent and numeric_answer and semantic_answer:
            validation_prompt = f"""
            The following query is DEPENDENT ‚Äî the semantic part relies on the numeric SQL result.
            Check if the semantic result aligns factually with the numeric SQL result.
            Focus on consistency between numbers, names, or facts.

            Numeric result:
            {numeric_answer}

            Semantic result:
            {semantic_answer}

            Reply with JSON:
            {{
                "is_consistent": true/false,
                "issues": "describe discrepancies briefly"
            }}
            """
            validation = llm.invoke(validation_prompt).content.strip()
            print("\nüîç Validation result:", validation)

            try:
                consistent = _parse_json_reply(validation).get("is_consistent", True)
            except ValueError:
                # Unreadable verdict: keep the semantic answer as it is.
                consistent = True

            # Retry if inconsistent
            if not consistent:
                print("‚ö†Ô∏è Semantic answer inconsistent with SQL ‚Äî retrying with numeric context...")
                retry_prompt = f"""
                The user asked: {question}

                The numeric SQL result says:
                {numeric_answer}

                The previous semantic answer was:
                {semantic_answer}

                Please re-generate a consistent, unified answer that aligns with the numeric facts.
                """
                semantic_answer = llm.invoke(retry_prompt).content.strip()
                print("\n‚ôªÔ∏è RETRIED SEMANTIC ANSWER:")
                print(semantic_answer)
        else:
            print("‚úÖ Independent or single-part query ‚Äî skipping validation.")

        # 5. Combine answers
        print("\nüß† Combining numeric & semantic results...")
        combine_prompt = f"""
        The user originally asked: {question}

        Numeric insight (SQL result):
        {numeric_answer or "None"}

        Semantic insight (retrieved/explained result):
        {semantic_answer or "None"}

        Combine both into one clear, factual, and natural-language answer.
        Avoid SQL or technical terms.
        """
        combined_response = llm.invoke(combine_prompt)
        final_answer = getattr(combined_response, "content", str(combined_response)).strip()

        # Assign final answer
        state["answer"] = final_answer

    except Exception as e:
        # Boundary handler: report any unexpected failure as the answer text.
        print(f"‚ùå Error in hybrid_node: {str(e)}")
        state["answer"] = f"Error in hybrid_node: {str(e)}"

    return state


# def hybrid_node(state: dict):
#     """Handles hybrid queries by splitting into numeric & semantic sub-questions with validation & retry."""
#     try:
#         question = state["question"]
#         print(f"\nüîÄ [Hybrid Node] Received question ‚Üí {question}")

#         # 1Ô∏è‚É£ Split query into numeric & semantic parts
#         split_prompt = f"""
#         Split the user query into numeric and semantic parts.
#         - Numeric: can be answered by SQL (count, filter, sum, max, etc.)
#         - Semantic: descriptive or entity-based part
#         Return valid JSON only:
#         {{
#             "numeric": "numeric sub-question",
#             "semantic": "semantic sub-question"
#         }}
#         Question: {question}
#         """
#         split_result = llm.invoke(split_prompt).content.strip()
#         print("üß© Raw Split Result:", split_result)

#         import json
#         try:
#             split_result = split_result.replace("```json", "").replace("```", "").strip()
#             parsed = json.loads(split_result)
#             numeric_part = parsed.get("numeric", "").strip()
#             semantic_part = parsed.get("semantic", "").strip()
#         except json.JSONDecodeError:
#             numeric_part, semantic_part = "", ""
#             print("‚ö†Ô∏è Invalid JSON in split, skipping.")
#         print(f"‚úÖ Parsed numeric part: {numeric_part}")
#         print(f"‚úÖ Parsed semantic part: {semantic_part}")

#         # 2Ô∏è‚É£ Get numeric answer from DuckDB
#         numeric_answer = ""
#         if numeric_part:
#             temp_state = {"question": numeric_part, "intent": "", "context": [], "answer": ""}
#             numeric_state = duckdb_node(temp_state)
#             numeric_answer = numeric_state.get("answer", "")
#             print("\nüßÆ NUMERIC RESULT (from SQL):")
#             print(numeric_answer)
#         else:
#             print("‚ö†Ô∏è No numeric part found.")

#         # 3Ô∏è‚É£ Get semantic answer from Retriever
#         semantic_answer = ""
#         if semantic_part:
#             temp_state = {"question": semantic_part, "intent": "", "context": [], "answer": ""}
#             semantic_state = retriever_node(temp_state)
#             semantic_answer = semantic_state.get("answer", "")
#             print("\nüí¨ SEMANTIC RESULT (from Retriever):")
#             print(semantic_answer)
#         else:
#             print("‚ö†Ô∏è No semantic part found.")

#         # 4Ô∏è‚É£ Validation: check for missing or inconsistent entities
#         if numeric_answer and semantic_answer:
#             validation_prompt = f"""
#             Check if the following semantic answer is factually consistent with the numeric SQL result.
#             Highlight missing or inconsistent details, especially names, numbers, or facts.

#             Numeric result:
#             {numeric_answer}

#             Semantic result:
#             {semantic_answer}

#             Reply with:
#             {{
#                 "is_consistent": true/false,
#                 "issues": "describe discrepancies briefly"
#             }}
#             """
#             validation = llm.invoke(validation_prompt).content.strip()
#             print("\nüîç Validation result:", validation)

#             try:
#                 validation_json = json.loads(validation.replace("```json", "").replace("```", "").strip())
#                 consistent = validation_json.get("is_consistent", True)
#             except:
#                 consistent = True  # fail-safe

#             # 5Ô∏è‚É£ If inconsistent ‚Äî retry semantic reasoning with context
#             if not consistent:
#                 print("‚ö†Ô∏è Semantic answer inconsistent with SQL ‚Äî retrying with numeric context...")
#                 retry_prompt = f"""
#                 The user asked: {question}

#                 The numeric SQL result says:
#                 {numeric_answer}

#                 The previous semantic answer was:
#                 {semantic_answer}

#                 Please re-generate a consistent, unified answer that aligns with the numeric facts.
#                 """
#                 semantic_answer = llm.invoke(retry_prompt).content.strip()
#                 print("\n‚ôªÔ∏è RETRIED SEMANTIC ANSWER:")
#                 print(semantic_answer)

#         # 6Ô∏è‚É£ Final combination
#         print("\nüß† Combining numeric & semantic results...")
#         combine_prompt = f"""
#         The user originally asked: {question}

#         Numeric insight (SQL result):
#         {numeric_answer or "None"}

#         Semantic insight (retrieved/explained result):
#         {semantic_answer or "None"}

#         Combine both into one clear, factual, and natural-language answer.
#         Avoid SQL or technical references.
#         """
#         combined_response = llm.invoke(combine_prompt)
#         final_answer = getattr(combined_response, "content", str(combined_response)).strip()

#         # print("\nüí° FINAL COMBINED ANSWER:")
#         # print(final_answer)
#         state["answer"] = final_answer

#     except Exception as e:
#         print(f"‚ùå Error in hybrid_node: {str(e)}")
#         state["answer"] = f"Error in hybrid_node: {str(e)}"

#     return state




# Intent label -> name of the node that handles it.
_INTENT_ROUTES = {
    "greet": "greet",
    "ignore": "ignore",
    "numeric": "duckdb",
    "semantic": "retriever",
    "hybrid": "hybrid",
}


def _route_intent(state):
    """Return the routing label for the conditional edge.

    Falls back to "ignore" when the stored intent is missing or is not one of
    the mapped labels, so that an unexpected classifier reply cannot produce a
    route that has no target node.
    """
    label = state.get("intent", "")
    return label if label in _INTENT_ROUTES else "ignore"


# Assemble the graph: classify the intent first, then run exactly one handler.
graph = StateGraph(GraphState)
graph.add_node("intent", intent_node)
graph.add_node("greet", greet_node)
graph.add_node("ignore", ignore_node)
graph.add_node("duckdb", duckdb_node)
graph.add_node("retriever", retriever_node)
graph.add_node("hybrid", hybrid_node)

graph.set_entry_point("intent")

graph.add_conditional_edges("intent", _route_intent, _INTENT_ROUTES)

# Every handler ends the run.
graph.add_edge("greet", END)
graph.add_edge("ignore", END)
graph.add_edge("duckdb", END)
graph.add_edge("retriever", END)
graph.add_edge("hybrid", END)

app = graph.compile()

# Interactive console loop: read a question, run the graph, print the answer.
if __name__ == "__main__":
    print("\nüöÄ Smart Query Assistant ready! Type 'exit' to quit.\n")

    while True:
        try:
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C: leave the loop instead of dying with a traceback.
            print("\nAssistant: Goodbye üëã")
            break

        print("You :", user_input)
        if user_input.lower() in ["exit", "quit"]:
            print("Assistant: Goodbye üëã")
            break
        if not user_input:
            # Nothing to classify; do not spend an LLM call on a blank line.
            continue

        result = app.invoke({"question": user_input})
        print(f"Assistant: {result['answer']}\n")


In [None]:
from langgraph.graph import StateGraph, END
from typing import TypedDict, List
from langchain_community.vectorstores import Chroma
from langchain.retrievers import BM25Retriever,EnsembleRetriever
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.documents import Document 
import pandas as pd
import duckdb
import re
import duckdb
import pandas as pd
from openai import OpenAI
from langchain.chat_models import ChatOpenAI


import duckdb
import pandas as pd

# ‚úÖ Step 1: Define paths
# Absolute locations of the source CSV and of the DuckDB database file.
csv_path = r"D:\RAG Task\Client_Shipment_Orders.csv"
db_path = r"D:\RAG Task\orders.duckdb"

# Step 2: create the `orders` table from the CSV. Because of IF NOT EXISTS the
# statement does nothing when the table is already in the database file, so
# later edits to the CSV are not picked up.
with duckdb.connect(db_path) as con:
    # Create the 'orders' table if not already present
    con.execute(f"""
        CREATE TABLE IF NOT EXISTS orders AS
        SELECT * FROM read_csv_auto('{csv_path}');
    """)
    # Optional: refresh the data if the CSV has been updated
    # con.execute(f"DELETE FROM orders; INSERT INTO orders SELECT * FROM read_csv_auto('{csv_path}');")

# Step 3: read the whole table back into a pandas DataFrame.
with duckdb.connect(db_path) as con:
    df = con.execute("SELECT * FROM orders").fetchdf()

# Step 4: normalize every object-dtype column: cast to str, trim, Title-Case.
# NOTE(review): `df` is rebound further down by
# `df, labelled_chunks = generate_labelled_chunks(csv_path)`, so this
# normalized frame is not the one later code sees — confirm.
for col in df.select_dtypes(include=["object"]).columns:
    df[col] = df[col].astype(str).str.strip().str.title()

# Step 5: directory for the Chroma store (assigned again before Chroma is built).
persist_directory = r"D:\RAG Task"




# System prompt for the text-to-SQL step: describes the `orders` schema, shows
# two sample rows, and lists the generation and safety rules the model is asked
# to follow.
# NOTE(review): the ```sql fence opened in rule 8 is never closed before the end
# of the string, and rule 5 asks for output without markdown — confirm intent.
sql_system_prompt = """
You are a SQL expert helping to query a DuckDB table named `orders`.

----------------------------------
TABLE INFORMATION
----------------------------------
Table name: orders  
Columns and their meanings:
- Order ID: Unique identifier for each order (text)
- Client Name: Name of the customer who placed the order (text)
- Email: Email address of the client (text)
- Contact Number: Client's contact phone number (text)
- Origin: Source location of the shipment (text)
- Destination: Delivery location of the shipment (text)
- Product Name: Name of the purchased product (text)
- Category: Product category (e.g., Furniture, Decor, Appliances)
- Material: Material type of the product (e.g., Wood, Glass, Metal)
- Color: Color of the product (text)
- Quantity: Number of units ordered (integer)
- Unit Price (‚Çπ): Price per unit in INR (numeric)
- Total Price (‚Çπ): Total order price in INR (numeric)
- Order Date: Date when the order was placed (date)
- Delivery Date: Date when the order was delivered (date)
- Status: Order status (e.g., Delivered, Pending, Cancelled)

----------------------------------
SAMPLE DATA
----------------------------------
ORD0001 | Kara Mata | chelsea75@yahoo.com | 038.830.3017x8206 | Port Mariamouth | Cohenmouth | Wall Art | Decor | Glass | Grey | 15 | 29878 | 448170 | 2025-05-13 | 2025-06-02 | Cancelled  
ORD0002 | Jesse Williams | ccasey@barrett.info | (426)505-2355 | Tamaraview | Lake Rickyport | Bed | Furniture | Glass | Brown | 30 | 1507 | 45210 | 2025-10-04 | 2025-11-03 | Cancelled  

----------------------------------
INSTRUCTIONS
----------------------------------
1. Generate SQL queries **only** for structured or numeric filters.
   Examples:
   - Total sales, sum, count, average, quantity, or price-based questions  
   - Filtering by columns such as Status, Category, Material, or Color  
   - Date-based filters (e.g., orders after 2025-05-01)

2. **Do NOT** generate queries based on subjective or descriptive logic
   such as reasons for cancellation, customer feedback, or preferences.
   These are handled separately by a semantic retriever system.

3. Use the correct table name `orders` and column names **exactly as shown**.
   Preserve proper case and special characters (e.g., `"Total Price (‚Çπ)"`).

4. Never hallucinate columns, tables, or calculations that do not exist.

5. Return **only** the SQL query ‚Äî no markdown, comments, or explanations.

6. **SAFETY RULES ‚Äî STRICTLY ENFORCED**
   - Never modify or delete data.
   - Do not use or suggest `UPDATE`, `DELETE`, `INSERT`, `DROP`, `TRUNCATE`, or `ALTER`.
   - Do not create or alter schemas, indexes, or tables.
   - Only allow read-only operations:  
     `SELECT`, `WHERE`, `GROUP BY`, `ORDER BY`, `LIMIT`, and aggregate functions (`COUNT`, `SUM`, `AVG`, `MIN`, `MAX`).

7. **Case Handling:**  
   When matching text values (like product or status), use `LOWER()` to make comparisons case-insensitive.  
   Example:  
   `WHERE LOWER("Product Name") = LOWER('Toilet Bowl')`

8. **Special Handling ‚Äî Highest or Maximum Queries:**  
   If the user asks questions like  
   *‚ÄúWho made the highest purchase?‚Äù*,  
   *‚ÄúWhich client has the largest total?‚Äù*, or  
   *‚ÄúTop buyer / maximum purchase amount‚Äù*,  
   use this pattern to avoid grouping errors:
   ```sql
   SELECT "Client Name", "Total Price (‚Çπ)"
   FROM orders
   WHERE "Total Price (‚Çπ)" = (
       SELECT MAX("Total Price (‚Çπ)") FROM orders
   );
"""

#  ROW-WISE LABELLED CHUNK GENERATION

def generate_labelled_chunks(csv_path):
    """Read a CSV and render every row as a labelled text chunk.

    Each chunk starts with a ``Row ID: <index>`` line followed by one
    ``<column>: <value>`` line per column; surrounding whitespace is
    stripped from the finished chunk.

    Args:
        csv_path: Path of the CSV file handed to ``pd.read_csv``.

    Returns:
        A ``(frame, chunks)`` tuple: the loaded DataFrame and the list of
        chunk strings, one per row, in row order.
    """
    frame = pd.read_csv(csv_path)
    column_names = list(frame.columns)

    def render(row_id, record):
        # Header line first, then one "column: value" line per column.
        lines = [f"Row ID: {row_id}"]
        lines.extend(f"{name}: {record[name]}" for name in column_names)
        return "\n".join(lines).strip()

    chunks = [render(row_id, record) for row_id, record in frame.iterrows()]
    return frame, chunks


df, labelled_chunks = generate_labelled_chunks(csv_path)

# One Document per row chunk; shared by the vector store and the BM25 index.
documents = [Document(page_content=chunk) for chunk in labelled_chunks]

print(f"‚úÖ Generated {len(labelled_chunks)} labelled chunks for embeddings.")


embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
# Raw string: "\R" is not a valid escape sequence and triggers a warning on
# recent Python versions. The resulting path is unchanged.
persist_directory = r"D:\RAG Task"
collection_name = "shipment_orders"

# Open the persisted collection first and embed the documents only when it is
# still empty. Calling Chroma.from_documents unconditionally re-embeds every
# row and appends duplicate entries to the persisted collection on each run.
# NOTE(review): an already populated collection is not refreshed when the CSV
# changes - delete the persist directory to rebuild it.
vector_store = Chroma(
    persist_directory=persist_directory,
    embedding_function=embeddings,
    collection_name=collection_name,
)
if not vector_store.get(limit=1)["ids"]:
    vector_store = Chroma.from_documents(
        documents=documents,
        embedding=embeddings,
        persist_directory=persist_directory,
        collection_name=collection_name,
    )


# Hybrid retrieval: embedding similarity (top 10) combined with BM25 keyword
# matching (top 5), weighted 0.6 / 0.4.
vector_retriever = vector_store.as_retriever(search_kwargs={"k": 10})
keyword_retriever = BM25Retriever.from_documents(documents)
keyword_retriever.k = 5

hybrid_retriever = EnsembleRetriever(
    retrievers=[vector_retriever, keyword_retriever],
    weights=[0.6, 0.4],
)

class GraphState(TypedDict):
    """State dictionary passed between the graph nodes."""
    # NOTE(review): duckdb_node and sql_validator_node also write "sql_query"
    # and "validation_error", which are not declared in this schema.
    # The user's natural-language question; supplied to app.invoke().
    question: str
    # Label stored by intent_node; the conditional edge on "intent" routes on it.
    intent: str
    # NOTE(review): only ever initialised to [] in hybrid_node's temporary
    # states and never read by any node - confirm it is still needed.
    context: List[str]
    # Reply text stored by the handler nodes and printed by the chat loop.
    answer: str


def intent_node(state: dict):
    """Classify the user's question and store the routing label in the state.

    The shared ``llm`` is asked to answer with one of ``numeric``,
    ``semantic``, ``hybrid``, ``greet`` or ``ignore``. The reply is
    normalized before it is stored, because the graph routes on the exact
    label: a reply such as ``"Numeric".`` or ``Intent: numeric`` would
    otherwise not match any conditional edge.

    Args:
        state: Graph state; must contain ``"question"``.

    Returns:
        The same ``state`` dict with ``"intent"`` set to one of the five
        labels (``"ignore"`` when the reply contains none of them).
    """
    question = state["question"]

    intent_prompt = f"""
    You are an intent classifier for user questions over an **Orders dataset**.
    The table contains the following columns:
    Order ID, Client Name, Email, Contact Number, Origin, Destination,
    Product Name, Category, Material, Color, Quantity, Unit Price (‚Çπ),
    Total Price (‚Çπ), Order Date, Delivery Date, Status.

    Classify the intent of the question as one of the following:

    1. "numeric" ‚Üí if the query involves structured, measurable, or count-based data.
       Examples:
       - "How many orders are pending?"
       - "What is the total sales amount?"
       - "Show the average unit price."
       - "Count the number of clients."
       - "List orders where quantity > 10.
       - "Who made the highest purchase?"
       - "names of customers who ordered bed "

    2. "semantic" ‚Üí if the query involves descriptive or text-based attributes
       such true semantic questions, i.e., ones that are descriptive, interpretive, or text-based, not solvable with SQL filters or numbers.
T       These rely on understanding meaning, patterns, or unstructured context rather than column values.
       Examples:
      - Which customers look like regular buyers of furniture?
      - Which products are most suitable for modern homes?
      - What type of products are popular in Port Mariamouth?
      
    3. "hybrid" ‚Üí if the query mixes both numeric and descriptive components.
       Examples:
       - What is the total count of clients who bought curtains and Which destination cities frequently receive d√©cor orders?? 

    4. "greet" ‚Üí greetings or conversational openers.
       Examples:
       - "Hi", "Hello", "Good morning", "Hey there"

    5. "ignore" ‚Üí unrelated or irrelevant to order data.
       Examples:
       - "Tell me a joke", "What's the time?", "Who is the CEO?"

    Question: {question}

    Return only one word:
    numeric, semantic, hybrid, greet, or ignore.
    """

    raw_intent = llm.invoke(intent_prompt).content.strip().lower()

    # Pick the first known label out of the reply so quotes, punctuation or
    # extra words do not break routing; unknown replies fall back to "ignore",
    # which answers with a polite out-of-scope message instead of crashing.
    label_match = re.search(r"\b(numeric|semantic|hybrid|greet|ignore)\b", raw_intent)
    intent = label_match.group(1) if label_match else "ignore"

    print(f"üéØ Detected Intent: {intent}")
    state["intent"] = intent
    return state


def greet_node(state: dict):
    """Store the fixed greeting reply under ``"answer"`` and return the state."""
    greeting = "Hello üëã! How can I assist you with the order data today?"
    state["answer"] = greeting
    return state


def ignore_node(state: dict):
    """Store the fixed out-of-scope reply under ``"answer"`` and return the state."""
    out_of_scope_reply = (
        "I'm designed to answer questions about the order dataset. "
        "Please ask something related."
    )
    state["answer"] = out_of_scope_reply
    return state



# Shared chat model used by the nodes through llm.invoke(). intent_node is
# defined above this line but only looks the name up when it is called.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

def retriever_node(state: dict):
    """Answer the question from retrieved row chunks.

    Fetches chunks for ``state["question"]`` through ``hybrid_retriever``,
    joins their text into one context block and asks ``llm`` for a brief
    answer. Any exception is reported in ``state["answer"]`` instead of
    being raised.

    Args:
        state: Graph state; must contain ``"question"``.

    Returns:
        The same ``state`` dict with ``"answer"`` set.
    """
    user_question = state["question"]
    try:
        hits = hybrid_retriever.invoke(user_question)
        context_block = "\n".join(hit.page_content for hit in hits)
        llm_prompt = (
            f"Context:\n{context_block}\n\n"
            f"Question: {user_question}\nAnswer briefly:"
        )
        state["answer"] = llm.invoke(llm_prompt).content.strip()
    except Exception as err:
        state["answer"] = f"Error using retriever: {err}"
    return state


# Column names that generated SQL may reference in double quotes;
# sql_validator_node rejects any quoted identifier that is not listed here.
# NOTE(review): the currency symbol in the two price columns looks
# mis-decoded (a rupee sign, U+20B9, is presumably intended) - confirm these
# names match the CSV header exactly.
VALID_COLUMNS = [
    "Order ID", "Client Name", "Email", "Contact Number",
    "Origin", "Destination", "Product Name", "Category",
    "Material", "Color", "Quantity", "Unit Price (‚Çπ)",
    "Total Price (‚Çπ)", "Order Date", "Delivery Date", "Status"
]
from sqlglot import parse_one
import re

def sql_validator_node(state: dict):
    """Check that the generated SQL is a single, read-only SELECT on known columns.

    Reads ``state["sql_query"]`` and stores the outcome in
    ``state["validation_error"]``: ``None`` when every check passes,
    otherwise a user-facing message describing the first failed check.

    Checks, in order:
      1. the statement starts with ``SELECT``;
      2. it is a single statement (only trailing semicolons are tolerated);
      3. no data-modifying keyword appears as a whole word outside string
         literals and quoted identifiers;
      4. every double-quoted identifier is a known column, the ``orders``
         table, or an alias introduced with ``AS "..."``.

    Args:
        state: Mutable state dict; ``"sql_query"`` defaults to ``""``.

    Returns:
        The same ``state`` dict with ``"validation_error"`` set.
    """
    sql_query = state.get("sql_query", "").strip()
    print(f"üß© Validating SQL query: {sql_query}")

    # 1. Ensure it's a SELECT query
    if not sql_query.lower().startswith("select"):
        state["validation_error"] = "‚ùå Only SELECT queries are allowed."
        return state

    # Blank out string literals (and, for the keyword scan, quoted
    # identifiers) so that data values such as 'Updated' or an alias such as
    # "Created On" are not mistaken for SQL keywords or statement separators.
    without_literals = re.sub(r"'(?:[^']|'')*'", "''", sql_query)
    bare_sql = re.sub(r'"[^"]*"', '""', without_literals).lower()

    # 2. Reject stacked statements: DuckDB would execute every statement in
    #    the string, and the keyword list below cannot name them all.
    if ";" in bare_sql.rstrip("; \t\r\n"):
        state["validation_error"] = "Only a single SQL statement is allowed."
        return state

    # 3. Block dangerous operations (whole words only)
    forbidden_keywords = ["insert", "update", "delete", "drop", "alter", "truncate", "create"]
    if any(re.search(rf"\b{kw}\b", bare_sql) for kw in forbidden_keywords):
        state["validation_error"] = (
            f"‚ùå Unsafe SQL operation detected. "
            f"Keywords like {', '.join(forbidden_keywords)} are not allowed."
        )
        return state

    # 4. Validate quoted identifiers (prevent hallucinated fields). Aliases
    #    declared with AS "..." and the table name itself are legitimate.
    aliases = set(
        re.findall(r'\bAS\s+"([^"]*)"', without_literals, flags=re.IGNORECASE)
    )
    for identifier in re.findall(r'"([^"]*)"', without_literals):
        if identifier in aliases or identifier == "orders":
            continue
        if identifier not in VALID_COLUMNS:
            state["validation_error"] = f"‚ùå Invalid column name used: '{identifier}'."
            return state

    # 5. Passed all checks
    state["validation_error"] = None
    print("‚úÖ SQL validation passed.")
    return state

def duckdb_node(state: dict):
    """Answer a structured question by generating, validating and running SQL.

    Steps: ask ``llm`` for a query, strip markdown residue from the reply,
    syntax-check it with SQLGlot, run ``sql_validator_node``, execute it on
    the module-level DuckDB connection ``con`` and finally ask ``llm`` to
    summarise the result table. Every failure is reported through
    ``state["answer"]`` rather than raised.

    Args:
        state: Graph state; must contain ``"question"``.

    Returns:
        The same ``state`` dict with ``"answer"`` set (and ``"sql_query"``
        once a query has been generated).
    """
    query = state["question"]

    try:
        # Step 1: Ask LLM to generate SQL
        sql_prompt = f"{sql_system_prompt}\nUser question: {query}\nSQL:"
        sql_query = llm.invoke(sql_prompt).content.strip()

        # Step 2: Clean LLM formatting (code fences, stray backticks, label)
        sql_query = (
            sql_query.replace("```sql", "")
                     .replace("```", "")
                     .replace("`", "")
                     .replace("SQL:", "")
                     .strip()
        )

        print(f"\nüß† Generated SQL query:\n{sql_query}")
        state["sql_query"] = sql_query

        # Step 3: Syntax validation using SQLGlot, parsed with the DuckDB
        # dialect because that is the engine the query will run on.
        try:
            parse_one(sql_query, read="duckdb")
            print("‚úÖ SQLGlot syntax check passed.")
        except Exception as parse_err:
            state["answer"] = f"‚ö†Ô∏è SQL syntax error detected: {parse_err}"
            return state

        # Step 4: Custom SQL safety validation
        validation_state = sql_validator_node(state)
        if validation_state.get("validation_error"):
            state["answer"] = validation_state["validation_error"]
            return state

        # Step 5: Execute on the module-level connection. The `orders` table
        # is created on the in-memory connection `con` at start-up; this
        # script never defines `db_path`, so opening a new connection from it
        # failed for every query.
        try:
            result_df = con.execute(sql_query).fetchdf()
        except Exception as exec_err:
            state["answer"] = f"‚ö†Ô∏è SQL execution failed: {exec_err}"
            return state

        if result_df.empty:
            state["answer"] = "No matching records found."
            return state

        # Step 6: Convert results to plain text
        result_text = result_df.to_string(index=False)

        # Step 7: Generate human-readable summary
        summary_prompt = f"""
        The user asked: {query}
        The SQL result is:
        {result_text}

        Write a natural, clear explanation of these results.
        Avoid skipping rows or making assumptions.
        """
        answer = llm.invoke(summary_prompt).content.strip()
        state["answer"] = answer

    except Exception as e:
        state["answer"] = f"Error executing SQL: {str(e)}"

    return state

def hybrid_node(state: dict):
    """Answer a question that mixes a numeric and a descriptive part.

    Workflow:
      1. ``llm`` splits the question into a ``numeric`` and a ``semantic``
         sub-question and flags whether they are ``dependent``.
      2. The numeric part is answered by ``duckdb_node``.
      3. The semantic part is answered by ``retriever_node`` - for
         independent queries only. In a dependent query the entity is part
         of the SQL result, and the retriever cannot resolve a reference
         such as "that purchase".
      4. For independent queries with both answers, ``llm`` checks them for
         consistency and regenerates the semantic answer if they disagree.
      5. ``llm`` merges both answers into the final reply.

    Any exception is reported through ``state["answer"]`` instead of raised.

    Args:
        state: Graph state; must contain ``"question"``.

    Returns:
        The same ``state`` dict with ``"answer"`` set.
    """
    import json

    try:
        question = state["question"]
        print(f"\nüîÄ [Hybrid Node] Received question ‚Üí {question}")

        # 1. Split prompt - includes automatic Client Name addition for dependent queries
        split_prompt = f"""
            You are a **query-splitting assistant** for a hybrid SQL-semantic system.

            Your job is to analyze a user's natural-language question and split it into:
            - "numeric": SQL-based reasoning (e.g., count, sum, max, total, etc.)
            - "semantic": descriptive or entity-based reasoning (names, categories, or context)
            Also detect whether the two parts are **dependent** (the semantic part depends on numeric output)
            or **independent** (can be answered separately).

            ---
            ‚öôÔ∏è SQL Integration Rules:
            - If the query is **dependent**, the SQL generator must combine both numeric & descriptive columns
              such as `"Client Name"` + `"Total Price (‚Çπ)"` (or `"Product Name"` + `"Total Price (‚Çπ)"`) in a single query.
            - For dependent cases, the "numeric" sub-question should still focus on measurable value (e.g., ‚Äúhighest total‚Äù),
              while the "semantic" part should express what entity that value belongs to (e.g., ‚Äúwho made that purchase‚Äù).
            - For independent cases, both sub-questions can be treated separately.

            ---
            ‚úÖ Examples

            **Example 1 ‚Äî Dependent**
            User: "Who made the highest purchase and how much was it?"
            Output:
            {{
                "numeric": "Find the highest purchase amount and the corresponding client name.",
                "semantic": "Who made that purchase and what was the total price?",
                "dependent": true
            }}

            **Example 2 ‚Äî Dependent**
            User: "Which product had the highest total sales and how much revenue did it generate?"
            Output:
            {{
                "numeric": "Find the product with the highest total sales and its total revenue.",
                "semantic": "Which product achieved that and what was the total revenue?",
                "dependent": true
            }}

            **Example 3 ‚Äî Independent**
            User: "How many orders were placed, and who are the top 5 customers?"
            Output:
            {{
                "numeric": "How many orders were placed?",
                "semantic": "Who are the top 5 customers?",
                "dependent": false
            }}

            **Example 4 ‚Äî Independent**
            User: "What is the total revenue and list all product categories?"
            Output:
            {{
                "numeric": "What is the total revenue?",
                "semantic": "List all product categories.",
                "dependent": false
            }}

            ---
            Now process the following user query and return **valid JSON only** (no text outside JSON):

            Question: {question}
        """

        split_result = llm.invoke(split_prompt).content.strip()
        print("üß© Raw Split Result:", split_result)

        try:
            split_result = split_result.replace("```json", "").replace("```", "").strip()
            parsed = json.loads(split_result)
            numeric_part = parsed.get("numeric", "").strip()
            semantic_part = parsed.get("semantic", "").strip()
            dependent_flag = parsed.get("dependent", False)
            # A model may return the flag as the string "false", which is
            # truthy in Python; compare strings explicitly.
            if isinstance(dependent_flag, str):
                dependent = dependent_flag.strip().lower() == "true"
            else:
                dependent = bool(dependent_flag)
        except json.JSONDecodeError:
            numeric_part, semantic_part, dependent = "", "", False
            print("‚ö†Ô∏è Invalid JSON in split, skipping.")

        print(f"‚úÖ Parsed numeric part: {numeric_part}")
        print(f"‚úÖ Parsed semantic part: {semantic_part}")
        print(f"üîó Dependency detected: {dependent}")

        # 2. Get numeric answer from DuckDB
        numeric_answer = ""
        if numeric_part:
            temp_state = {"question": numeric_part, "intent": "", "context": [], "answer": ""}
            numeric_state = duckdb_node(temp_state)
            numeric_answer = numeric_state.get("answer", "")
            print("\nüßÆ NUMERIC RESULT (from SQL):")
            print(numeric_answer)
        else:
            print("‚ö†Ô∏è No numeric part found.")

        # 3. Get semantic answer (only for independent queries)
        semantic_answer = ""
        if semantic_part and not dependent:
            temp_state = {"question": semantic_part, "intent": "", "context": [], "answer": ""}
            semantic_state = retriever_node(temp_state)
            semantic_answer = semantic_state.get("answer", "")
            print("\nüí¨ SEMANTIC RESULT (from Retriever):")
            print(semantic_answer)
        elif semantic_part:
            print("Dependent query: semantic part is covered by the SQL result, skipping retriever.")
        else:
            print("‚ö†Ô∏è No semantic part found.")

        # 4. Validation only for independent queries that produced both answers
        if dependent:
            print("‚úÖ Dependent query detected ‚Äî skipping consistency validation.")
        elif numeric_answer and semantic_answer:
            validation_prompt = f"""
            The following query is INDEPENDENT.
            Check if the semantic result is factually consistent with the numeric SQL result.
            Focus on consistency between numbers, names, or facts.

            Numeric result:
            {numeric_answer}

            Semantic result:
            {semantic_answer}

            Reply only in JSON:
            {{
                "is_consistent": true/false,
                "issues": "describe discrepancies briefly"
            }}
            """
            validation = llm.invoke(validation_prompt).content.strip()
            print("\nüîç Validation result:", validation)

            try:
                validation_json = json.loads(validation.replace("```json", "").replace("```", "").strip())
                consistent = validation_json.get("is_consistent", True)
            except (json.JSONDecodeError, AttributeError):
                # Unparseable or non-object verdict: keep the answers as they are.
                consistent = True

            # Retry if inconsistent
            if not consistent:
                print("‚ö†Ô∏è Semantic answer inconsistent with SQL ‚Äî retrying with numeric context...")
                retry_prompt = f"""
                The user asked: {question}

                The numeric SQL result says:
                {numeric_answer}

                The previous semantic answer was:
                {semantic_answer}

                Please re-generate a consistent, unified answer that aligns with the numeric facts.
                """
                semantic_answer = llm.invoke(retry_prompt).content.strip()
                print("\n‚ôªÔ∏è RETRIED SEMANTIC ANSWER:")
                print(semantic_answer)
        else:
            print("Skipping consistency validation: numeric or semantic answer is missing.")

        # 5. Combine results
        print("\nüß† Combining numeric & semantic results...")
        combine_prompt = f"""
        The user originally asked: {question}

        Numeric insight (SQL result):
        {numeric_answer or "None"}

        Semantic insight (retrieved/explained result):
        {semantic_answer or "None"}

        Combine both into one clear, natural, factual answer.
        If dependent, describe both name and numeric value together (e.g., ‚ÄúKara Mata made the highest purchase of ‚Çπ4,48,170‚Äù).
        """
        combined_response = llm.invoke(combine_prompt)
        final_answer = getattr(combined_response, "content", str(combined_response)).strip()

        state["answer"] = final_answer

    except Exception as e:
        print(f"‚ùå Error in hybrid_node: {str(e)}")
        state["answer"] = f"Error in hybrid_node: {str(e)}"

    return state


# Assemble the workflow: classify the intent first, then hand the question to
# exactly one handler node, after which the run ends.
graph = StateGraph(GraphState)

for _node_name, _node_fn in (
    ("intent", intent_node),
    ("greet", greet_node),
    ("ignore", ignore_node),
    ("duckdb", duckdb_node),
    ("retriever", retriever_node),
    ("hybrid", hybrid_node),
):
    graph.add_node(_node_name, _node_fn)

graph.set_entry_point("intent")

# The label stored by intent_node selects the handler node.
graph.add_conditional_edges(
    "intent",
    lambda state: state["intent"],
    dict(
        greet="greet",
        ignore="ignore",
        numeric="duckdb",
        semantic="retriever",
        hybrid="hybrid",
    ),
)

# Every handler is terminal.
for _terminal_node in ("greet", "ignore", "duckdb", "retriever", "hybrid"):
    graph.add_edge(_terminal_node, END)

app = graph.compile()

if __name__ == "__main__":
    # Simple console chat loop around the compiled graph.
    print("\nüöÄ Smart Query Assistant ready! Type 'exit' to quit.\n")

    while True:
        try:
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C: leave like an explicit "exit" instead
            # of ending with a traceback.
            print("\nAssistant: Goodbye üëã")
            break

        # Nothing to classify - do not spend LLM calls on a blank line.
        if not user_input:
            continue

        print("You :", user_input)
        if user_input.lower() in ["exit", "quit"]:
            print("Assistant: Goodbye üëã")
            break

        result = app.invoke({"question": user_input})
        print(f"Assistant: {result['answer']}\n")


In [None]:
conda install ipykernel --update-deps --force-reinstall

In [1]:
from langgraph.graph import StateGraph, END
from typing import TypedDict, List
from langchain_community.vectorstores import Chroma
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.documents import Document 
import pandas as pd
import duckdb
import re
from openai import OpenAI
import os
from sqlglot import parse_one
import json

# Step 1: Locations of the source CSV and the on-disk DuckDB database file.
csv_path = r"D:\RAG Task\Client_Shipment_Orders.csv"
db_path = r"D:\RAG Task\orders.duckdb"

# Step 2: Create the `orders` table from the CSV on the first run.
# CREATE TABLE IF NOT EXISTS leaves an existing table untouched, so later
# edits to the CSV are not picked up unless the table is refreshed.
with duckdb.connect(db_path) as con:
    # csv_path is interpolated straight into the SQL text; acceptable for this
    # hard-coded constant, but never do this with a user-supplied path.
    con.execute(f"""
        CREATE TABLE IF NOT EXISTS orders AS
        SELECT * FROM read_csv_auto('{csv_path}');
    """)
    # Optional: refresh the table after the CSV has been updated.
    # con.execute(f"DELETE FROM orders; INSERT INTO orders SELECT * FROM read_csv_auto('{csv_path}');")

# Step 3: Load the whole table into a DataFrame for local use.
with duckdb.connect(db_path) as con:
    df = con.execute("SELECT * FROM orders").fetchdf()

# Step 4: Trim and title-case every text column of the local DataFrame.
# This changes only `df`; the DuckDB table keeps the original values.
# NOTE(review): `df` is reassigned further down by
# `df, labelled_chunks = generate_labelled_chunks(csv_path)`, so this
# normalized frame is discarded before it is used.
for col in df.select_dtypes(include=["object"]).columns:
    df[col] = df[col].astype(str).str.strip().str.title()

# Step 5: Directory in which Chroma persists the embeddings.
persist_directory = r"D:\RAG Task"

# System prompt for SQL generation; duckdb_node appends the user question and
# a trailing "SQL:" marker. The example code fence at the end is closed so the
# appended question is not read as part of the SQL example.
sql_system_prompt = """
You are a SQL expert helping to query a DuckDB table named `orders`.

----------------------------------
TABLE INFORMATION
----------------------------------
Table name: orders  
Columns and their meanings:
- Order ID: Unique identifier for each order (text)
- Client Name: Name of the customer who placed the order (text)
- Email: Email address of the client (text)
- Contact Number: Client's contact phone number (text)
- Origin: Source location of the shipment (text)
- Destination: Delivery location of the shipment (text)
- Product Name: Name of the purchased product (text)
- Category: Product category (e.g., Furniture, Decor, Appliances)
- Material: Material type of the product (e.g., Wood, Glass, Metal)
- Color: Color of the product (text)
- Quantity: Number of units ordered (integer)
- Unit Price (‚Çπ): Price per unit in INR (numeric)
- Total Price (‚Çπ): Total order price in INR (numeric)
- Order Date: Date when the order was placed (date)
- Delivery Date: Date when the order was delivered (date)
- Status: Order status (e.g., Delivered, Pending, Cancelled)

----------------------------------
SAMPLE DATA
----------------------------------
ORD0001 | Kara Mata | chelsea75@yahoo.com | 038.830.3017x8206 | Port Mariamouth | Cohenmouth | Wall Art | Decor | Glass | Grey | 15 | 29878 | 448170 | 2025-05-13 | 2025-06-02 | Cancelled  
ORD0002 | Jesse Williams | ccasey@barrett.info | (426)505-2355 | Tamaraview | Lake Rickyport | Bed | Furniture | Glass | Brown | 30 | 1507 | 45210 | 2025-10-04 | 2025-11-03 | Cancelled  

----------------------------------
INSTRUCTIONS
----------------------------------
1. Generate SQL queries **only** for structured or numeric filters.
   Examples:
   - Total sales, sum, count, average, quantity, or price-based questions  
   - Filtering by columns such as Status, Category, Material, or Color  
   - Date-based filters (e.g., orders after 2025-05-01)

2. **Do NOT** generate queries based on subjective or descriptive logic
   such as reasons for cancellation, customer feedback, or preferences.
   These are handled separately by a semantic retriever system.

3. Use the correct table name `orders` and column names **exactly as shown**.
   Preserve proper case and special characters (e.g., `"Total Price (‚Çπ)"`).

4. Never hallucinate columns, tables, or calculations that do not exist.

5. Return **only** the SQL query ‚Äî no markdown, comments, or explanations.

6. **SAFETY RULES ‚Äî STRICTLY ENFORCED**
   - Never modify or delete data.
   - Do not use or suggest `UPDATE`, `DELETE`, `INSERT`, `DROP`, `TRUNCATE`, or `ALTER`.
   - Do not create or alter schemas, indexes, or tables.
   - Only allow read-only operations:  
     `SELECT`, `WHERE`, `GROUP BY`, `ORDER BY`, `LIMIT`, and aggregate functions (`COUNT`, `SUM`, `AVG`, `MIN`, `MAX`).

7. **Case Handling:**  
   When matching text values (like product or status), use `LOWER()` to make comparisons case-insensitive.  
   Example:  
   `WHERE LOWER("Product Name") = LOWER('Toilet Bowl')`

8. **Special Handling ‚Äî Highest or Maximum Queries:**  
   If the user asks questions like  
   *"Who made the highest purchase?"*,  
   *"Which client has the largest total?"*, or  
   *"Top buyer / maximum purchase amount"*,  
   use this pattern to avoid grouping errors:
   ```sql
   SELECT "Client Name", "Total Price (‚Çπ)"
   FROM orders
   WHERE "Total Price (‚Çπ)" = (
       SELECT MAX("Total Price (‚Çπ)") FROM orders
   );
   ```
"""

#  ROW-WISE LABELLED CHUNK GENERATION
def generate_labelled_chunks(csv_path):
    """Read a CSV and render every row as a labelled text chunk.

    Each chunk starts with a ``Row ID: <index>`` line followed by one
    ``<column>: <value>`` line per column; surrounding whitespace is
    stripped from the finished chunk.

    Args:
        csv_path: Path of the CSV file handed to ``pd.read_csv``.

    Returns:
        A ``(frame, chunks)`` tuple: the loaded DataFrame and the list of
        chunk strings, one per row, in row order.
    """
    frame = pd.read_csv(csv_path)
    column_names = list(frame.columns)

    def render(row_id, record):
        # Header line first, then one "column: value" line per column.
        lines = [f"Row ID: {row_id}"]
        lines.extend(f"{name}: {record[name]}" for name in column_names)
        return "\n".join(lines).strip()

    chunks = [render(row_id, record) for row_id, record in frame.iterrows()]
    return frame, chunks


df, labelled_chunks = generate_labelled_chunks(csv_path)

# One Document per row chunk; shared by the vector store and the BM25 index.
documents = [Document(page_content=chunk) for chunk in labelled_chunks]

print(f"‚úÖ Generated {len(labelled_chunks)} labelled chunks for embeddings.")

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
collection_name = "shipment_orders"

# Decide between "load" and "create" by asking the collection itself.
# Chroma's on-disk layout is not keyed by collection name, so testing for a
# "<persist_directory>/<collection_name>" folder never finds an existing
# store and the documents were re-embedded and appended on every run.
# NOTE(review): an already populated collection is not refreshed when the CSV
# changes - delete the persist directory to rebuild it.
vector_store = Chroma(
    persist_directory=persist_directory,
    embedding_function=embeddings,
    collection_name=collection_name,
)
if vector_store.get(limit=1)["ids"]:
    print("‚úÖ Loading existing vector store...")
else:
    print("‚úÖ Creating new vector store...")
    vector_store = Chroma.from_documents(
        documents=documents,
        embedding=embeddings,
        persist_directory=persist_directory,
        collection_name=collection_name,
    )

# Hybrid retrieval: embedding similarity (top 10) combined with BM25 keyword
# matching (top 5), weighted 0.6 / 0.4.
vector_retriever = vector_store.as_retriever(search_kwargs={"k": 10})
keyword_retriever = BM25Retriever.from_documents(documents)
keyword_retriever.k = 5

hybrid_retriever = EnsembleRetriever(
    retrievers=[vector_retriever, keyword_retriever],
    weights=[0.6, 0.4]
)

class GraphState(TypedDict):
    """State dictionary passed between the graph nodes."""
    # NOTE(review): duckdb_node and sql_validator_node also write "sql_query"
    # and "validation_error", which are not declared in this schema.
    # The user's natural-language question; read by the nodes.
    question: str
    # Label stored by intent_node.
    intent: str
    # NOTE(review): not read by any node visible in this part of the file -
    # confirm it is still needed.
    context: List[str]
    # Reply text stored by the handler nodes.
    answer: str


def intent_node(state: dict):
    """Classify the user's question and store the routing label in the state.

    The shared ``llm`` is asked to answer with one of ``numeric``,
    ``semantic``, ``hybrid``, ``greet`` or ``ignore``. The reply is
    normalized before it is stored, because a graph that routes on the exact
    label cannot match a reply such as ``"Numeric".`` or ``Intent: numeric``.

    Args:
        state: Graph state; must contain ``"question"``.

    Returns:
        The same ``state`` dict with ``"intent"`` set to one of the five
        labels (``"ignore"`` when the reply contains none of them).
    """
    question = state["question"]

    intent_prompt = f"""
    You are an intent classifier for user questions over an **Orders dataset**.
    The table contains the following columns:
    Order ID, Client Name, Email, Contact Number, Origin, Destination,
    Product Name, Category, Material, Color, Quantity, Unit Price (‚Çπ),
    Total Price (‚Çπ), Order Date, Delivery Date, Status.

    Classify the intent of the question as one of the following:

    1. "numeric" ‚Üí if the query involves structured, measurable, or count-based data.
       Examples:
       - "How many orders are pending?"
       - "What is the total sales amount?"
       - "Show the average unit price."
       - "Count the number of clients."
       - "List orders where quantity > 10.
       - "Who made the highest purchase?"
       - "names of customers who ordered bed "

    2. "semantic" ‚Üí if the query involves descriptive or text-based attributes
       such true semantic questions, i.e., ones that are descriptive, interpretive, or text-based, not solvable with SQL filters or numbers.
T       These rely on understanding meaning, patterns, or unstructured context rather than column values.
       Examples:
      - Which customers look like regular buyers of furniture?
      - Which products are most suitable for modern homes?
      - What type of products are popular in Port Mariamouth?
      
    3. "hybrid" ‚Üí if the query mixes both numeric and descriptive components.
       Examples:
       - What is the total count of clients who bought curtains and Which destination cities frequently receive d√©cor orders?? 

    4. "greet" ‚Üí greetings or conversational openers.
       Examples:
       - "Hi", "Hello", "Good morning", "Hey there"

    5. "ignore" ‚Üí unrelated or irrelevant to order data.
       Examples:
       - "Tell me a joke", "What's the time?", "Who is the CEO?"

    Question: {question}

    Return only one word:
    numeric, semantic, hybrid, greet, or ignore.
    """

    raw_intent = llm.invoke(intent_prompt).content.strip().lower()

    # Pick the first known label out of the reply so quotes, punctuation or
    # extra words do not break routing; unknown replies fall back to "ignore",
    # which answers with a polite out-of-scope message instead of crashing.
    label_match = re.search(r"\b(numeric|semantic|hybrid|greet|ignore)\b", raw_intent)
    intent = label_match.group(1) if label_match else "ignore"

    print(f"üéØ Detected Intent: {intent}")
    state["intent"] = intent
    return state


def greet_node(state: dict):
    """Store the fixed greeting reply under ``"answer"`` and return the state."""
    greeting = "Hello üëã! How can I assist you with the order data today?"
    state["answer"] = greeting
    return state


def ignore_node(state: dict):
    """Store the fixed out-of-scope reply under ``"answer"`` and return the state."""
    out_of_scope_reply = (
        "I'm designed to answer questions about the order dataset. "
        "Please ask something related."
    )
    state["answer"] = out_of_scope_reply
    return state


# Shared chat model used by the nodes through llm.invoke(). intent_node is
# defined above this line but only looks the name up when it is called.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

def retriever_node(state: dict):
    """Answer the question from retrieved row chunks.

    Fetches chunks for ``state["question"]`` through ``hybrid_retriever``,
    joins their text into one context block and asks ``llm`` for a brief
    answer. Any exception is reported in ``state["answer"]`` instead of
    being raised.

    Args:
        state: Graph state; must contain ``"question"``.

    Returns:
        The same ``state`` dict with ``"answer"`` set.
    """
    user_question = state["question"]
    try:
        hits = hybrid_retriever.invoke(user_question)
        context_block = "\n".join(hit.page_content for hit in hits)
        llm_prompt = (
            f"Context:\n{context_block}\n\n"
            f"Question: {user_question}\nAnswer briefly:"
        )
        state["answer"] = llm.invoke(llm_prompt).content.strip()
    except Exception as err:
        state["answer"] = f"Error using retriever: {err}"
    return state


# Column names that generated SQL may reference in double quotes;
# sql_validator_node rejects any quoted identifier that is not listed here.
# NOTE(review): the currency symbol in the two price columns looks
# mis-decoded (a rupee sign, U+20B9, is presumably intended) - confirm these
# names match the CSV header exactly.
VALID_COLUMNS = [
    "Order ID", "Client Name", "Email", "Contact Number",
    "Origin", "Destination", "Product Name", "Category",
    "Material", "Color", "Quantity", "Unit Price (‚Çπ)",
    "Total Price (‚Çπ)", "Order Date", "Delivery Date", "Status"
]

def sql_validator_node(state: dict):
    """Check that the generated SQL is a single, read-only SELECT on known columns.

    Reads ``state["sql_query"]`` and stores the outcome in
    ``state["validation_error"]``: ``None`` when every check passes,
    otherwise a user-facing message describing the first failed check.

    Checks, in order:
      1. the statement starts with ``SELECT``;
      2. it is a single statement (only trailing semicolons are tolerated);
      3. no data-modifying keyword appears as a whole word outside string
         literals and quoted identifiers;
      4. every double-quoted identifier is a known column, the ``orders``
         table, or an alias introduced with ``AS "..."``.

    Args:
        state: Mutable state dict; ``"sql_query"`` defaults to ``""``.

    Returns:
        The same ``state`` dict with ``"validation_error"`` set.
    """
    sql_query = state.get("sql_query", "").strip()
    print(f"üß© Validating SQL query: {sql_query}")

    # 1. Ensure it's a SELECT query
    if not sql_query.lower().startswith("select"):
        state["validation_error"] = "‚ùå Only SELECT queries are allowed."
        return state

    # Blank out string literals (and, for the keyword scan, quoted
    # identifiers) so that data values such as 'Updated' or an alias such as
    # "Created On" are not mistaken for SQL keywords or statement separators.
    without_literals = re.sub(r"'(?:[^']|'')*'", "''", sql_query)
    bare_sql = re.sub(r'"[^"]*"', '""', without_literals).lower()

    # 2. Reject stacked statements: DuckDB would execute every statement in
    #    the string, and the keyword list below cannot name them all.
    if ";" in bare_sql.rstrip("; \t\r\n"):
        state["validation_error"] = "Only a single SQL statement is allowed."
        return state

    # 3. Block dangerous operations (whole words only)
    forbidden_keywords = ["insert", "update", "delete", "drop", "alter", "truncate", "create"]
    if any(re.search(rf"\b{kw}\b", bare_sql) for kw in forbidden_keywords):
        state["validation_error"] = (
            f"‚ùå Unsafe SQL operation detected. "
            f"Keywords like {', '.join(forbidden_keywords)} are not allowed."
        )
        return state

    # 4. Validate quoted identifiers (prevent hallucinated fields). Aliases
    #    declared with AS "..." and the table name itself are legitimate.
    aliases = set(
        re.findall(r'\bAS\s+"([^"]*)"', without_literals, flags=re.IGNORECASE)
    )
    for identifier in re.findall(r'"([^"]*)"', without_literals):
        if identifier in aliases or identifier == "orders":
            continue
        if identifier not in VALID_COLUMNS:
            state["validation_error"] = f"‚ùå Invalid column name used: '{identifier}'."
            return state

    # 5. Passed all checks
    state["validation_error"] = None
    print("‚úÖ SQL validation passed.")
    return state

def duckdb_node(state: dict):
    """Answer a structured question by generating, validating and running SQL.

    Steps: ask ``llm`` for a query, strip markdown residue from the reply,
    syntax-check it with SQLGlot, run ``sql_validator_node``, execute it on
    a read-only connection to ``db_path`` and finally ask ``llm`` to
    summarise the result table. Every failure is reported through
    ``state["answer"]`` rather than raised.

    Args:
        state: Graph state; must contain ``"question"``.

    Returns:
        The same ``state`` dict with ``"answer"`` set (and ``"sql_query"``
        once a query has been generated).
    """
    query = state["question"]

    try:
        # Step 1: Ask LLM to generate SQL
        sql_prompt = f"{sql_system_prompt}\nUser question: {query}\nSQL:"
        sql_query = llm.invoke(sql_prompt).content.strip()

        # Step 2: Clean LLM formatting (code fences, stray backticks, label)
        sql_query = (
            sql_query.replace("```sql", "")
                     .replace("```", "")
                     .replace("`", "")
                     .replace("SQL:", "")
                     .strip()
        )

        print(f"\nüß† Generated SQL query:\n{sql_query}")
        state["sql_query"] = sql_query

        # Step 3: Syntax validation using SQLGlot, parsed with the DuckDB
        # dialect because that is the engine the query will run on.
        try:
            parse_one(sql_query, read="duckdb")
            print("‚úÖ SQLGlot syntax check passed.")
        except Exception as parse_err:
            state["answer"] = f"‚ö†Ô∏è SQL syntax error detected: {parse_err}"
            return state

        # Step 4: Custom SQL safety validation
        validation_state = sql_validator_node(state)
        if validation_state.get("validation_error"):
            state["answer"] = validation_state["validation_error"]
            return state

        # Step 5: Execute on a short-lived connection. read_only=True makes
        # the database itself refuse writes, so the "SELECT only" rule does
        # not rest on text checks alone. The config only sets the spill
        # directory; it does not impose a query timeout.
        try:
            with duckdb.connect(
                db_path,
                read_only=True,
                config={'temp_directory': r'D:\RAG Task\temp'},
            ) as con:
                result_df = con.execute(sql_query).fetchdf()

        except Exception as exec_err:
            state["answer"] = f"‚ö†Ô∏è SQL execution failed: {exec_err}"
            return state

        if result_df.empty:
            state["answer"] = "No matching records found."
            return state

        # Step 6: Convert results to plain text
        result_text = result_df.to_string(index=False)

        # Step 7: Generate human-readable summary
        summary_prompt = f"""
        The user asked: {query}
        The SQL result is:
        {result_text}

        Write a natural, clear explanation of these results.
        Avoid skipping rows or making assumptions.
        """
        answer = llm.invoke(summary_prompt).content.strip()
        state["answer"] = answer

    except Exception as e:
        state["answer"] = f"Error executing SQL: {str(e)}"

    return state

def _strip_json_fences(text: str) -> str:
    """Remove the Markdown code fences LLMs often wrap around JSON replies."""
    return text.replace("```json", "").replace("```", "").strip()


def _as_text(value) -> str:
    """Return *value* stripped if it is a string, otherwise an empty string."""
    return value.strip() if isinstance(value, str) else ""


def _as_bool(value, default: bool) -> bool:
    """Interpret a decoded JSON value as a boolean.

    Real booleans pass through. The strings "true"/"false" (any case) are
    accepted because LLMs sometimes quote them, and a non-empty string such as
    "false" would otherwise be truthy. Anything else yields *default*.
    """
    if isinstance(value, bool):
        return value
    if isinstance(value, str):
        lowered = value.strip().lower()
        if lowered == "true":
            return True
        if lowered == "false":
            return False
    return default


def hybrid_node(state: dict):
    """Answer a hybrid (numeric + semantic) question and store it in ``state["answer"]``.

    Steps:
      1. Ask the LLM to split ``state["question"]`` into a "numeric" and a
         "semantic" sub-question and to flag whether they are dependent.
      2. Run the numeric sub-question through ``duckdb_node``.
      3. Run the semantic sub-question through ``retriever_node``.
      4. For independent queries with both answers present, ask the LLM whether
         the two answers are consistent; if not, regenerate the semantic answer
         with the numeric result as context.
      5. Ask the LLM to merge both answers into one final response.

    Args:
        state: Graph state dict; must contain the key ``"question"``.

    Returns:
        The same ``state`` dict with ``"answer"`` set to the combined answer,
        or to an error message if any step raised.
    """
    try:
        question = state["question"]
        print(f"\nüîÄ [Hybrid Node] Received question ‚Üí {question}")

        # Step 1: split the question into numeric / semantic parts.
        split_prompt = f"""
            You are a **query-splitting assistant** for a hybrid SQL-semantic system.

            Your job is to analyze a user's natural-language question and split it into:
            - "numeric": SQL-based reasoning (e.g., count, sum, max, total, etc.)
            - "semantic": descriptive or entity-based reasoning (names, categories, or context)
            Also detect whether the two parts are **dependent** (the semantic part depends on numeric output)
            or **independent** (can be answered separately).

            ---
            ‚öôÔ∏è SQL Integration Rules:
            - If the query is **dependent**, the SQL generator must combine both numeric & descriptive columns
              such as `"Client Name"` + `"Total Price (‚Çπ)"` (or `"Product Name"` + `"Total Price (‚Çπ)"`) in a single query.
            - For dependent cases, the "numeric" sub-question should still focus on measurable value (e.g., "highest total"),
              while the "semantic" part should express what entity that value belongs to (e.g., "who made that purchase").
            - For independent cases, both sub-questions can be treated separately.

            ---
            ‚úÖ Examples

            **Example 1 ‚Äî Dependent**
            User: "Who made the highest purchase and how much was it?"
            Output:
            {{
                "numeric": "Find the highest purchase amount and the corresponding client name.",
                "semantic": "Who made that purchase and what was the total price?",
                "dependent": true
            }}

            **Example 2 ‚Äî Dependent**
            User: "Which product had the highest total sales and how much revenue did it generate?"
            Output:
            {{
                "numeric": "Find the product with the highest total sales and its total revenue.",
                "semantic": "Which product achieved that and what was the total revenue?",
                "dependent": true
            }}

            **Example 3 ‚Äî Independent**
            User: "How many orders were placed, and who are the top 5 customers?"
            Output:
            {{
                "numeric": "How many orders were placed?",
                "semantic": "Who are the top 5 customers?",
                "dependent": false
            }}

            **Example 4 ‚Äî Independent**
            User: "What is the total revenue and list all product categories?"
            Output:
            {{
                "numeric": "What is the total revenue?",
                "semantic": "List all product categories.",
                "dependent": false
            }}

            ---
            Now process the following user query and return **valid JSON only** (no text outside JSON):

            Question: {question}
        """

        split_result = llm.invoke(split_prompt).content.strip()
        print("üß© Raw Split Result:", split_result)

        try:
            parsed = json.loads(_strip_json_fences(split_result))
        except json.JSONDecodeError:
            parsed = None

        if isinstance(parsed, dict):
            # Tolerate null / non-string fields instead of crashing on .strip().
            numeric_part = _as_text(parsed.get("numeric"))
            semantic_part = _as_text(parsed.get("semantic"))
            dependent = _as_bool(parsed.get("dependent"), False)
        else:
            # Unparseable reply, or valid JSON that is not an object.
            numeric_part, semantic_part, dependent = "", "", False
            print("‚ö†Ô∏è Invalid JSON in split, skipping.")

        print(f"‚úÖ Parsed numeric part: {numeric_part}")
        print(f"‚úÖ Parsed semantic part: {semantic_part}")
        print(f"üîó Dependency detected: {dependent}")

        # Step 2: numeric answer via the SQL node.
        numeric_answer = ""
        if numeric_part:
            temp_state = {"question": numeric_part, "intent": "", "context": [], "answer": ""}
            numeric_state = duckdb_node(temp_state)
            numeric_answer = numeric_state.get("answer", "")
            print("\nüßÆ NUMERIC RESULT (from SQL):")
            print(numeric_answer)
        else:
            print("‚ö†Ô∏è No numeric part found.")

        # Step 3: semantic answer via the retriever node. This runs for dependent
        # and independent queries alike; only the validation below is restricted
        # to independent queries.
        semantic_answer = ""
        if semantic_part:
            temp_state = {"question": semantic_part, "intent": "", "context": [], "answer": ""}
            semantic_state = retriever_node(temp_state)
            semantic_answer = semantic_state.get("answer", "")
            print("\nüí¨ SEMANTIC RESULT (from Retriever):")
            print(semantic_answer)
        else:
            print("‚ö†Ô∏è No semantic part found.")

        # Step 4: consistency validation, only for independent queries that
        # produced both answers.
        if dependent:
            print("‚úÖ Dependent query detected ‚Äî skipping consistency validation.")
        elif not (numeric_answer and semantic_answer):
            print("Skipping consistency validation: numeric or semantic answer is missing.")
        else:
            validation_prompt = f"""
            The following query is INDEPENDENT.
            Check if the semantic result is factually consistent with the numeric SQL result.
            Focus on consistency between numbers, names, or facts.

            Numeric result:
            {numeric_answer}

            Semantic result:
            {semantic_answer}

            Reply only in JSON:
            {{
                "is_consistent": true/false,
                "issues": "describe discrepancies briefly"
            }}
            """
            validation = llm.invoke(validation_prompt).content.strip()
            print("\nüîç Validation result:", validation)

            # If the verdict cannot be read, assume consistency (best-effort check).
            consistent = True
            try:
                validation_json = json.loads(_strip_json_fences(validation))
            except json.JSONDecodeError:
                validation_json = None
            if isinstance(validation_json, dict):
                consistent = _as_bool(validation_json.get("is_consistent"), True)

            # Retry if inconsistent
            if not consistent:
                print("‚ö†Ô∏è Semantic answer inconsistent with SQL ‚Äî retrying with numeric context...")
                retry_prompt = f"""
                The user asked: {question}

                The numeric SQL result says:
                {numeric_answer}

                The previous semantic answer was:
                {semantic_answer}

                Please re-generate a consistent, unified answer that aligns with the numeric facts.
                """
                semantic_answer = llm.invoke(retry_prompt).content.strip()
                print("\n‚ôªÔ∏è RETRIED SEMANTIC ANSWER:")
                print(semantic_answer)

        # Step 5: merge both partial answers into the final response.
        print("\nüß† Combining numeric & semantic results...")
        combine_prompt = f"""
        The user originally asked: {question}

        Numeric insight (SQL result):
        {numeric_answer or "None"}

        Semantic insight (retrieved/explained result):
        {semantic_answer or "None"}

        Combine both into one clear, natural, factual answer.
        If dependent, describe both name and numeric value together (e.g., "Kara Mata made the highest purchase of ‚Çπ4,48,170").
        """
        combined_response = llm.invoke(combine_prompt)
        final_answer = getattr(combined_response, "content", str(combined_response)).strip()

        state["answer"] = final_answer

    except Exception as e:
        # Node boundary: report the failure through the state instead of raising.
        print(f"‚ùå Error in hybrid_node: {str(e)}")
        state["answer"] = f"Error in hybrid_node: {str(e)}"

    return state


# Assemble the workflow: the entry node "intent" runs first, the value found in
# state["intent"] afterwards selects exactly one handler node, and every
# handler leads straight to END.
graph = StateGraph(GraphState)

# (node name, callable) pairs, registered in this order.
_node_handlers = (
    ("intent", intent_node),
    ("greet", greet_node),
    ("ignore", ignore_node),
    ("duckdb", duckdb_node),
    ("retriever", retriever_node),
    ("hybrid", hybrid_node),
)
for _node_name, _node_fn in _node_handlers:
    graph.add_node(_node_name, _node_fn)

graph.set_entry_point("intent")

# Intent label -> name of the node that handles it.
_intent_routes = {
    "greet": "greet",
    "ignore": "ignore",
    "numeric": "duckdb",
    "semantic": "retriever",
    "hybrid": "hybrid",
}
graph.add_conditional_edges(
    "intent",
    lambda state: state["intent"],
    _intent_routes,
)

# Every routed handler is terminal.
for _handler_name in _intent_routes.values():
    graph.add_edge(_handler_name, END)

app = graph.compile()

if __name__ == "__main__":
    # Interactive console loop: read a question, run it through the compiled
    # graph, and print the answer until the user types "exit" or "quit".
    print("\nüöÄ Smart Query Assistant ready! Type 'exit' to quit.\n")

    while True:
        try:
            user_input = input("You: ").strip()
            print(f"You: {user_input}")
            if user_input.lower() in ["exit", "quit"]:
                print("Assistant: Goodbye üëã")
                break

            result = app.invoke({"question": user_input})
            print(f"Assistant: {result['answer']}\n")

        except (KeyboardInterrupt, EOFError):
            # EOFError is raised by input() when stdin is closed (Ctrl-D or
            # exhausted piped input). It must end the session here; otherwise
            # the generic handler below swallows it and the loop spins forever.
            print("\nAssistant: Goodbye üëã")
            break
        except Exception as e:
            # Keep the session alive on per-question failures.
            print(f"‚ùå Error: {str(e)}")
            print("Please try again with a different question.\n")

✅ Generated 50 labelled chunks for embeddings.
✅ Creating new vector store...


: 