# In [None]:
import re
from typing import List, Optional, TypedDict

import duckdb
import pandas as pd
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langgraph.graph import END, StateGraph

# CSV export that feeds both the DuckDB table and the embedding chunks.
csv_path = "Client_Shipment_Orders.csv"

# Chat model shared by the graph nodes defined further down.
llm = ChatOpenAI(temperature=0, model="gpt-4o-mini")

# Connection opened with DuckDB's defaults (no database file is given).
con = duckdb.connect()

# DuckDB parses the CSV itself and materialises it as the `orders` table,
# replacing any table of that name on this connection.
_load_orders_sql = f"""
    CREATE OR REPLACE TABLE orders AS
    SELECT * FROM read_csv_auto('{csv_path}', header=True)
"""
con.execute(_load_orders_sql)

print(" DuckDB table 'orders' loaded directly from CSV!")


# Instruction text that duckdb_node places in front of every user question
# when asking the model for SQL. It describes the `orders` columns, shows two
# sample rows, limits the model to read-only SELECT statements and ends with
# example queries.
# NOTE(review): the two price column names contain a non-ASCII currency
# symbol; verify that it is encoded exactly as in the CSV header, otherwise
# generated SQL will reference columns DuckDB does not know.
sql_system_prompt = """
You are a SQL expert helping to query a DuckDB table named `orders`.


TABLE INFORMATION

Table name: orders  
Columns and their meanings:
- Order ID: Unique identifier for each order (text)
- Client Name: Name of the customer who placed the order (text)
- Email: Email address of the client (text)
- Contact Number: Client's contact phone number (text)
- Origin: Source location of the shipment (text)
- Destination: Delivery location of the shipment (text)
- Product Name: Name of the purchased product (text)
- Category: Product category (e.g., Furniture, Decor, Appliances)
- Material: Material type of the product (e.g., Wood, Glass, Metal)
- Color: Color of the product (text)
- Quantity: Number of units ordered (integer)
- Unit Price (‚Çπ): Price per unit in INR (numeric)
- Total Price (‚Çπ): Total order price in INR (numeric)
- Order Date: Date when the order was placed (date)
- Delivery Date: Date when the order was delivered (date)
- Status: Order status (e.g., Delivered, Pending, Cancelled)


SAMPLE DATA

ORD0001 | Kara Mata | chelsea75@yahoo.com | 038.830.3017x8206 | Port Mariamouth | Cohenmouth | Wall Art | Decor | Glass | Grey | 15 | 29878 | 448170 | 2025-05-13 | 2025-06-02 | Cancelled  
ORD0002 | Jesse Williams | ccasey@barrett.info | (426)505-2355 | Tamaraview | Lake Rickyport | Bed | Furniture | Glass | Brown | 30 | 1507 | 45210 | 2025-10-04 | 2025-11-03 | Cancelled  


INSTRUCTIONS

1. Generate SQL queries **only** for structured or numeric filters.  
   Examples:
   - Total sales, sum, count, average, quantity, or price-based questions  
   - Filtering by specific columns such as Status, Category, Material, or Color  
   - Date-based filters (e.g., orders after 2025-05-01)

2. **Do NOT** generate queries that rely on descriptive, subjective, or semantic attributes  
   such as client feedback, reasons for cancellation, customer sentiment, or preferences.  
   These are handled separately by another semantic retriever system.

3. Use the correct table name `orders` and column names exactly as given.  
   Preserve proper case and special characters (e.g., `"Total Price (‚Çπ)"`).

4. Never hallucinate columns, tables, or calculations that don‚Äôt exist in the schema.

5. Return **only** the SQL query ‚Äî no markdown, explanations, or commentary.

6.
Do not modify, insert, delete, or drop any data or tables.

Do not perform schema changes such as ALTER, TRUNCATE, or CREATE.

Do not use UPDATE, INSERT, DELETE, DROP, TRUNCATE, or ALTER statements.

Only use safe read-only operations such as:

SELECT

WHERE, GROUP BY, ORDER BY, LIMIT

Aggregation functions (COUNT, SUM, AVG, MIN, MAX)

Never execute or suggest any operation that could change the database.

----------------------------------
Example valid queries:
----------------------------------
- SELECT COUNT(DISTINCT "Client Name") AS total_clients FROM orders;
- SELECT SUM("Total Price (‚Çπ)") AS total_sales FROM orders WHERE LOWER(Status) = 'delivered';
- SELECT AVG("Unit Price (‚Çπ)") AS average_unit_price FROM orders WHERE Category = 'Furniture';
- SELECT * FROM orders WHERE LOWER("Material") = 'glass' AND LOWER(Color) = 'grey';
- SELECT COUNT(*) FROM orders WHERE "Order Date" >= '2025-05-01';
"""


#  ROW-WISE LABELLED CHUNK GENERATION

def generate_labelled_chunks(csv_path):
    """Load the CSV and render every row as a labelled text block.

    Each chunk starts with a ``Row ID: <index>`` line, followed by one
    ``<column>: <value>`` line per column; surrounding whitespace is stripped.

    Args:
        csv_path: Location of the CSV file, passed to ``pandas.read_csv``.

    Returns:
        A ``(df, chunks)`` tuple: the DataFrame that was read and the list of
        chunk strings, one per row, in row order.
    """
    frame = pd.read_csv(csv_path)
    column_names = list(frame.columns)

    def render(row_id, record):
        # One header line, then one "name: value" line for every column.
        lines = [f"Row ID: {row_id}"]
        lines.extend(f"{name}: {record[name]}" for name in column_names)
        return "\n".join(lines).strip()

    rendered = [render(row_id, record) for row_id, record in frame.iterrows()]
    return frame, rendered


# Turn every CSV row into a labelled text chunk and wrap each chunk in a
# Document so both retrievers index the same texts.
df, labelled_chunks = generate_labelled_chunks(csv_path)

documents = [Document(page_content=chunk) for chunk in labelled_chunks]

print(f"‚úÖ Generated {len(labelled_chunks)} labelled chunks for embeddings.")


embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
# Raw string: "\R" is not a recognised escape sequence, so the non-raw form
# triggers an invalid-escape warning on current Python versions. The resulting
# path value is unchanged.
persist_directory = r"D:\RAG Task"
collection_name = "shipment_orders"

# NOTE(review): from_documents runs on every start against a persistent
# directory; presumably this adds the same rows to the collection again on
# each run - confirm, and consider reusing an existing collection instead.
vector_store = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
    persist_directory=persist_directory,
    collection_name=collection_name,
)


# Dense retriever (k=3) and BM25 keyword retriever (k=4) over the same
# documents, merged with weights 0.6 / 0.4 in the order listed.
vector_retriever = vector_store.as_retriever(search_kwargs={"k": 3})
keyword_retriever = BM25Retriever.from_documents(documents)
keyword_retriever.k = 4

hybrid_retriever = EnsembleRetriever(
    retrievers=[vector_retriever, keyword_retriever],
    weights=[0.6, 0.4],
)

class GraphState(TypedDict):
    """State dictionary handed from node to node in the graph.

    ``sql_query`` and ``validation_error`` are declared here because
    ``duckdb_node`` and ``sql_validator_node`` write them; without the
    declarations those writes fall outside the schema that ``duckdb_node`` is
    annotated with.
    """

    question: str  # user question being processed
    intent: str  # label written by intent_node and used for routing
    context: List[str]  # hybrid_node initialises this to [] in its sub-states
    answer: str  # final text written by whichever handler node ran
    sql_query: str  # cleaned SQL text stored by duckdb_node
    validation_error: Optional[str]  # set by sql_validator_node; None when valid


# Matches the first routable label in the model's reply. These are the keys of
# the conditional-edge mapping attached to the "intent" node.
_INTENT_LABEL_RE = re.compile(r"\b(numeric|semantic|hybrid|greet|ignore)\b")


def _normalise_intent(raw_reply):
    """Reduce the classifier's raw reply to one routable intent label."""
    cleaned = str(raw_reply).strip().lower()
    found = _INTENT_LABEL_RE.search(cleaned)
    # An unrecognised reply is routed to "ignore" rather than failing at the
    # conditional edge, which only knows the five labels above.
    return found.group(1) if found else "ignore"


def intent_node(state: dict):
    """Use the LLM to classify the question's intent for routing.

    Reads ``state["question"]``, asks the model for one of ``numeric``,
    ``semantic``, ``hybrid``, ``greet`` or ``ignore`` and stores the result in
    ``state["intent"]``. Replies decorated with quotes, punctuation or extra
    words are reduced to the first known label they contain; replies with no
    known label become ``ignore``.

    Returns:
        The same ``state`` dict, with ``intent`` set.
    """
    question = state["question"]

    intent_prompt = f"""
    You are an intent classifier for user questions over an **Orders dataset**.
    The table contains the following columns:
    Order ID, Client Name, Email, Contact Number, Origin, Destination,
    Product Name, Category, Material, Color, Quantity, Unit Price (‚Çπ),
    Total Price (‚Çπ), Order Date, Delivery Date, Status.

    Classify the intent of the question as one of the following:

    1. "numeric" ‚Üí if the query involves structured, measurable, or count-based data.
       Examples:
       - "How many orders are pending?"
       - "What is the total sales amount?"
       - "Show the average unit price."
       - "Count the number of clients."
       - "List orders where quantity > 10."

    2. "semantic" ‚Üí if the query involves descriptive or text-based attributes
       such true semantic questions, i.e., ones that are descriptive, interpretive, or text-based, not solvable with SQL filters or numbers.
T       These rely on understanding meaning, patterns, or unstructured context rather than column values.
       Examples:
      - Which customers look like regular buyers of furniture?
      - Which products are most suitable for modern homes?
      - What type of products are popular in Port Mariamouth?
      
    3. "hybrid" ‚Üí if the query mixes both numeric and descriptive components.
       Examples:
       - What is the total count of clients who bought curtains and Which destination cities frequently receive d√©cor orders?? 

    4. "greet" ‚Üí greetings or conversational openers.
       Examples:
       - "Hi", "Hello", "Good morning", "Hey there"

    5. "ignore" ‚Üí unrelated or irrelevant to order data.
       Examples:
       - "Tell me a joke", "What's the time?", "Who is the CEO?"

    Question: {question}

    Return only one word:
    numeric, semantic, hybrid, greet, or ignore.
    """

    raw_reply = llm.invoke(intent_prompt).content
    intent = _normalise_intent(raw_reply)
    print(f"üéØ Detected Intent: {intent}")
    state["intent"] = intent
    return state


def greet_node(state: dict):
    """Reply to a greeting with a fixed welcome message.

    Stores the message under ``state["answer"]`` and returns the same dict.
    """
    welcome = "Hello üëã! How can I assist you with the order data today?"
    state.update(answer=welcome)
    return state


def ignore_node(state: dict):
    """Reply to an off-topic question with a fixed redirect message.

    Stores the message under ``state["answer"]`` and returns the same dict.
    """
    state.update(
        answer="I'm designed to answer questions about the order dataset. Please ask something related."
    )
    return state



# Rebinds the module-level chat model with the same arguments as the earlier
# `llm = ChatOpenAI(...)` assignment near the top of the file.
# NOTE(review): looks redundant unless this section is meant to run on its own
# (e.g. as a separate notebook cell) - confirm before removing.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

def retriever_node(state: dict):
    """Answer a semantic question from retrieved row chunks.

    Fetches documents for ``state["question"]`` through ``hybrid_retriever``,
    joins their text into a context block and asks the LLM for a brief answer.
    The answer is stored in ``state["answer"]``; any exception raised by
    retrieval or generation is stored there as an error message instead.

    Returns:
        The same ``state`` dict, with ``answer`` set.
    """
    user_question = state["question"]
    try:
        hits = hybrid_retriever.invoke(user_question)
        context_block = "\n".join(hit.page_content for hit in hits)
        rag_prompt = (
            f"Context:\n{context_block}\n\n"
            f"Question: {user_question}\n"
            "Answer briefly:"
        )
        reply = llm.invoke(rag_prompt)
        state["answer"] = reply.content.strip()
    except Exception as exc:
        state["answer"] = f"Error using retriever: {exc}"
    return state


# Column names of the `orders` table that may appear as double-quoted
# identifiers in generated SQL.
VALID_COLUMNS = [
    "Order ID", "Client Name", "Email", "Contact Number",
    "Origin", "Destination", "Product Name", "Category",
    "Material", "Color", "Quantity", "Unit Price (‚Çπ)",
    "Total Price (‚Çπ)", "Order Date", "Delivery Date", "Status"
]

# Double-quoted identifiers that are legitimate without being columns.
_VALID_TABLES = frozenset({"orders"})

# Statement keywords that must not occur outside string literals.
_FORBIDDEN_KEYWORDS = ["insert", "update", "delete", "drop", "alter", "truncate", "create"]
# Whole-word match, so names such as "updated_count" are not flagged.
_FORBIDDEN_KEYWORD_RE = re.compile(r"\b(?:" + "|".join(_FORBIDDEN_KEYWORDS) + r")\b")
# Single-quoted SQL string literal, with '' as the escaped quote.
_SQL_STRING_LITERAL_RE = re.compile(r"'(?:[^']|'')*'")
# Double-quoted alias introduced by AS, e.g. COUNT(*) AS "Order Count".
_QUOTED_ALIAS_RE = re.compile(r'\bAS\s+"(.*?)"', re.IGNORECASE)


def sql_validator_node(state: dict):
    """Validate generated SQL to ensure it is safe and valid for DuckDB.

    Reads ``state["sql_query"]`` (missing or ``None`` is treated as empty) and
    sets ``state["validation_error"]`` to an error message, or to ``None`` when
    every check passes:

    1. the text must be one statement that starts with ``SELECT``; a trailing
       semicolon is tolerated, further statements are not;
    2. none of the forbidden keywords may appear as a whole word outside a
       string literal;
    3. every double-quoted identifier must be a known column, the ``orders``
       table, or an alias the query itself defines with ``AS "..."``.

    Returns:
        The same ``state`` dict, with ``validation_error`` set.
    """
    sql_query = (state.get("sql_query") or "").strip()
    print(f"üß© Validating SQL query: {sql_query}")

    # Blank out '...' literals so their contents are never mistaken for
    # keywords, statement separators or quoted identifiers.
    without_literals = _SQL_STRING_LITERAL_RE.sub("''", sql_query)
    lowered = without_literals.lower()

    # A semicolon anywhere but the very end means more than one statement.
    single_statement = ";" not in lowered.rstrip("; \t\r\n")
    if not lowered.startswith("select") or not single_statement:
        state["validation_error"] = "‚ùå Only SELECT queries are allowed."
        return state

    if _FORBIDDEN_KEYWORD_RE.search(lowered):
        state["validation_error"] = f"‚ùå Unsafe SQL operation detected: {', '.join(_FORBIDDEN_KEYWORDS)} are not allowed."
        return state

    aliases = set(_QUOTED_ALIAS_RE.findall(without_literals))
    for identifier in re.findall(r'"(.*?)"', without_literals):
        if identifier in VALID_COLUMNS or identifier in _VALID_TABLES or identifier in aliases:
            continue
        state["validation_error"] = f"‚ùå Invalid column name used: '{identifier}'."
        return state

    state["validation_error"] = None
    print("‚úÖ SQL validation passed.")
    return state


def duckdb_node(state: GraphState):
    """Answer a numeric/structured question through generated SQL.

    Steps: ask the LLM for SQL using ``sql_system_prompt``, strip markdown
    fences and labels from the reply, store it in ``state["sql_query"]``, run
    ``sql_validator_node`` on it, execute it on ``con`` and have the LLM phrase
    the result table in natural language.

    ``state["answer"]`` ends up holding the validation error, the text
    "No matching records found.", the LLM summary, or an error message if any
    step raised.

    Returns:
        The same ``state`` dict, with ``answer`` set.
    """
    user_question = state["question"]
    try:
        raw_sql = llm.invoke(
            f"{sql_system_prompt}\nUser question: {user_question}\nSQL:"
        ).content.strip()

        # Remove code fences, stray backticks and an echoed "SQL:" label.
        # The order matters: the longer fence marker has to go first.
        cleaned_sql = raw_sql
        for noise in ("```sql", "```", "`", "SQL:"):
            cleaned_sql = cleaned_sql.replace(noise, "")
        cleaned_sql = cleaned_sql.strip()

        print(f"üß† Generated SQL query: {cleaned_sql}")
        state["sql_query"] = cleaned_sql

        # Validate before anything reaches the database.
        checked = sql_validator_node(state)
        problem = checked.get("validation_error")
        if problem:
            state["answer"] = problem
            return state

        rows = con.execute(cleaned_sql).fetchdf()
        if rows.empty:
            state["answer"] = "No matching records found."
            return state

        # Plain-text rendering of the result table for the summary prompt.
        table_text = rows.to_string(index=False)

        summary_prompt = f"""
        The user asked: {user_question}
        The SQL result is:
        {table_text}

        Write a natural, clear explanation of these results.
        Avoid skipping any rows or adding assumptions.
        """
        state["answer"] = llm.invoke(summary_prompt).content.strip()

    except Exception as exc:
        state["answer"] = f"Error executing SQL: {str(exc)}"

    return state


# def duckdb_node(state: GraphState):
#     """Numeric / structured question handler with natural output"""
#     query = state["question"]
#     try:
#         sql_prompt = f"{sql_system_prompt}\nUser question: {query}\nSQL:"
#         sql_query = llm.invoke(sql_prompt).content.strip()

#         # Clean SQL
#         sql_query = (
#             sql_query.replace("```sql", "")
#             .replace("```", "")
#             .replace("`", "")
#             .replace("SQL:", "")
#             .strip()
#         )

#         print(f" SQL query: {sql_query}")

#         # Execute SQL on DuckDB
#         result_df = con.execute(sql_query).fetchdf()

#         if result_df.empty:
#             state["answer"] = "No matching records found."
#             return state

#         # Convert all rows to text
#         result_text = result_df.to_string(index=False)

#         # LLM to summarize results naturally
#         summary_prompt = f"""
#         The user asked: {query}
#         The SQL result is:
#         {result_text}

#         Write a clear and complete natural language response that lists all relevant names or details.
#         Do not skip or summarize results.
#         """
#         answer = llm.invoke(summary_prompt).content.strip()

#         state["answer"] = answer

#     except Exception as e:
#         state["answer"] = f"Error executing SQL: {str(e)}"

#     return state
def _parse_hybrid_split(raw_reply):
    """Extract the (numeric, semantic) sub-questions from the split reply.

    Returns a tuple of two strings, each "" when the field is missing, null or
    not a string. Returns None when the reply does not contain a JSON object.
    """
    import json  # kept local, as in the original implementation

    cleaned = raw_reply.replace("```json", "").replace("```", "").strip()

    # Tolerate prose around the object by cutting from the first "{" to the
    # last "}".
    first, last = cleaned.find("{"), cleaned.rfind("}")
    if first != -1 and last > first:
        cleaned = cleaned[first:last + 1]

    try:
        parsed = json.loads(cleaned)
    except json.JSONDecodeError:
        return None
    if not isinstance(parsed, dict):
        return None

    def as_text(value):
        return value.strip() if isinstance(value, str) else ""

    return as_text(parsed.get("numeric")), as_text(parsed.get("semantic"))


def hybrid_node(state: dict):
    """Handle hybrid queries by splitting them into numeric and semantic parts.

    The LLM splits ``state["question"]`` into a numeric sub-question (sent to
    ``duckdb_node``) and a semantic sub-question (sent to ``retriever_node``).
    Both partial answers are then merged by the LLM into ``state["answer"]``.

    If the split yields no sub-question at all (invalid JSON or empty fields),
    the full question is sent to the retriever so the final answer is still
    based on retrieved data.

    Any exception is caught and reported through ``state["answer"]``.

    Returns:
        The same ``state`` dict, with ``answer`` set.
    """
    try:
        question = state["question"]
        print(f"\nüîÄ [Hybrid Node] Received question ‚Üí {question}")

        # Step 1: split the query into numeric and semantic sub-questions.
        split_prompt = f"""
        Split the user query into numeric and semantic parts.
        - Numeric parts: answerable via SQL (count, average, filter, etc.)
        - Semantic parts: descriptive, interpretive, or text-based (not solvable by SQL)
        - Return clean JSON:
        {{
            "numeric": "subquestion for numeric logic",
            "semantic": "subquestion for semantic logic"
        }}

        Question: {question}
        """
        split_result = llm.invoke(split_prompt).content.strip()
        print("üß© Raw Split Result:", split_result)

        parts = _parse_hybrid_split(split_result)
        if parts is None:
            numeric_part = ""
            semantic_part = ""
            print("‚ö†Ô∏è LLM didn't return valid JSON ‚Äî skipping split.")
        else:
            numeric_part, semantic_part = parts
            print(f"‚úÖ Parsed numeric part: {numeric_part}")
            print(f"‚úÖ Parsed semantic part: {semantic_part}")

        if not numeric_part and not semantic_part:
            # Without this, the combine step below would be asked to answer
            # from "None"/"None", i.e. without any data.
            semantic_part = question
            print("No sub-questions extracted; sending the full question to the retriever.")

        # Step 2: numeric logic via DuckDB.
        numeric_answer = ""
        if numeric_part:
            print(f"\nüì§ Sending numeric part to DuckDB: {numeric_part}")
            temp_state = {"question": numeric_part, "intent": "", "context": [], "answer": ""}
            numeric_state = duckdb_node(temp_state)
            numeric_answer = numeric_state.get("answer", "")
            print("\nüßÆ NUMERIC RESULT (from SQL):")
            print(numeric_answer or "No numeric result.")
        else:
            print("‚ö†Ô∏è No numeric part detected.")

        # Step 3: semantic logic via the retriever.
        semantic_answer = ""
        if semantic_part:
            print(f"\nüì§ Sending semantic part to Retriever: {semantic_part}")
            temp_state = {"question": semantic_part, "intent": "", "context": [], "answer": ""}
            semantic_state = retriever_node(temp_state)
            semantic_answer = semantic_state.get("answer", "")
            print("\nüí¨ SEMANTIC RESULT (from Retriever):")
            print(semantic_answer or "No semantic result.")
        else:
            print("‚ö†Ô∏è No semantic part detected.")

        # Step 4: combine both results into the final response.
        print("\nüß† Combining numeric & semantic results...")
        print("=" * 70)
        print("üßÆ NUMERIC RESULT SUMMARY:")
        print(numeric_answer or "No numeric result.")
        print("=" * 70)
        print("üí¨ SEMANTIC RESULT SUMMARY:")
        print(semantic_answer or "No semantic result.")
        print("=" * 70)

        combine_prompt = f"""
        The user originally asked: {question}

        Numeric insight (from SQL results):
        {numeric_answer or "None"}

        Semantic insight (from retrieved text data):
        {semantic_answer or "None"}

        Now combine these insights into one clear and factual final answer.
        Do not mention SQL execution or database details.
        Provide a concise, natural explanation.
        """

        combined_response = llm.invoke(combine_prompt)
        # Falls back to str() if the response object has no `content`.
        final_answer = getattr(combined_response, "content", str(combined_response)).strip()

        print("\nüí° FINAL COMBINED ANSWER:")
        print(final_answer if final_answer else "‚ö†Ô∏è No combined answer generated.")
        print("=" * 70)

        # Only the final answer is stored for the graph output.
        state["answer"] = final_answer

    except Exception as e:
        print(f"‚ùå Error in hybrid_node: {str(e)}")
        state["answer"] = f"Error in hybrid_node: {str(e)}"

    return state

# Assemble the workflow: classify the question, run exactly one handler, stop.
graph = StateGraph(GraphState)

# Register the nodes in the same order as before.
for _node_name, _handler in (
    ("intent", intent_node),
    ("greet", greet_node),
    ("ignore", ignore_node),
    ("duckdb", duckdb_node),
    ("retriever", retriever_node),
    ("hybrid", hybrid_node),
):
    graph.add_node(_node_name, _handler)

graph.set_entry_point("intent")

# Intent label written by intent_node -> name of the node that handles it.
_intent_routes = {
    "greet": "greet",
    "ignore": "ignore",
    "numeric": "duckdb",
    "semantic": "retriever",
    "hybrid": "hybrid",
}
graph.add_conditional_edges("intent", lambda state: state["intent"], _intent_routes)

# Every handler is terminal.
for _handler_name in ("greet", "ignore", "duckdb", "retriever", "hybrid"):
    graph.add_edge(_handler_name, END)

app = graph.compile()

if __name__ == "__main__":
    # Interactive console loop: read a question, run it through the compiled
    # graph and print the answer until the user types "exit" or "quit".
    print("\nüöÄ Smart Query Assistant ready! Type 'exit' to quit.\n")

    while True:
        try:
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl+C at the prompt: leave like an explicit
            # "exit" instead of ending with a traceback.
            print("\nAssistant: Goodbye üëã")
            break

        print("You :", user_input)
        if user_input.lower() in ("exit", "quit"):
            print("Assistant: Goodbye üëã")
            break

        if not user_input:
            # Nothing to classify; skip the LLM round-trip for blank input.
            continue

        result = app.invoke({"question": user_input})
        print(f"Assistant: {result['answer']}\n")
