In [16]:
from langgraph.graph import StateGraph, END
from typing import TypedDict, List
from langchain_community.vectorstores import Chroma
from langchain.retrievers import BM25Retriever,EnsembleRetriever
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from sqlalchemy import create_engine, text
from langchain_core.documents import Document 
import pandas as pd
import duckdb

# ==========================================================
# 1️⃣  CSV LOADING + DUCKDB INITIALIZATION
# ==========================================================
# Relative path of the source CSV; resolved against the current working directory.
CSV_PATH = "Client_Shipment_Orders.csv"

# Load the dataset into a DataFrame (module-level `df` is rebound again further
# down, when generate_labelled_chunks() re-reads the same file).
df = pd.read_csv(CSV_PATH)

# Create SQLAlchemy engine for DuckDB (in-memory).
# NOTE(review): the "duckdb://" URL presumably relies on a DuckDB SQLAlchemy
# dialect being installed — confirm against the environment.
engine = create_engine("duckdb:///:memory:")

# Get DuckDB native connection from SQLAlchemy. The raw connection wrapper is
# not kept or closed here; only its underlying driver connection is retained.
conn = engine.raw_connection().driver_connection

# Register the DataFrame under the name "orders_df" so SQL can select from it.
conn.register("orders_df", df)

# Copy the registered DataFrame into a real table named "orders".
# NOTE(review): duckdb_node later queries "orders" through engine.connect();
# that only works if the engine hands back a connection to this same in-memory
# database — TODO confirm the pooling behaviour.
conn.execute("CREATE TABLE orders AS SELECT * FROM orders_df")

print("‚úÖ Connected to DuckDB through SQLAlchemy and created table 'orders'")

# ==========================================================
# 2️⃣  ROW-WISE LABELLED CHUNK GENERATION
# ==========================================================
def generate_labelled_chunks(csv_path):
    """Read a CSV and render every row as a block of ``column: value`` lines.

    Each chunk starts with ``Row ID: <index>`` followed by one line per
    column, in column order; surrounding whitespace is stripped from the
    finished chunk.

    Args:
        csv_path: Path of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        A ``(frame, chunks)`` tuple: the loaded DataFrame and a list with one
        text chunk per row.
    """
    frame = pd.read_csv(csv_path)

    def render(row_id, record):
        # One header line for the row, then one "column: value" line per column.
        lines = [f"Row ID: {row_id}"]
        lines.extend(f"{column}: {record[column]}" for column in frame.columns)
        return "\n".join(lines).strip()

    return frame, [render(row_id, record) for row_id, record in frame.iterrows()]


# Build one labelled text chunk per CSV row. The helper re-reads the CSV, so
# the module-level `df` is rebound to that freshly loaded frame.
df, labelled_chunks = generate_labelled_chunks(CSV_PATH)

# Wrap each chunk in a Document so both retrievers can index it.
documents = [Document(page_content=chunk) for chunk in labelled_chunks]

print(f"‚úÖ Generated {len(labelled_chunks)} labelled chunks for embeddings.")


# ==========================================================
# 3️⃣  EMBEDDING + VECTORSTORE INITIALIZATION
# ==========================================================
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
# Raw string: "\R" is not a valid escape sequence, so the plain literal triggers
# an invalid-escape warning. The resulting path value is unchanged.
persist_directory = r"D:\RAG Task"
collection_name = "shipment_orders"

# Deterministic ids, one per row chunk. The collection is persisted on disk, so
# without stable ids every re-run of this script appends a fresh copy of every
# row and the vector retriever starts returning duplicates.
# NOTE: entries written by earlier runs under other ids stay in the collection
# until the persist directory is cleared.
document_ids = [f"row-{row_id}" for row_id in range(len(documents))]

vector_store = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
    ids=document_ids,
    persist_directory=persist_directory,
    collection_name=collection_name,
)


# Dense (embedding) retriever returning the top 3 matches.
vector_retriever = vector_store.as_retriever(search_kwargs={"k": 3})
# Keyword (BM25) retriever over the same documents, returning the top 4.
keyword_retriever = BM25Retriever.from_documents(documents)
keyword_retriever.k = 4

# Blend both result lists, weighting the dense retriever 0.6 and BM25 0.4.
hybrid_retriever = EnsembleRetriever(
    retrievers=[vector_retriever, keyword_retriever],
    weights=[0.6, 0.4],
)


# ==========================================================
# 4️⃣  SQL BUILDER — RULE-BASED (SAFE)
# ==========================================================
def build_sql_query(question: str) -> str:
    """Map a natural-language question onto one of a fixed set of SQL queries.

    The question is lower-cased and stripped, then tested against keyword
    rules in priority order; the first rule that matches wins. Every statement
    is a hard-coded literal, so no user text is ever placed into the SQL.

    Args:
        question: The user's question.

    Returns:
        The SQL text of the first matching rule, or ``None`` when no rule
        matches (despite the ``str`` annotation).
    """
    q = question.lower().strip()
    print(f"\nüß† [SQL Builder] Received question ‚Üí {q}")

    # Each rule: (alternative keyword groups, SQL). A rule matches when ALL
    # keywords of ANY one group occur in the question. Order is significant.
    rules = (
        (
            (("how many", "client"),),
            'SELECT COUNT(DISTINCT "Client Name") AS total_clients FROM orders;',
        ),
        (
            (("total sales",), ("total price",)),
            'SELECT SUM("Total Price (‚Çπ)") AS total_sales FROM orders;',
        ),
        (
            (("average",), ("avg price",)),
            'SELECT AVG("Unit Price (‚Çπ)") AS avg_unit_price FROM orders;',
        ),
        (
            (("total quantity",),),
            'SELECT SUM("Quantity") AS total_quantity FROM orders;',
        ),
        (
            (("dining table",),),
            'SELECT COUNT(*) AS dining_table_orders FROM orders WHERE LOWER("Product Name") LIKE \'%dining table%\';',
        ),
        (
            (("pending",),),
            'SELECT COUNT(*) AS pending_orders FROM orders WHERE LOWER(Status) = \'pending\';',
        ),
        (
            (("delivered",),),
            'SELECT COUNT(*) AS delivered_orders FROM orders WHERE LOWER(Status) = \'delivered\';',
        ),
    )

    sql = next(
        (
            statement
            for keyword_groups, statement in rules
            if any(all(word in q for word in group) for group in keyword_groups)
        ),
        None,
    )
    if sql is None:
        print("‚ö†Ô∏è [SQL Builder] No rule matched for this query.")

    print(f"üß± [SQL Builder] Generated SQL ‚Üí {sql}")
    return sql


# ==========================================================
# 5️⃣  DUCKDB NODE — EXECUTES QUERIES SAFELY
# ==========================================================
def duckdb_node(state: dict) -> dict:
    """Answer a numeric question by running a rule-generated SQL query.

    Reads ``state["question"]`` (empty string when absent), asks
    ``build_sql_query`` for a statement and executes it through ``engine``.
    Only the first result row is used, rendered with ``str``.

    Args:
        state: Graph state; mutated in place.

    Returns:
        The same ``state`` object with ``state["answer"]`` set to the first
        row, to ``"No matching records found."`` for an empty result, or to an
        ``"Error executing SQL: ..."`` message when anything fails (including
        the case where no SQL rule matched).
    """
    try:
        question = state.get("question", "")
        print(f"\nüßÆ [DuckDB Node] Executing for ‚Üí {question}")

        sql_query = build_sql_query(question)
        if not sql_query:
            # Routed through the handler below so the caller gets a uniform message.
            raise ValueError("SQL Builder returned None.")

        with engine.connect() as connection:
            rows = connection.execute(text(sql_query)).fetchall()

        answer = str(rows[0]) if rows else "No matching records found."

        print(f"‚úÖ [DuckDB Node] Query executed successfully ‚Üí {answer}")
        state["answer"] = answer
    except Exception as exc:
        failure = f"Error executing SQL: {exc}"
        print(f"‚ö†Ô∏è [DuckDB Node] {failure}")
        state["answer"] = failure

    return state


# ==========================================================
# 6️⃣  RETRIEVER NODE — HANDLES SEMANTIC QUERIES
# ==========================================================
# Chat model shared by intent_node, retriever_node and hybrid_node.
# NOTE(review): temperature=0 is presumably chosen to keep replies as
# repeatable as possible — confirm.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

def retriever_node(state: dict):
    """Answer a descriptive question from retrieved row chunks.

    Fetches chunks for ``state["question"]`` via ``hybrid_retriever``, joins
    their text into a context block and asks ``llm`` for a brief answer.

    Args:
        state: Graph state; must contain ``"question"``. Mutated in place.

    Returns:
        The same ``state`` object with ``state["answer"]`` set to the model's
        stripped reply, or to ``"Error using retriever: ..."`` if retrieval or
        the model call fails.

    Raises:
        KeyError: If ``state`` has no ``"question"`` key (read outside the
            ``try`` block).
    """
    question = state["question"]
    try:
        hits = hybrid_retriever.invoke(question)
        context = "\n".join(hit.page_content for hit in hits)
        prompt = f"Context:\n{context}\n\nQuestion: {question}\nAnswer briefly:"
        reply = llm.invoke(prompt)

        state["answer"] = reply.content.strip()
    except Exception as exc:
        state["answer"] = f"Error using retriever: {exc}"
    return state


# ==========================================================
# 7️⃣  INTENT / GREET / IGNORE NODES
# ==========================================================
def intent_node(state: dict):
    """Classify the question and store a routable intent label in the state.

    Asks ``llm`` to label ``state["question"]`` as numeric, semantic, hybrid,
    greet or ignore. The reply is normalised before it is stored: the first
    word of the reply that is one of the five labels is used, so decorated
    replies such as ``"Numeric."`` or ``"Intent: greet"`` still route
    correctly. When the reply contains no valid label the intent falls back
    to ``"ignore"`` instead of leaving an unroutable value in the state.

    Args:
        state: Graph state; must contain ``"question"``. Mutated in place.

    Returns:
        The same ``state`` object with ``state["intent"]`` set to one of
        ``numeric``, ``semantic``, ``hybrid``, ``greet`` or ``ignore``.

    Raises:
        KeyError: If ``state`` has no ``"question"`` key.
    """
    import re  # local import keeps this block self-contained

    valid_intents = ("numeric", "semantic", "hybrid", "greet", "ignore")

    question = state["question"]
    prompt = f"""
    Classify this question as one of:
    1. numeric ‚Üí structured, count, total, avg
    2. semantic ‚Üí descriptive (clients, items, color, etc.)
    3. hybrid ‚Üí both numeric + semantic
    4. greet ‚Üí greetings
    5. ignore ‚Üí unrelated

    Question: {question}
    Return only one word: numeric, semantic, hybrid, greet, or ignore.
    """
    raw_reply = llm.invoke(prompt).content.strip().lower()

    # The routing table only knows the five bare labels; pick the first one
    # that appears as a whole word in the reply.
    intent = next(
        (word for word in re.findall(r"[a-z]+", raw_reply) if word in valid_intents),
        None,
    )
    if intent is None:
        print(f"[Intent Node] Unrecognised classifier reply {raw_reply!r}; defaulting to 'ignore'.")
        intent = "ignore"

    print(f"üéØ Detected intent ‚Üí {intent}")
    state["intent"] = intent
    return state


def greet_node(state: dict):
    """Store the fixed greeting in ``state["answer"]`` and return the state."""
    state.update(answer="Hello üëã! How can I assist you with the order data today?")
    return state


def ignore_node(state: dict):
    """Store the fixed off-topic reply in ``state["answer"]`` and return the state."""
    state.update(
        answer="I'm designed to answer questions about the order dataset. Please ask something related."
    )
    return state


# ==========================================================
# 8️⃣  HYBRID NODE — COMBINES NUMERIC + SEMANTIC
# ==========================================================
def hybrid_node(state: dict):
    """Answer a question that needs both a numeric and a descriptive part.

    Steps:
      1. Ask ``llm`` to split the question into a ``numeric`` and a
         ``semantic`` sub-question, returned as JSON.
      2. Send the numeric part (if any) to ``duckdb_node`` and the semantic
         part (if any) to ``retriever_node``, each with its own temporary
         state.
      3. Ask ``llm`` to merge both partial answers into one reply.

    The JSON reply is parsed defensively: only the outermost ``{...}`` span is
    decoded, so code fences or commentary around it are ignored, and a missing
    or ``null`` part is treated as empty instead of raising.

    Args:
        state: Graph state; expected to contain ``"question"``. Mutated in
            place.

    Returns:
        The same ``state`` object with ``state["answer"]`` set to the combined
        reply, or to ``"Error in hybrid node: ..."`` if any step fails.
    """
    import json

    try:
        question = state["question"]
        print(f"\nüîÄ [Hybrid Node] Received question ‚Üí {question}")

        split_prompt = f"""
        Split the query into numeric and semantic parts in JSON.
        Example:
        {{
          "numeric": "subquestion for numeric logic",
          "semantic": "subquestion for semantic logic"
        }}
        Question: {question}
        """
        split_result = llm.invoke(split_prompt).content.strip()

        # Keep only the outermost JSON object; this drops ```json fences and
        # any prose the model wrapped around it.
        start, end = split_result.find("{"), split_result.rfind("}")
        if start != -1 and end > start:
            split_result = split_result[start:end + 1]

        parsed = json.loads(split_result)
        if not isinstance(parsed, dict):
            raise ValueError("split response is not a JSON object")

        # `or ""` covers both a missing key and an explicit JSON null.
        numeric_part = str(parsed.get("numeric") or "").strip()
        semantic_part = str(parsed.get("semantic") or "").strip()

        print(f"üìä Numeric part ‚Üí {numeric_part}")
        print(f"üí¨ Semantic part ‚Üí {semantic_part}")

        numeric_answer, semantic_answer = "", ""

        if numeric_part:
            print(f"üì§ Sending numeric part to DuckDB: {numeric_part}")
            temp_state = {"question": numeric_part, "intent": "", "context": [], "answer": ""}
            numeric_state = duckdb_node(temp_state)
            numeric_answer = numeric_state.get("answer", "")
            print(f"üì• Received numeric answer ‚Üí {numeric_answer}")

        if semantic_part:
            print(f"üì§ Sending semantic part to Retriever: {semantic_part}")
            temp_state = {"question": semantic_part, "intent": "", "context": [], "answer": ""}
            semantic_state = retriever_node(temp_state)
            semantic_answer = semantic_state.get("answer", "")
            print(f"üì• Received semantic answer ‚Üí {semantic_answer}")

        combine_prompt = f"""
        Combine the following insights into a single, clear answer:
        Numeric insight: {numeric_answer}
        Semantic insight: {semantic_answer}
        """
        final_answer = llm.invoke(combine_prompt).content.strip()
        print(f"üß© Final combined answer ‚Üí {final_answer}")
        state["answer"] = final_answer

    except Exception as e:
        # Node boundary: report the failure in the answer rather than crash the graph.
        state["answer"] = f"Error in hybrid node: {e}"
    return state


# ==========================================================
# 9️⃣  BUILD STATE GRAPH
# ==========================================================
class GraphState(TypedDict):
    """Schema of the state dictionary passed between the graph's nodes."""

    # The user's question; supplied by the caller of `app.invoke` and read by
    # the intent, duckdb, retriever and hybrid nodes.
    question: str
    # Label written by intent_node and used as the key for conditional routing.
    intent: str
    # Declared but never read or written by the nodes in this file, apart from
    # being initialised to [] in hybrid_node's temporary states.
    context: List[str]
    # Final reply text; written by the greet, ignore, duckdb, retriever and
    # hybrid nodes and printed by the main loop.
    answer: str


graph = StateGraph(GraphState)
graph.add_node("intent", intent_node)
graph.add_node("greet", greet_node)
graph.add_node("ignore", ignore_node)
graph.add_node("duckdb", duckdb_node)
graph.add_node("retriever", retriever_node)
graph.add_node("hybrid", hybrid_node)

# Every run starts with intent classification.
graph.set_entry_point("intent")

# Intent label -> name of the node that handles it.
_INTENT_ROUTES = {
    "greet": "greet",
    "ignore": "ignore",
    "numeric": "duckdb",
    "semantic": "retriever",
    "hybrid": "hybrid",
}


def _route_by_intent(state: dict) -> str:
    """Return the routing key for the classified intent.

    The intent comes from free-form LLM output, and a value missing from the
    route table would make the conditional-edge lookup fail. Anything that is
    not a known label is therefore routed to ``"ignore"``.
    """
    intent = str(state.get("intent", "")).strip().lower()
    return intent if intent in _INTENT_ROUTES else "ignore"


graph.add_conditional_edges("intent", _route_by_intent, _INTENT_ROUTES)

# Each answering node is terminal: once it has set the answer the run ends.
graph.add_edge("greet", END)
graph.add_edge("ignore", END)
graph.add_edge("duckdb", END)
graph.add_edge("retriever", END)
graph.add_edge("hybrid", END)

app = graph.compile()

# ==========================================================
# 🔟  MAIN LOOP
# ==========================================================
if __name__ == "__main__":
    # Interactive console loop: read a question, run it through the graph and
    # print the answer, until the user types "exit" or "quit".
    print("\nüöÄ Smart Query Assistant ready! Type 'exit' to quit.\n")

    while True:
        try:
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # stdin closed or Ctrl-C at the prompt: leave cleanly, no traceback.
            print("\nAssistant: Goodbye üëã")
            break

        if not user_input:
            # Nothing to classify; don't spend an LLM call on a blank line.
            continue

        # Echo the input so it also shows up in captured (notebook) output.
        print("You :", user_input)
        if user_input.lower() in ["exit", "quit"]:
            print("Assistant: Goodbye üëã")
            break

        try:
            result = app.invoke({"question": user_input})
            answer = result["answer"]
        except Exception as exc:
            # Top-level boundary: report the failure and keep the session alive.
            print(f"Assistant: Sorry, something went wrong: {exc}\n")
            continue

        print(f"Assistant: {answer}\n")


  persist_directory = "D:\RAG Task"


‚úÖ Connected to DuckDB through SQLAlchemy and created table 'orders'
‚úÖ Generated 70 labelled chunks for embeddings.

üöÄ Smart Query Assistant ready! Type 'exit' to quit.

You : how many products in total?
üéØ Detected intent ‚Üí numeric

üßÆ [DuckDB Node] Executing for ‚Üí how many products in total?

üß† [SQL Builder] Received question ‚Üí how many products in total?
‚ö†Ô∏è [SQL Builder] No rule matched for this query.
üß± [SQL Builder] Generated SQL ‚Üí None
‚ö†Ô∏è [DuckDB Node] Error executing SQL: SQL Builder returned None.
Assistant: Error executing SQL: SQL Builder returned None.

You : who exit
üéØ Detected intent ‚Üí ignore
Assistant: I'm designed to answer questions about the order dataset. Please ask something related.

You : exit
Assistant: Goodbye üëã
