In [None]:
from langgraph.graph import StateGraph, END
from typing import TypedDict,List
from langchain_community.document_loaders import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
import duckdb
import pandas as pd



# Path of the CSV file; used below to build the DuckDB table and again later
# in this file when loading documents for the vector store.
csv_path = "students.csv"



# Chat model shared by the graph nodes; created with temperature=0.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# DuckDB connection opened with default arguments
# (presumably an in-memory database - confirm against the DuckDB docs).
con = duckdb.connect()

# DuckDB reads the CSV natively through read_csv_auto.
# NOTE(review): csv_path is interpolated straight into the SQL text; that is
# only acceptable while it remains a trusted, hard-coded constant.

con.execute(f"""
    CREATE OR REPLACE TABLE students AS
    SELECT * FROM read_csv_auto('{csv_path}', header=True)
""")

print(" DuckDB table 'students' loaded directly from CSV!")



# System prompt prepended to every text-to-SQL request in duckdb_node.
# Kept pure ASCII on purpose: the previous version contained mis-encoded
# curly quotes and dashes, which were sent to the model as garbage characters.
sql_system_prompt = """
You are a SQL expert helping to query a DuckDB table named `students`.

Table name: students
Columns and their meanings:
- StudentID: Unique ID of the student (integer)
- Name: Student's full name (text)
- Department: Department of study (e.g., Computer Science, Electrical, Mechanical)
- Grade: Academic grade (A+, A, A-, B+, etc.)
- GPA: Grade Point Average (numeric, e.g., 3.8)
- Feedback: Text description of the student's performance and expertise.

Sample data:
1 | Alice | Computer Science | A | 3.9 | Excellent in IoT and AI projects
2 | Bob | Electrical | B+ | 3.4 | Good in circuit design and teamwork
3 | Carol | Mechanical | A- | 3.7 | Great at robotics and embedded systems

----------------------------------
INSTRUCTIONS:
----------------------------------
1. Generate SQL queries **only** for numeric or structured filters and
   aggregations (COUNT, AVG, MIN, MAX, SUM).
   Examples:
   - GPA > 3.8
   - Grade = 'A'
   - Department = 'Computer Science' when asked to list the student's department
   - COUNT(*) when asked how many students there are

2. **Do NOT** generate or include text-based or descriptive filters
   such as expertise, feedback content, interests, or skills (e.g., "IoT", "AI", "leadership").
   Those are handled separately by another retriever system.

3. Use the correct table name `students` and column names exactly as given.

4. Never hallucinate new columns or tables.

5. Return only the SQL query - no markdown, explanations, or additional commentary.



Example valid queries:
- SELECT Name, GPA FROM students WHERE GPA > 3.8;
- SELECT * FROM students WHERE Department = 'Computer Science' AND Grade = 'A';
- SELECT COUNT(*) AS total_students FROM students;
"""


# Load the same CSV as LangChain documents for semantic search
# (CSVLoader presumably yields one document per row - confirm).
loader = CSVLoader(file_path=csv_path, encoding="utf-8")
documents = loader.load()

# Split the documents with chunk_size=500 and chunk_overlap=50.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
texts = splitter.split_documents(documents)

# Embed the chunks, index them in FAISS and expose a retriever configured
# with k=3.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vector_store = FAISS.from_documents(texts, embeddings)
retriever = vector_store.as_retriever(search_kwargs={"k": 3})

class GraphState(TypedDict):
    """State dictionary handed from node to node in the graph."""

    # The user's question text.
    question: str
    # Intent label written by intent_node and read by the conditional edges.
    intent: str
    # Declared for the state schema; no node visible in this file writes it.
    context: List[str]
    # Final reply text; each handler node stores its response here.
    answer: str
   
   
def intent_node(state: GraphState):
    """Classify the user's question and store the label in ``state["intent"]``.

    The LLM is asked for one of five labels (numeric, semantic, hybrid, greet,
    ignore). Its reply is normalised before being stored, because the label is
    later used as a routing key: surrounding quotes, punctuation or extra
    words must not leak into the state.

    Args:
        state: Graph state; only ``state["question"]`` is read.

    Returns:
        The same ``state`` object with ``"intent"`` set to one of the five
        labels. Replies that contain no known label fall back to ``"ignore"``.
    """
    import re  # local import keeps this block self-contained

    valid_intents = ("numeric", "semantic", "hybrid", "greet", "ignore")

    query = state["question"]
    intent_prompt = f"""
    You are an intent classifier for user questions over a student dataset.
    The table contains columns: StudentID, Name, Department, Grade, GPA, Feedback.

    Classify the intent of the question as one of the following:

    1. "numeric" -> if it involves:
    - filters or comparisons on structured fields like Department, Grade, GPA, or StudentID
    - counts, totals, averages, minimums, maximums or rankings over those fields
    - examples: "List students in Computer Science", "Show students with Grade A",
      "Who has GPA > 3.5", "What is the total count of students?", "What is the average GPA?"

    2. "semantic" -> if it involves open-ended descriptions, expertise, or meanings inside text fields like Feedback
    - examples: "Who is good at AI?", "Which student has leadership skills?"

    3. "hybrid" -> if it mixes both structured filters and descriptive parts
    - example: "List students in Computer Science who are good at AI"

    4. "greet" -> greetings like "Hi", "Hello"

    5. "ignore" -> if it's unrelated to student data

    Question: {query}
    Return only one word: numeric, semantic, hybrid, greet, or ignore.
    """
    raw_intent = llm.invoke(intent_prompt).content.strip().lower()

    # Prefer an exact label once wrapping quotes/punctuation are removed ...
    intent = raw_intent.strip(" \t\r\n\"'`.:*")
    if intent not in valid_intents:
        # ... otherwise pick the first known label mentioned in the reply,
        # and treat anything unrecognisable as off-topic.
        found = re.search(r"\b(" + "|".join(valid_intents) + r")\b", raw_intent)
        intent = found.group(1) if found else "ignore"

    print(f" Detected Intent: {intent}")
    state["intent"] = intent
    return state


def greet_node(state: GraphState):
    """Answer a greeting with a fixed welcome message.

    Args:
        state: Graph state; it is updated in place.

    Returns:
        The same ``state`` object with ``"answer"`` set to the greeting.
    """
    # The waving-hand emoji is written as an escape so the message survives
    # files that get re-saved in the wrong encoding (the literal was
    # previously garbled).
    state["answer"] = "Hello! \U0001F44B How can I assist you with the student data today?"
    return state


def ignore_node(state: GraphState):
    """Reply to an off-topic question by steering the user back to the dataset.

    The state is updated in place and the same object is returned.
    """
    redirect_message = "I'm here to answer questions about the student dataset. Could you ask something related to that?"
    state.update(answer=redirect_message)
    return state


def _clean_sql(raw_sql: str) -> str:
    """Strip markdown fences, backticks and a 'SQL:' label from LLM output."""
    return (
        raw_sql.replace("```sql", "")
        .replace("```", "")
        .replace("`", "")
        .replace("SQL:", "")
        .strip()
    )


def _is_single_select(sql_query: str) -> bool:
    """Return True if the text is one statement starting with SELECT or WITH."""
    statement = sql_query.strip().rstrip(";").strip()
    # A remaining ';' means several statements (conservative: this also
    # rejects a ';' inside a string literal).
    if not statement or ";" in statement:
        return False
    return statement.lstrip("( \t\r\n").upper().startswith(("SELECT", "WITH"))


def duckdb_node(state: GraphState):
    """Answer a structured question by generating SQL and running it on DuckDB.

    Steps: ask the LLM for a SQL query, clean the reply, verify it is a single
    read-only statement, execute it, then ask the LLM to phrase the result
    table as natural language.

    Args:
        state: Graph state; only ``state["question"]`` is read.

    Returns:
        The same ``state`` object with ``"answer"`` set to the phrased result,
        to ``"No matching records found."`` for an empty result, or to an
        ``"Error executing SQL: ..."`` message on failure.
    """
    query = state["question"]
    try:
        sql_prompt = f"{sql_system_prompt}\nUser question: {query}\nSQL:"
        sql_query = _clean_sql(llm.invoke(sql_prompt).content)

        print(f" SQL query: {sql_query}")

        # The SQL text comes from an LLM fed with user input, so never run
        # anything except a single SELECT/WITH statement (no DROP, INSERT,
        # COPY, or stacked statements).
        # NOTE(review): a SELECT can still call file-reading table functions;
        # restrict the connection itself if the input is untrusted.
        if not _is_single_select(sql_query):
            state["answer"] = (
                "Error executing SQL: only a single read-only SELECT "
                "statement is allowed."
            )
            return state

        # Execute SQL on DuckDB
        result_df = con.execute(sql_query).fetchdf()

        if result_df.empty:
            state["answer"] = "No matching records found."
            return state

        # Convert all rows to text
        result_text = result_df.to_string(index=False)

        # LLM to summarize results naturally
        summary_prompt = f"""
        The user asked: {query}
        The SQL result is:
        {result_text}

        Write a clear and complete natural language response that lists all relevant names or details.
        Do not skip or summarize results.
        """
        state["answer"] = llm.invoke(summary_prompt).content.strip()

    except Exception as e:
        # Node boundary: report the failure as the answer instead of crashing
        # the whole graph run.
        state["answer"] = f"Error executing SQL: {str(e)}"

    return state


def retriever_node(state: GraphState):
    """Answer a descriptive question from the retrieved document chunks.

    Args:
        state: Graph state; only ``state["question"]`` is read.

    Returns:
        The same ``state`` object with ``"answer"`` set to the LLM's reply,
        to ``"No relevant information found."`` when nothing was retrieved,
        or to an ``"Error using retriever: ..."`` message on failure.
    """
    question = state["question"]
    try:
        retrieved_chunks = retriever.invoke(question)

        # Without any context the model could only guess, so say so instead.
        if not retrieved_chunks:
            state["answer"] = "No relevant information found."
            return state

        # Use at most the three best chunks as context.
        top_docs = retrieved_chunks[:3]
        context = "\n".join(doc.page_content for doc in top_docs)
        prompt = f"Context:\n{context}\n\nQuestion: {question}\nAnswer briefly:"
        # Stripped for consistency with the other nodes' answers.
        state["answer"] = llm.invoke(prompt).content.strip()
    except Exception as e:
        # Node boundary: report the failure as the answer instead of crashing
        # the whole graph run.
        state["answer"] = f"Error using retriever: {str(e)}"
    return state



def hybrid_node(state: GraphState):
    """Answer a hybrid question by combining a SQL answer and a semantic answer.

    The LLM first splits the question into a numeric and a semantic
    sub-question (returned as JSON). Each non-empty part is delegated to
    ``duckdb_node`` / ``retriever_node`` with its own temporary state, and the
    two answers are merged by a final LLM call.

    Args:
        state: Graph state; only ``state["question"]`` is read.

    Returns:
        The same ``state`` object with ``"answer"`` set to the merged answer,
        or to an ``"Error in hybrid node: ..."`` message on failure.
    """
    import json

    def _as_text(value) -> str:
        """Flatten a JSON value (string, list of strings, null) to one string."""
        if value is None:
            return ""
        if isinstance(value, (list, tuple)):
            # The split prompt allows arrays of sub-questions.
            return " ".join(str(item).strip() for item in value if item).strip()
        return str(value).strip()

    def _sub_state(sub_question: str) -> dict:
        """Build a complete, independent state for a delegated sub-question."""
        return {"question": sub_question, "intent": "", "context": [], "answer": ""}

    question = state["question"]

    try:
        # Step 1: Ask LLM to split the hybrid query into numeric and semantic subparts
        split_prompt = f"""
            Split the user query into numeric and semantic parts.
        - Numeric parts are those answerable via SQL (count, average, filter, etc.).
        - Semantic parts are descriptive (skills, feedback, comments, etc.).
        - If there are multiple sub-questions, list them in an array.

        Return clean JSON like:
        {{
            "numeric": "subquestion for numeric logic",
            "semantic": "subquestion for semantic logic"
        }}

        Question: {question}
        """

        split_result = llm.invoke(split_prompt).content.strip()
        print(" Split result:", split_result)

        # Clean any code block wrappers before parsing
        split_result = split_result.replace("```json", "").replace("```", "").strip()

        try:
            parsed = json.loads(split_result)
        except json.JSONDecodeError:
            parsed = None

        if isinstance(parsed, dict):
            numeric_part = _as_text(parsed.get("numeric"))
            semantic_part = _as_text(parsed.get("semantic"))
        else:
            numeric_part = ""
            semantic_part = ""
            print(" LLM didn't return a valid JSON object - skipping split.")

        # With no usable split, fall back to a semantic lookup on the whole
        # question rather than merging two empty answers.
        if not numeric_part and not semantic_part:
            semantic_part = question

        # Step 2: Route the numeric part to duckdb_node
        numeric_answer = ""
        if numeric_part:
            print(f" Sending numeric part to duckdb_node: {numeric_part}")
            numeric_state = duckdb_node(_sub_state(numeric_part))
            numeric_answer = numeric_state.get("answer", "")

        # Step 3: Route the semantic part to retriever_node
        semantic_answer = ""
        if semantic_part:
            print(f" Sending semantic part to retriever_node: {semantic_part}")
            semantic_state = retriever_node(_sub_state(semantic_part))
            semantic_answer = semantic_state.get("answer", "")

        # Step 4: Merge both results using LLM
        combine_prompt = f"""
        The user originally asked: {question}

        Numeric insight:
        {numeric_answer}

        Semantic insight:
        {semantic_answer}

        Combine these into a single, clear and concise final answer.
        """
        final_answer = llm.invoke(combine_prompt).content.strip()
        print(" Final answer:", final_answer)

        state["answer"] = final_answer

    except Exception as e:
        # Node boundary: report the failure as the answer instead of crashing
        # the whole graph run.
        state["answer"] = f"Error in hybrid node: {str(e)}"

    return state


# Build the workflow: the intent classifier runs first, then exactly one
# handler node, then the run ends.
graph = StateGraph(GraphState)

graph.add_node("intent", intent_node)
graph.add_node("greet", greet_node)
graph.add_node("ignore", ignore_node)
graph.add_node("duckdb", duckdb_node)
graph.add_node("retriever", retriever_node)
graph.add_node("hybrid", hybrid_node)

graph.set_entry_point("intent")

# Intent label -> handler node.
_INTENT_ROUTES = {
    "greet": "greet",
    "ignore": "ignore",
    "numeric": "duckdb",
    "semantic": "retriever",
    "hybrid": "hybrid",
    "answer": "retriever",  # kept from the original mapping
}


def _route_intent(state: GraphState) -> str:
    """Return the routing key for the classified intent.

    The label comes from an LLM, so anything that is not a known key is sent
    to the "ignore" handler instead of failing inside the graph runtime.
    """
    intent = state.get("intent", "")
    return intent if intent in _INTENT_ROUTES else "ignore"


graph.add_conditional_edges("intent", _route_intent, _INTENT_ROUTES)

# Every handler is terminal.
graph.add_edge("greet", END)
graph.add_edge("ignore", END)
graph.add_edge("duckdb", END)
graph.add_edge("retriever", END)
graph.add_edge("hybrid", END)

app = graph.compile()

if __name__ == "__main__":
    # Interactive loop: read a question, run it through the graph, print the
    # answer. "exit" / "quit", end-of-input or Ctrl-C end the session.
    while True:
        try:
            user_input = input("\nYou: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C should end the chat, not raise a traceback.
            print("\nAssistant: Goodbye! ")
            break

        # Nothing to classify; do not spend LLM calls on blank input.
        if not user_input:
            continue

        print(f"You :{user_input}")

        if user_input.lower() in ("exit", "quit"):
            print("Assistant: Goodbye! ")
            break

        result = app.invoke({"question": user_input})
        print(f"Assistant: {result['answer']}")


 DuckDB table 'students' loaded directly from CSV!
You :what is the total count of students?
 Detected Intent: ignore
Assistant: I'm here to answer questions about the student dataset. Could you ask something related to that?
You :who scored more than 3.5 gpa?
 Detected Intent: numeric
 SQL query: SELECT * FROM students WHERE GPA > 3.5;
Assistant: The following students scored more than a 3.5 GPA:

1. **Ananya Sharma**  
   - Department: Computer Science  
   - Grade: A+  
   - GPA: 3.90  
   - Feedback: Ananya consistently excels in AI-related projects and leads her team effectively.

2. **Sneha Rao**  
   - Department: Electronics  
   - Grade: A  
   - GPA: 3.70  
   - Feedback: Sneha has a deep understanding of microcontrollers and circuit simulation.

3. **Diya Thomas**  
   - Department: Computer Science  
   - Grade: A  
   - GPA: 3.80  
   - Feedback: Diya is outstanding in Python programming and debugging complex code.

4. **Meera Joseph**  
   - Department: Information Techno