In [None]:
from langchain_community.document_loaders import CSVLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Chroma
from dotenv import load_dotenv
from typing import TypedDict, List
from langgraph.graph import StateGraph, START, END
from sentence_transformers import CrossEncoder
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain_community.retrievers import BM25Retriever
from langchain_core.messages import HumanMessage
from langchain_experimental.agents import create_pandas_dataframe_agent
import pandas as pd
import warnings
import os

# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings from the libraries imported above.
warnings.filterwarnings("ignore")
# Load variables from a local .env file into the process environment.
# Presumably this is where the OpenAI API key comes from — confirm.
load_dotenv()



In [None]:
# Relative path to the dataset; resolved against the kernel's working directory.
csv_path = "students.csv"
# Whole CSV as a DataFrame; handed to the pandas agent created later in the notebook.
df = pd.read_csv(csv_path)

In [None]:
class GraphState(TypedDict):
    """Dictionary shape shared by all nodes of the LangGraph workflow."""

    # Raw text typed by the user.
    question: str
    # Label written by the intent classifier node.
    intent: str
    # Text chunks selected by the retrieval node.
    context: List[str]
    # Final reply shown to the user.
    answer: str

In [None]:
# Load the same CSV through LangChain so it can be chunked and indexed.
# (CSVLoader is documented to emit one Document per CSV row.)
loader = CSVLoader(file_path=csv_path)
documents = loader.load()
# Quick sanity check: number of documents loaded.
print(len(documents))

In [None]:
# Split the loaded documents into chunks of at most 500 characters, with a
# 50-character overlap between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
texts = splitter.split_documents(documents)
# The original message contained a mis-encoded non-breaking space; use a plain space.
print(f"Total chunks created: {len(texts)}")




In [None]:
# Embedding model used to index and query the Chroma collection.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
# Deterministic chat model (temperature=0) shared by all nodes of the first graph.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
# Where Chroma persists the collection. The original machine-specific path stays
# the default; set CHROMA_PERSIST_DIR to run the notebook on another machine.
persist_directory = os.environ.get("CHROMA_PERSIST_DIR", r"D:\RAG Task")
collection_name = "article_new"

In [None]:
# Cross-encoder used by retriever_node to score (question, chunk) pairs.
# NOTE(review): presumably downloads the model weights on first use — confirm.
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")


In [None]:
# Build (or refresh) the persisted Chroma collection from the chunks.
# Deterministic ids make the cell idempotent: Chroma upserts by id, so re-running
# the cell overwrites the same entries instead of appending a duplicate copy of
# every chunk to the persisted collection.
# NOTE(review): entries written by earlier runs with auto-generated ids are not
# removed by this; clear the collection once if duplicates already exist.
vectorstore = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
    ids=[f"{collection_name}-{i}" for i in range(len(texts))],
    persist_directory=persist_directory,
    collection_name=collection_name,
)

In [None]:
# Dense retriever over the Chroma store; asks for the 4 nearest chunks per query.
vector_retriever = vectorstore.as_retriever(search_kwargs={"k":4})
# Keyword (BM25) retriever built from the same chunks.
keyword_retriever=BM25Retriever.from_documents(texts)

# Combine both retrievers, weighting the vector results 0.6 and the keyword results 0.4.
# NOTE(review): EnsembleRetriever presumably fuses the two rankings with weighted
# reciprocal rank fusion — confirm against the LangChain documentation.
hybrid_retriever=EnsembleRetriever(
    retrievers=[vector_retriever,keyword_retriever],
    weights=[0.6,0.4]
)

In [None]:
# NOTE(review): this import duplicates the one in the notebook's first cell.
from langchain_experimental.agents import create_pandas_dataframe_agent

# Agent that answers questions about `df` by letting the LLM write and run pandas code.
pandas_agent = create_pandas_dataframe_agent(
    llm,
    df,
    verbose=True,
    allow_dangerous_code=True  # explicit opt-in: the agent executes LLM-generated Python
)


In [None]:
def intent_node(state : GraphState):
    """Ask the LLM to label the question and return that label as the intent.

    Reads ``state["question"]`` and returns a partial state update
    ``{"intent": <label>}`` where the label is the model's reply, stripped
    and lower-cased.
    """
    user_question = state["question"]

    # Spelled out line by line so the exact whitespace sent to the model is visible.
    prompt_lines = [
        "",
        "    ",
        "    Classify the following question into one of these categories:",
        "    1. greeting",
        "    2. relevant_to_document",
        "    3. irrelevant",
        "    ",
        f'    Question : "{user_question}"',
        "    ",
        "    Reply only with the category name.",
        "    ",
    ]

    reply = llm.invoke("\n".join(prompt_lines))
    label = reply.content.strip().lower()
    return {"intent": label}

In [None]:
def greet_node(state:GraphState):
    """Generate a short, friendly reply to a greeting.

    Reads ``state["question"]`` and returns a partial state update
    ``{"answer": <model reply>}``.
    """
    question = state["question"]

    # The example reply originally contained a mis-encoded apostrophe
    # (mojibake), which was sent to the model verbatim; it is now plain ASCII.
    prompt = f"""
    You are a friendly AI assistant.
    The user greeted you with: "{question}".
    Respond politely and naturally, as if you are having a short chat before helping with their document.
    Example responses: "Hey there! How's it going?" or "Hello! How can I assist you today?"

    """

    response = llm.invoke(prompt)
    return {"answer": response.content}

In [None]:
def ignore_node(state:GraphState):
    """Have the LLM politely decline a question unrelated to the document.

    Reads ``state["question"]`` and returns a partial state update
    ``{"answer": <model reply>}``.
    """
    off_topic_question = state["question"]

    # Same prompt text as a triple-quoted block, assembled one line at a time.
    prompt_lines = [
        "",
        "    You are an assistant specialized in answering questions related only to a provided document.",
        f'    The user asked: "{off_topic_question}".',
        "    Politely tell the user that you can only answer questions related to the document content.",
        "    Example responses:",
        "    - \"I'm sorry, I can only help with questions about the uploaded document.\"",
        '    - "That seems unrelated to the document. Could you please ask something based on it?"',
        "    ",
    ]

    reply = llm.invoke("\n".join(prompt_lines))
    return {"answer": reply.content}

In [None]:
# def retriever_node(state: GraphState):
#     question = state["question"]

    
#     hybrid_docs = hybrid_retriever.invoke(question)

    
#     pairs = [(question, doc.page_content) for doc in hybrid_docs]

    
#     scores = reranker.predict(pairs)

    
#     ranked_docs = [doc for _, doc in sorted(zip(scores, hybrid_docs), key=lambda x: x[0], reverse=True)]

    
#     top_docs = ranked_docs[:3]

  
#     context = [doc.page_content for doc in top_docs]

#     return {"context": context}

def retriever_node(state: GraphState):
    """Retrieve candidate chunks, re-rank them, and keep the best three.

    Reads ``state["question"]``, queries ``hybrid_retriever``, scores each
    (question, chunk) pair with ``reranker`` and stores the text of the three
    highest-scoring chunks in ``state["context"]``.

    Returns the same ``state`` dict with ``context`` filled in. When nothing is
    retrieved, ``context`` is an empty list and the reranker is not called.
    """
    question = state["question"]

    # Retrieve hybrid docs
    hybrid_docs = hybrid_retriever.invoke(question)

    # Nothing to rank: skip the reranker, which is not guaranteed to accept an
    # empty batch, and hand an empty context to the next node.
    if not hybrid_docs:
        state["context"] = []
        return state

    # Pair question with retrieved docs for reranking
    pairs = [(question, doc.page_content) for doc in hybrid_docs]

    # Rerank: order document indices by descending score. Sorting indices rather
    # than (score, doc) tuples keeps the documents themselves out of comparisons.
    scores = reranker.predict(pairs)
    order = sorted(range(len(hybrid_docs)), key=lambda i: scores[i], reverse=True)

    # Select top 3 docs
    context = [hybrid_docs[i].page_content for i in order[:3]]

    # Store in state for next node
    state["context"] = context
    return state




In [None]:
# def answer_node(state: GraphState):
#     question = state.get("question", "")
#     context = state.get("context", "")

#     prompt = f"""
#     You are a helpful AI assistant. Use the context below to answer the user's question.

#     Context:
#     {context}

#     Question:
#     {question}

#     If the answer is not found in the context, say "I'm not sure based on the available information."
#     """

#     response = llm.invoke(prompt)
#     return {"answer": response.content}

# Words/phrases that mark a question as numeric or structured.
_STRUCTURED_KEYWORDS = (
    "how many", "average", "count", "sum", "highest", "lowest", "top", "less",
    "greater", "equal", "mean", "min", "max", "gpa", "department",
)


def _is_structured_query(query: str) -> bool:
    """Return True if the query contains a structured keyword as a whole word or phrase."""
    import re  # local import so this block does not depend on a file-level import

    alternatives = "|".join(re.escape(keyword) for keyword in _STRUCTURED_KEYWORDS)
    # Word boundaries stop "sum" matching "summary" or "top" matching "stop";
    # the optional trailing "s" still accepts simple plurals ("departments").
    return re.search(rf"\b(?:{alternatives})s?\b", query.lower()) is not None


def answer_node(state: GraphState):
    """Answer the question with either the pandas agent or the retrieved context.

    Questions containing a structured keyword are sent to ``pandas_agent``;
    every other question is answered by ``llm`` from ``state["context"]``.
    Failures on either path are reported in the answer text rather than raised.

    Returns the same ``state`` dict with ``answer`` filled in.
    """
    query = state["question"]

    # Simple heuristic: numeric/structured -> Pandas Agent
    if _is_structured_query(query):
        print(" Routing to Pandas Agent")
        try:
            # invoke() replaces the deprecated run(); the agent's reply is under "output".
            result = pandas_agent.invoke({"input": query})
            answer = result["output"] if isinstance(result, dict) else str(result)
        except Exception as e:
            answer = f"Error using Pandas Agent: {str(e)}"
    else:
        print("📚 Using retrieved context")
        context = "\n".join(state.get("context", []))
        prompt = f"Context:\n{context}\n\nQuestion: {query}\n\nAnswer briefly:"
        try:
            answer = llm.invoke(prompt).content
        except Exception as e:
            answer = f"Error using RAG: {str(e)}"

    state["answer"] = answer
    return state




In [None]:
def route_from_intent(state:GraphState):
    """Map the classifier's label in ``state["intent"]`` to the next node name.

    Returns ``"greet"``, ``"retrieve"`` or ``"ignore"``. Matching is by
    substring so extra text around the label is tolerated.
    """
    intent = state["intent"].lower()

    if "greeting" in intent:
        return "greet"
    # "irrelevant" contains the substring "relevant", so it has to be ruled out
    # explicitly; otherwise off-topic questions are sent to retrieval.
    if "relevant" in intent and "irrelevant" not in intent:
        return "retrieve"
    return "ignore"

In [None]:
# Assemble the workflow: classify intent, then greet, decline, or retrieve-and-answer.
graph = StateGraph(GraphState)

# Register each node under the name used by the router and the edges below.
for node_name, node_fn in (
    ("intent", intent_node),
    ("greet", greet_node),
    ("ignore", ignore_node),
    ("retrieve", retriever_node),
    ("answer", answer_node),
):
    graph.add_node(node_name, node_fn)

# Every run starts with intent classification, which picks the branch.
graph.add_edge(START, "intent")
graph.add_conditional_edges("intent", route_from_intent, ["greet", "retrieve", "ignore"])
graph.add_edge("retrieve", "answer")

# All three branches finish the run.
for terminal_node in ("greet", "ignore", "answer"):
    graph.add_edge(terminal_node, END)

app = graph.compile()


In [None]:
# Interactive console loop: read a question, run the graph, print the answer.
print("AI: RAG Assistant is ready! Type 'exit' or 'quit' to stop.\n")

while True:
    user_input = input("You: ")
    print(f"You :{user_input}")

    command = user_input.strip().lower()

    # Accept the exit keywords even with surrounding whitespace.
    if command in ["exit", "quit"]:
        print("Assistant: Goodbye! ")
        break

    # Skip blank lines instead of spending LLM calls on an empty question.
    if not command:
        continue

    try:
        response = app.invoke({"question": user_input})
        print("Assistant:", response.get("answer", "No response generated."))
    except Exception as e:
        # Keep the session alive on failures; just report the error.
        print("⚠ Error:", str(e))

In [None]:
from langgraph.graph import StateGraph, END
from typing import TypedDict,List
from langchain_community.document_loaders import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
import duckdb
import pandas as pd



# Path to the student CSV that backs both the DuckDB table and the vector index.
csv_path = "students.csv"



# Deterministic chat model (temperature=0) used by every node defined in this cell.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# connect() with no arguments; DuckDB documents this as an in-memory database.
con = duckdb.connect()

# DuckDB reads the CSV natively through read_csv_auto.
# NOTE: csv_path is interpolated straight into the SQL text, which is only safe
# while it stays a trusted constant.
con.execute(f"""
    CREATE OR REPLACE TABLE students AS
    SELECT * FROM read_csv_auto('{csv_path}', header=True)
""")

print(" DuckDB table 'students' loaded directly from CSV!")



# System prompt prepended to every question handled by duckdb_node.
# It describes the `students` table and restricts the model to structured
# filters. The original text contained mis-encoded quotes and dashes
# (mojibake), which were sent to the model verbatim; they are repaired here.
sql_system_prompt = """
You are a SQL expert helping to query a DuckDB table named `students`.

Table name: students
Columns and their meanings:
- StudentID: Unique ID of the student (integer)
- Name: Student's full name (text)
- Department: Department of study (e.g., Computer Science, Electrical, Mechanical)
- Grade: Academic grade (A+, A, A-, B+, etc.)
- GPA: Grade Point Average (numeric, e.g., 3.8)
- Feedback: Text description of the student's performance and expertise.

Sample data:
1 | Alice | Computer Science | A | 3.9 | Excellent in IoT and AI projects
2 | Bob | Electrical | B+ | 3.4 | Good in circuit design and teamwork
3 | Carol | Mechanical | A- | 3.7 | Great at robotics and embedded systems

----------------------------------
INSTRUCTIONS:
----------------------------------
1. Generate SQL queries **only** for numeric or structured filters.
   Examples:
   - GPA > 3.8
   - Grade = 'A'
   - Department = 'Computer Science' when asked to list the student's department

2. **Do NOT** generate or include text-based or descriptive filters
   such as expertise, feedback content, interests, or skills (e.g., "IoT", "AI", "leadership").
   Those are handled separately by another retriever system.

3. Use the correct table name `students` and column names exactly as given.

4. Never hallucinate new columns or tables.

5. Return only the SQL query — no markdown, explanations, or additional commentary.



Example valid queries:
- SELECT Name, GPA FROM students WHERE GPA > 3.8;
- SELECT * FROM students WHERE Department = 'Computer Science' AND Grade = 'A';
"""


# NOTE: this cell rebinds names that earlier cells already defined
# (loader, documents, splitter, texts, embeddings), so the earlier objects are
# replaced once it has run.
loader = CSVLoader(file_path=csv_path, encoding="utf-8")
documents = loader.load()

# Same chunking parameters as the earlier cell: 500 characters, 50 overlap.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
texts = splitter.split_documents(documents)

# This pipeline uses the smaller embedding model and a FAISS index built from the chunks.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vector_store = FAISS.from_documents(texts, embeddings)
# Retriever that returns the 3 closest chunks per query.
retriever = vector_store.as_retriever(search_kwargs={"k": 3})

class GraphState(TypedDict):
    """Dictionary shape passed between the nodes of the DuckDB/FAISS graph."""

    # Raw text typed by the user.
    question: str
    # Routing label produced by intent_node.
    intent: str
    # Declared for retrieved text chunks.
    context: List[str]
    # Final reply shown to the user.
    answer: str
   
   
# Labels the prompt asks the model to choose from.
_PROMPT_INTENTS = ("numeric", "semantic", "hybrid", "greet", "ignore")
# Labels the routing table accepts verbatim ("answer" is routed to the retriever).
_ROUTABLE_INTENTS = _PROMPT_INTENTS + ("answer",)


def _normalize_intent(raw_reply: str) -> str:
    """Reduce the model's reply to a routable label, defaulting to "ignore"."""
    cleaned = raw_reply.strip().lower()
    if cleaned in _ROUTABLE_INTENTS:
        return cleaned
    # Tolerate quotes, punctuation or a short sentence around the label,
    # e.g. '"Numeric."' or 'Intent: hybrid'.
    words = "".join(ch if ch.isalpha() else " " for ch in cleaned).split()
    for word in words:
        if word in _PROMPT_INTENTS:
            return word
    return "ignore"


def intent_node(state: GraphState):
    """Use the LLM to classify the question and store a routable intent label.

    Reads ``state["question"]`` and writes ``state["intent"]``. The stored
    value is always a label the routing table knows; a reply that cannot be
    mapped to one becomes ``"ignore"`` so routing cannot fail on an unknown key.

    Returns the same ``state`` dict.
    """
    query = state["question"]
    intent_prompt = f"""
    You are an intent classifier for user questions over a student dataset.
    The table contains columns: StudentID, Name, Department, Grade, GPA, Feedback.

    Classify the intent of the question as one of the following:

    1. "numeric" -> if it involves:
    - filters or comparisons on structured fields like Department, Grade, GPA, or StudentID
    - examples: "List students in Computer Science", "Show students with Grade A", "Who has GPA > 3.5"

    2. "semantic" -> if it involves open-ended descriptions, expertise, or meanings inside text fields like Feedback
    - examples: "Who is good at AI?", "Which student has leadership skills?"

    3. "hybrid" -> if it mixes both structured filters and descriptive parts
    - example: "List students in Computer Science who are good at AI"

    4. "greet" -> greetings like "Hi", "Hello"

    5. "ignore" -> if it's unrelated to student data

    Question: {query}
    Return only one word: numeric, semantic, hybrid, greet, or ignore.
    """
    intent = _normalize_intent(llm.invoke(intent_prompt).content)
    print(f" Detected Intent: {intent}")
    state["intent"] = intent
    return state


def greet_node(state: GraphState):
    """Answer a greeting with a fixed welcome message.

    Writes the message to ``state["answer"]`` and returns the same ``state`` dict.
    """
    # The waving-hand emoji was stored as mojibake in the original string.
    state["answer"] = "Hello! 👋 How can I assist you with the student data today?"
    return state


def ignore_node(state: GraphState):
    """Reply to an off-topic question with a fixed redirect message.

    Updates ``state`` in place and returns it.
    """
    state.update(
        answer="I'm here to answer questions about the student dataset. Could you ask something related to that?"
    )
    return state


# Statement openers accepted from the model; anything else is refused.
_READ_ONLY_SQL_PREFIXES = ("select", "with")


def _extract_read_only_sql(raw_sql: str) -> str:
    """Strip markdown noise from the model's reply and return one read-only statement.

    Raises:
        ValueError: if the reply is empty, holds more than one statement, or
            does not start with SELECT/WITH.
    """
    cleaned = (
        raw_sql.replace("```sql", "")
        .replace("```", "")
        .replace("`", "")
        .replace("SQL:", "")
        .strip()
    )
    statement = cleaned.rstrip(";").strip()

    if not statement:
        raise ValueError("the model returned an empty SQL query")
    # Conservative check: a ';' inside a string literal is rejected as well.
    if ";" in statement:
        raise ValueError("multiple SQL statements are not allowed")
    if not statement.lower().startswith(_READ_ONLY_SQL_PREFIXES):
        raise ValueError("only read-only SELECT queries are allowed")
    return statement


def duckdb_node(state: GraphState):
    """Answer a numeric/structured question by generating and running SQL.

    The LLM turns ``state["question"]`` into SQL for the ``students`` table,
    the query runs on the DuckDB connection ``con``, and the LLM rewrites the
    result table as a natural-language answer stored in ``state["answer"]``.

    The generated SQL comes from user-influenced model output, so only a single
    SELECT/WITH statement is executed. This is a basic guard, not a sandbox.
    Any failure, including a refused query, is reported in the answer text
    instead of being raised.

    Returns the same ``state`` dict.
    """
    query = state["question"]
    try:
        sql_prompt = f"{sql_system_prompt}\nUser question: {query}\nSQL:"
        raw_sql = llm.invoke(sql_prompt).content.strip()

        # Clean SQL and refuse anything that is not one read-only statement
        sql_query = _extract_read_only_sql(raw_sql)

        print(f" SQL query: {sql_query}")

        # Execute SQL on DuckDB
        result_df = con.execute(sql_query).fetchdf()

        if result_df.empty:
            state["answer"] = "No matching records found."
            return state

        # Convert all rows to text
        result_text = result_df.to_string(index=False)

        # LLM to summarize results naturally
        summary_prompt = f"""
        The user asked: {query}
        The SQL result is:
        {result_text}

        Write a clear and complete natural language response that lists all relevant names or details.
        Do not skip or summarize results.
        """
        state["answer"] = llm.invoke(summary_prompt).content.strip()

    except Exception as e:
        state["answer"] = f"Error executing SQL: {str(e)}"

    return state


def retriever_node(state: GraphState):
    """Answer a semantic/descriptive question from retrieved chunks.

    Fetches chunks for ``state["question"]`` from ``retriever``, joins the
    text of the first three into a context block and asks the LLM for a brief
    answer, stored in ``state["answer"]``. Failures are reported in the answer
    text instead of being raised.

    Returns the same ``state`` dict.
    """
    question = state["question"]
    try:
        retrieved_chunks = retriever.invoke(question)

        # The unused (question, chunk) pair list left over from the reranking
        # variant was dropped; no reranker is applied in this pipeline.
        top_docs = retrieved_chunks[:3]
        context = "\n".join(doc.page_content for doc in top_docs)
        prompt = f"Context:\n{context}\n\nQuestion: {question}\nAnswer briefly:"
        state["answer"] = llm.invoke(prompt).content
    except Exception as e:
        state["answer"] = f"Error using retriever: {str(e)}"
    return state



def _as_subquestion(value) -> str:
    """Coerce one field of the LLM's split JSON into a single sub-question string."""
    if value is None:
        return ""
    if isinstance(value, str):
        return value.strip()
    if isinstance(value, (list, tuple)):
        # The split prompt allows arrays of sub-questions; merge them into one.
        parts = [_as_subquestion(item) for item in value]
        return " and ".join(part for part in parts if part)
    return str(value).strip()


def hybrid_node(state: GraphState):
    """Handle a hybrid query by splitting it into numeric and semantic parts.

    Steps:
      1. Ask the LLM to split ``state["question"]`` into a numeric and a
         semantic sub-question, returned as JSON.
      2. Send the numeric part through ``duckdb_node``.
      3. Send the semantic part through ``retriever_node``.
      4. Ask the LLM to merge both partial answers into ``state["answer"]``.

    If the split cannot be parsed, or yields no sub-question, the whole
    question is treated as semantic so the final answer is still grounded in
    retrieved text. Any failure is reported in the answer text instead of
    being raised.

    Returns the same ``state`` dict.
    """
    question = state["question"]

    try:
        # Step 1: Ask LLM to split the hybrid query into numeric and semantic subparts
        split_prompt = f"""
        Split the user query into numeric and semantic parts.
        - Numeric parts are those answerable via SQL (count, average, filter, etc.).
        - Semantic parts are descriptive (skills, feedback, comments, etc.).
        - If there are multiple sub-questions, list them in an array.

        Return clean JSON like:
        {{
            "numeric": "subquestion for numeric logic",
            "semantic": "subquestion for semantic logic"
        }}

        Question: {question}
        """

        split_result = llm.invoke(split_prompt).content.strip()
        print(" Split result:", split_result)

        import json

        # Clean any code block wrappers before parsing
        split_result = split_result.replace("```json", "").replace("```", "").strip()
        try:
            parsed = json.loads(split_result)
        except json.JSONDecodeError:
            parsed = None

        if isinstance(parsed, dict):
            # Values may be strings, arrays or null; normalise each to one string.
            numeric_part = _as_subquestion(parsed.get("numeric"))
            semantic_part = _as_subquestion(parsed.get("semantic"))
        else:
            numeric_part = ""
            semantic_part = ""
            print(" LLM didn't return valid JSON — skipping split.")

        if not numeric_part and not semantic_part:
            # Without any sub-question the merge step would have nothing to
            # ground its answer on; fall back to plain retrieval.
            semantic_part = question

        # Step 2: Route the numeric part to duckdb_node
        numeric_answer = ""
        if numeric_part:
            print(f" Sending numeric part to duckdb_node: {numeric_part}")
            numeric_state = duckdb_node(
                {"question": numeric_part, "intent": "", "context": [], "answer": ""}
            )
            numeric_answer = numeric_state.get("answer", "")

        # Step 3: Route the semantic part to retriever_node
        semantic_answer = ""
        if semantic_part:
            print(f" Sending semantic part to retriever_node: {semantic_part}")
            semantic_state = retriever_node(
                {"question": semantic_part, "intent": "", "context": [], "answer": ""}
            )
            semantic_answer = semantic_state.get("answer", "")

        # Step 4: Merge both results using LLM
        combine_prompt = f"""
        The user originally asked: {question}

        Numeric insight:
        {numeric_answer}

        Semantic insight:
        {semantic_answer}

        Combine these into a single, clear and concise final answer.
        """
        final_answer = llm.invoke(combine_prompt).content.strip()
        print(" Final answer:", final_answer)

        state["answer"] = final_answer

    except Exception as e:
        state["answer"] = f"Error in hybrid node: {str(e)}"

    return state


# Assemble the workflow: classify the intent, then run exactly one handler node.
graph = StateGraph(GraphState)

graph.add_node("intent", intent_node)
graph.add_node("greet", greet_node)
graph.add_node("ignore", ignore_node)
graph.add_node("duckdb", duckdb_node)
graph.add_node("retriever", retriever_node)
graph.add_node("hybrid", hybrid_node)

graph.set_entry_point("intent")

# Maps the label stored in state["intent"] to the node that handles it.
_intent_routes = {
    "greet": "greet",
    "ignore": "ignore",
    "numeric": "duckdb",
    "semantic": "retriever",
    "hybrid": "hybrid",
    "answer": "retriever",
}


def _route_by_intent(state: GraphState) -> str:
    """Return the routing key for the state's intent, or "ignore" if it is unknown.

    The intent is free-form LLM output; a label missing from the routing table
    would otherwise abort the run instead of producing an answer.
    """
    intent = state.get("intent", "")
    return intent if intent in _intent_routes else "ignore"


graph.add_conditional_edges("intent", _route_by_intent, _intent_routes)

# Every handler ends the run.
graph.add_edge("greet", END)
graph.add_edge("ignore", END)
graph.add_edge("duckdb", END)
graph.add_edge("retriever", END)
graph.add_edge("hybrid", END)

app = graph.compile()

if __name__ == "__main__":
    # Interactive console loop: read a question, run the graph, print the answer.

    while True:
        user_input = input("\nYou: ")

        print(f"You :{user_input}")

        command = user_input.strip().lower()

        # Accept the exit keywords even with surrounding whitespace.
        if command in ["exit", "quit"]:
            print("Assistant: Goodbye! ")
            break

        # Skip blank lines instead of spending LLM calls on an empty question.
        if not command:
            continue

        try:
            result = app.invoke({"question": user_input})
            print(f"Assistant: {result.get('answer', 'No response generated.')}")
        except Exception as e:
            # Keep the session alive on failures; just report the error.
            print("⚠ Error:", str(e))


 DuckDB table 'students' loaded directly from CSV!
You :what is the feedback for Akshay rao and count the number of students?
 Detected Intent: hybrid
 Split result: {
    "numeric": "count the number of students",
    "semantic": "what is the feedback for Akshay Rao"
}
 Sending numeric part to duckdb_node: count the number of students
 SQL query: SELECT COUNT(*) FROM students;
 Sending semantic part to retriever_node: what is the feedback for Akshay Rao
 Final answer: The feedback for Akshay Rao indicates that he requires improvement in report presentation. Additionally, there are a total of 50 students.
Assistant: The feedback for Akshay Rao indicates that he requires improvement in report presentation. Additionally, there are a total of 50 students.
You :exit
Assistant: Goodbye! 
