In [1]:
import glob
import os
import pandas as pd

# Define the directory path
directory_path = r"C:\Users\Shaur\OneDrive\Desktop\docs\college\faculty_data"
# Find all CSV files in the directory
csv_files = glob.glob(os.path.join(directory_path, "*.csv"))

# List to hold all DataFrames from all CSVs
all_dataframes = []

# Loop through each CSV file and load it
for csv_file in csv_files:
    print(f"Loading file: {csv_file}")
    df = pd.read_csv(csv_file)
    all_dataframes.append(df)

# Summary
print("\n--- Summary ---")
print(f"Number of CSV files loaded: {len(all_dataframes)}")

if all_dataframes:
    print("Columns in first CSV:", list(all_dataframes[0].columns))
    print("\nFirst few rows of first CSV:\n", all_dataframes[0].head())


Loading file: C:\Users\Shaur\OneDrive\Desktop\docs\college\faculty_data\iiit-bhubneswar.csv
Loading file: C:\Users\Shaur\OneDrive\Desktop\docs\college\faculty_data\iiitbanglore.csv
Loading file: C:\Users\Shaur\OneDrive\Desktop\docs\college\faculty_data\iiitnr_faculty.csv
Loading file: C:\Users\Shaur\OneDrive\Desktop\docs\college\faculty_data\iitb.csv
Loading file: C:\Users\Shaur\OneDrive\Desktop\docs\college\faculty_data\iiti.csv
Loading file: C:\Users\Shaur\OneDrive\Desktop\docs\college\faculty_data\iitj.csv
Loading file: C:\Users\Shaur\OneDrive\Desktop\docs\college\faculty_data\iitm.csv

--- Summary ---
Number of CSV files loaded: 7
Columns in first CSV: ['data-page-selector', 'professor', 'position', 'email', 'department', 'contact', 'Research_Interests_0', 'Education_-_PhD_0', 'Education_-_MTech_0', 'Education_-_BTech_0']

First few rows of first CSV:
                                   data-page-selector  \
0  https://www.iiit-bh.ac.in/faculty/satyajit-pan...   
1  https://www.iiit

In [2]:
from langchain_core.documents import Document
import re, os
import pandas as pd

folder_path = r"C:\Users\Shaur\OneDrive\Desktop\docs\college\faculty_data"
documents = []

for file in os.listdir(folder_path):
    if file.endswith(".csv"):
        csv_path = os.path.join(folder_path, file)
        df = pd.read_csv(csv_path)
        first_col_name = df.columns[0]

        for idx, row in df.iterrows():
            # 1. Extract the first column's value for Metadat
            first_col_value = row[first_col_name]
            other_cols = df.columns[1:]
            row_text = ", ".join([f"{col}: {row[col]}" for col in other_cols])
            row_text = re.sub(r"\s+", " ", row_text).strip()

            doc = Document(
                page_content=row_text,
                metadata={
                    "row_index": idx,
                    "source": file,
                    first_col_name: first_col_value  # Specifically storing the first column here
                }
            )
            documents.append(doc)

print(f"Total documents created: {len(documents)}")

Total documents created: 315


In [3]:
# 1. Calculate lengths
char_lengths = [len(doc.page_content) for doc in documents]
word_lengths = [len(doc.page_content.split()) for doc in documents]

# 2. Compute Averages
if documents:
    avg_char = sum(char_lengths) / len(documents)
    avg_word = sum(word_lengths) / len(documents)

    print(f"Total Chunks: {len(documents)}")
    print(f"Average Character Length: {avg_char:.2f}")
    print(f"Average Word Count: {avg_word:.2f}")
    print(f"Max Word Count: {max(word_lengths)}")
    print(f"Min Word Count: {min(word_lengths)}")
else:
    print("No documents found.")

Total Chunks: 315
Average Character Length: 545.65
Average Word Count: 68.37
Max Word Count: 380
Min Word Count: 5


In [4]:
a = documents[3].metadata
b = documents[3].page_content
print(len(b))
print(a)
print(b)





504
{'row_index': 3, 'source': 'iiit-bhubneswar.csv', 'data-page-selector': 'https://www.iiit-bh.ac.in/faculty/dr-santisudha-panigrahi/'}
professor: Dr. Santisudha Panigrahi, position: Assistant Professor, email: Email Id: santisudha@iiit-bh.ac.in, department: Department of Computer Science, contact: Contact no.: 7387094590, Research_Interests_0: Dr. Santisudha's Teaching interests are AI/ML,Python, Java, C++ Programming Languages, Software Engineering, and Research interests include AI/ML, Bioinformatics, Image Processing, Computer Vision, Education_-_PhD_0: PhD in Computer Science, Education_-_MTech_0: Mtech, Education_-_BTech_0: BE


In [3]:
import uuid
import os
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_classic.retrievers import MultiVectorRetriever
from langchain_core.documents import Document
from langchain_core.stores import InMemoryStore
from langchain_community.vectorstores import Chroma

# ================= PATH =================
CHROMA_PATH = r"D:\AIAGENTS\chromadb_"
os.makedirs(CHROMA_PATH, exist_ok=True)

# 1. Initialize the Embedding Model
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)

# 2. Setup the Storage Layers (PERSISTENT)
vectorstore = Chroma(
    collection_name="faculty_data",
    embedding_function=embeddings,
    persist_directory=CHROMA_PATH
)

# Docstore holds the full parent documents (still in-memory)
store = InMemoryStore()
id_key = "doc_id"

# 3. Initialize the Retriever
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)

# ðŸ”¹ IMPORTANT: fetch more child chunks to ensure 10 parents
retriever.search_kwargs = {"k": 50}

def index_data(retriever, documents):
    print(f"ðŸš€ Processing {len(documents)} records into Multi-Vector store...")

    for parent_doc in documents:
        parent_id = str(uuid.uuid4())

        column_pairs = parent_doc.page_content.split(", ")

        child_docs = []
        for pair in column_pairs:
            child_docs.append(
                Document(
                    page_content=pair,
                    metadata={id_key: parent_id}
                )
            )

        # Store full parent document
        retriever.docstore.mset([(parent_id, parent_doc)])

        # Store child embeddings
        retriever.vectorstore.add_documents(child_docs)

    # Persist embeddings
    retriever.vectorstore.persist()

    print("âœ… Indexing Complete and embeddings persisted to disk.")

# Run indexing
index_data(retriever, documents)

# ================= RETRIEVAL =================

def retrieve_top_10(retriever, query: str):
    """
    Returns top 10 FULL parent documents using MultiVectorRetriever
    """
    docs = retriever.invoke(query)   # NEW API
    return docs[:10]



  from .autonotebook import tqdm as notebook_tqdm
  vectorstore = Chroma(


ðŸš€ Processing 315 records into Multi-Vector store...
âœ… Indexing Complete and embeddings persisted to disk.


  retriever.vectorstore.persist()


In [4]:
from flashrank import Ranker, RerankRequest

# Initialize FlashRank model
reranker = Ranker(model_name="ms-marco-MiniLM-L-12-v2")

def retrieve_top_5_with_flashrank(retriever, query: str):
    """
    Uses MultiVectorRetriever + FlashRank
    Returns top 5 reranked parent documents
    """

    # Step 1: Get parent docs from MultiVectorRetriever
    parent_docs = retriever.invoke(query)

    # Safety: if fewer docs
    if len(parent_docs) <= 5:
        return parent_docs

    # Step 2: Prepare rerank request
    passages = [
        {"id": i, "text": doc.page_content}
        for i, doc in enumerate(parent_docs)
    ]

    rerank_request = RerankRequest(
        query=query,
        passages=passages
    )

    # Step 3: Rerank
    reranked = reranker.rerank(rerank_request)

    # Step 4: Select top 5 documents
    top_5_docs = []
    for item in reranked[:5]:
        top_5_docs.append(parent_docs[item["id"]])

    return top_5_docs





In [5]:
query = "faculty in iiitnr"

top_5_docs = retrieve_top_5_with_flashrank(retriever, query)

for i, doc in enumerate(top_5_docs, 1):
    print(doc.page_content)

designation: nan, email: aruna@iiitnr.edu.in, profile_link: https://www.iiitnr.ac.in/node/2481, raw_text_snippet: Faculty (Arranged Alphabetically) Abhishek Sharma ( ASSISTANT PROFESSOR, ECE ) Qualification : Ph. D. (IIT Guwahati) Department : ECE Email Id : abhishek[at]iiitnr.edu.in Contact No : 91-771-2474031 Research : Bio-metrics, Online Signature Verification, Write identification, Image Processing, Patter
designation: nan, email: aruna@iiitnr.edu.in, profile_link: https://www.iiitnr.ac.in/node/2481, raw_text_snippet: Abhishek Sharma ( ASSISTANT PROFESSOR, ECE ) Qualification : Ph. D. (IIT Guwahati) Department : ECE Email Id : abhishek[at]iiitnr.edu.in Contact No : 91-771-2474031 Research : Bio-metrics, Online Signature Verification, Write identification, Image Processing, Pattern Recognition and Machine Learning
designation: nan, email: vinay@iiitnr.edu.in, profile_link: https://www.iiitnr.ac.in/node/3301, raw_text_snippet: Vinay Kumar ( Assistant Professor, CSE ) Qualification :

In [20]:
from typing import List
from langchain.tools import tool
from langchain_core.documents import Document
from flashrank import Ranker, RerankRequest

# Initialize FlashRank once
reranker = Ranker(model_name="ms-marco-MiniLM-L-12-v2")

@tool("flashrank")
def flashrank(query: str) -> List[Document]:
    """
    Retrieve documents using MultiVectorRetriever
    and rerank them using FlashRank.
    Returns top 5 documents.
    """

    # Step 1: Retrieve parent documents
    parent_docs = retriever.invoke(query)

    if not parent_docs:
        return []

    if len(parent_docs) <= 5:
        return parent_docs

    # Step 2: Prepare passages
    passages = [
        {"id": i, "text": doc.page_content}
        for i, doc in enumerate(parent_docs)
    ]

    rerank_request = RerankRequest(
        query=query,
        passages=passages
    )

    # Step 3: Rerank
    reranked = reranker.rerank(rerank_request)

    for rank, item in enumerate(reranked[:5]):
        doc = parent_docs[item["id"]]
        doc.metadata["rerank_rank"] = rank + 1
        doc.metadata["source"] = "vectorstore"
        top_5_docs.append(doc)


    return top_5_docs


In [10]:
from tavily import TavilyClient
from langchain_core.tools import tool

# Initialize Tavily client
tavily_client = TavilyClient(
    api_key="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"
)
def tavily_search(query: str) -> str:
    """
    Search the web using Tavily and return summarized results.
    """
    response = tavily_client.search(
        query=query,
        search_depth="advanced",
        max_results=5
    )

    # Convert to readable text for LLMs
    results = []
    for r in response.get("results", []):
        results.append(
            f"Title: {r.get('title')}\n"
            f"Content: {r.get('content')}\n"
            f"URL: {r.get('url')}\n"
        )

    return "\n---\n".join(results)


In [28]:
from langchain_community.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper

# Initialize Wikipedia tool (once, globally)
wiki = WikipediaQueryRun(
    api_wrapper=WikipediaAPIWrapper(
        top_k_results=2,
        doc_content_chars_max=1000
    )
)

def wikipedia_node(state):
    """
    LangGraph node for Wikipedia lookup.
    Uses Wikipedia transiently and does NOT store results in state.
    """
    print("---WIKIPEDIA LOOKUP---")

    # Get latest query
    query = state["messages"][-1].content

    # Call Wikipedia
    result = wiki.invoke(query)

    if not result:
        return {}

    # Inject temporary context into messages ONLY
    return {
        "messages": [
            HumanMessage(
                content=f"""
Wikipedia context (use only if relevant):

{result}

User question:
{query}
"""
            )
        ]
    }


In [11]:
from langgraph.graph import END, StateGraph
from typing import TypedDict, List, Dict
from langchain_core.messages import HumanMessage
from langchain_groq import ChatGroq
from langchain_core.messages import BaseMessage
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage
import os


In [None]:
class ResearchAgentState(TypedDict):
    messages: List[BaseMessage]
    intent: str
    memory: List[Dict[str, str]]
    retrieved_docs: List[Document]
    relevance_score: float
    iteration_count: int


In [16]:
def query_analyzer(state):
    """
    Analyze the user query and normalize it using Gemini.
    """
    print("---ANALYZE QUERY---")
    question = state["messages"][0].content

    msg = [
        HumanMessage(content=f"""
Analyze the following query and clean it if needed.

{question}

Return a concise, clear research-oriented question.
""")
    ]


    model = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        google_api_key=os.getenv("GEMINI"),
        temperature=0
)

    

    response = model.invoke(msg)

    return {
        "messages": [response],
        "iteration_count": 0
    }


In [17]:
def memory_check(state):
    """
    Check conversation memory and retrieve relevant past information
    for follow-up queries.
    """
    print("---CHECK MEMORY---")

    question = state["messages"][0].content
    memory = state.get("memory", [])

    # No memory â†’ no update
    if not memory:
        return {}

    msg = [
        HumanMessage(
            content=f"""
You are given a follow-up question and a list of past interactions.

Follow-up question:
{question}

Conversation memory:
{memory}

From the memory, extract ONLY the information that is directly
relevant to answering the follow-up question.

If nothing is relevant, return: NONE
Otherwise, return the relevant information concisely.
"""
        )
    ]

    model = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        google_api_key=os.getenv("GEMINI"),
        temperature=0
    )

    response = model.invoke(msg)

    if response.content.strip().upper() == "NONE":
        return {}

    # Return PARTIAL update only
    return {
        "messages": [
            HumanMessage(
                content=f"""
Context from previous interactions:
{response.content}

Current question:
{question}
"""
            )
        ]
    }


In [35]:
def intent_classifier(state):
    """
    Classify the intent of the query using Gemini.
    """
    print("---CLASSIFY INTENT---")

    question = state["messages"][0].content

    msg = [
        HumanMessage(
            content=f"""
Classify the intent of the following query as one of:
- new_search
- refinement
- follow_up

Query:
{question}

Return ONLY one label.
"""
        )
    ]

    model = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        google_api_key=os.getenv("GEMINI"),
        temperature=0.3
    )

    response = model.invoke(msg)

    intent = response.content.strip().lower()

    # Guardrail: enforce valid labels
    if intent not in {"new_search", "refinement", "follow_up"}:
        intent = "new_search"

    # Return PARTIAL state update (do NOT return state)
    return {
        "intent": intent
    }


In [37]:
def intent_router(state):
    """
    Route execution based on classified intent.
    """
    intent = state.get("intent", "new_search")

    if intent == "new_search":
        return "new_search"
    elif intent == "refinement":
        return "refinement"
    else:
        return "follow_up"


In [30]:
def retrieve_with_flashrank(state):
    """
    LangGraph node: calls FlashRank tool and updates state
    """
    print("---RETRIEVE WITH FLASHRANK---")

    query = state["messages"][0].content

    docs = flashrank.invoke(query)

    if not docs:
        return {}

    return {
        "retrieved_docs": docs,
        "iteration_count": state["iteration_count"] + 1
    }


In [21]:
from langchain.tools import tool
from langchain_core.messages import HumanMessage
from langchain_groq import ChatGroq

@tool("semantic_scholar_search")
def semantic_scholar_search(query: str) -> str:
    """
    Fetch important academic papers related to a query.
    This tool is stateless and does NOT update agent state.
    """

    print("---SEMANTIC SCHOLAR TOOL---")

    msg = [
        HumanMessage(content=f"""
List important academic papers related to:

{query}

Return a concise list.
""")
    ]

    model = ChatGroq(model="qwen-qwq-32b")
    response = model.invoke(msg)

    return response.content


In [36]:
import os
from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI

def check_relevance(state):
    """
    Judge relevance of retrieved information.
    """
    print("---CHECK RELEVANCE---")

    results = state.get("retrieved_docs", [])

    msg = [
        HumanMessage(
            content=f"""
Evaluate the relevance (0 to 1) of the following retrieved information:

{results}

Return ONLY a floating-point number between 0 and 1.
"""
        )
    ]

    model = ChatOpenAI(
        model="deepseek-chat",
        api_key=os.getenv("DEEPSEEK_API_KEY"),
        base_url="https://api.deepseek.com",
        temperature=0
    )

    response = model.invoke(msg)

    try:
        score = float(response.content.strip())
    except ValueError:
        score = 0.0  # safety fallback

    return {"relevance_score": score}


In [38]:
def relevance_router(state):
    """
    Decide whether to refine or generate response.
    Max refinement iterations = 2.
    """
    score = state.get("relevance_score", 0.0)
    iteration = state.get("iteration_count", 0)

    if score >= 0.6:
        return "generate"

    if iteration >= 2:
        return "generate"

    return "refine"


In [23]:
import os
from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI

def refine_query(state):
    """
    Rewrite the query if relevance is low.
    """
    print("---REFINE QUERY---")

    question = state["messages"][0].content

    msg = [
        HumanMessage(
            content=f"""
Rewrite the following query to be more specific for faculty research matching.
Do NOT add explanations.
Return ONLY the rewritten query.

Query:
{question}
"""
        )
    ]

    model = ChatOpenAI(
        model="deepseek-chat",
        api_key=os.getenv("DEEPSEEK_API_KEY"),
        base_url="https://api.deepseek.com",
        temperature=0.5
    )

    response = model.invoke(msg)

    refined_query = response.content.strip()

    # Return PARTIAL update (no state mutation)
    return {
        "messages": [HumanMessage(content=refined_query)],
        "iteration_count": state["iteration_count"] + 1
    }


In [24]:
def generate_response(state):
    """
    Generate final answer for the user.
    """
    print("---GENERATE RESPONSE---")
    results = state.get("search_results", {})

    msg = [
        HumanMessage(content=f"""
Based on the following research data, generate a concise faculty recommendation:

{results}
""")
    ]

    model = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        google_api_key=os.getenv("GEMINI"),
        temperature=0.3
    )
    response = model.invoke(msg)
    return {"messages": [response]}


In [33]:
def update_memory(state):
    """
    Store the latest interaction in memory.
    """
    print("---UPDATE MEMORY---")

    messages = state["messages"]
    memory = state.get("memory", [])

    # Extract last user question and assistant answer
    user_msg = None
    assistant_msg = None

    for msg in reversed(messages):
        if isinstance(msg, AIMessage) and assistant_msg is None:
            assistant_msg = msg.content
        elif isinstance(msg, HumanMessage) and user_msg is None:
            user_msg = msg.content
        if user_msg and assistant_msg:
            break

    # If incomplete, do nothing
    if not user_msg or not assistant_msg:
        return {}

    return {
        "memory": memory + [
            {
                "question": user_msg,
                "answer": assistant_msg
            }
        ]
    }


In [39]:
from langgraph.graph import StateGraph, END

graph = StateGraph(ResearchAgentState)

# ===== Nodes =====
graph.add_node("query_analyzer", query_analyzer)
graph.add_node("memory_check", memory_check)
graph.add_node("intent_classifier", intent_classifier)

graph.add_node("retrieve_with_flashrank", retrieve_with_flashrank)

# External knowledge (non-persistent)
graph.add_node("wikipedia", wikipedia_node)

graph.add_node("check_relevance", check_relevance)
graph.add_node("refine_query", refine_query)

graph.add_node("generate_response", generate_response)
graph.add_node("update_memory", update_memory)

# ===== Edges =====

# START â†’ Analyzer â†’ Memory â†’ Intent
graph.set_entry_point("query_analyzer")
graph.add_edge("query_analyzer", "memory_check")
graph.add_edge("memory_check", "intent_classifier")

# Intent branching
graph.add_conditional_edges(
    "intent_classifier",
    intent_router,
    {
        "new_search": "retrieve_with_flashrank",
        "refinement": "retrieve_with_flashrank",
        "follow_up": "retrieve_with_flashrank",
    }
)

# Retrieval â†’ External context â†’ Relevance
graph.add_edge("retrieve_with_flashrank", "wikipedia")
graph.add_edge("wikipedia", "check_relevance")

# Relevance decision
graph.add_conditional_edges(
    "check_relevance",
    relevance_router,
    {
        "generate": "generate_response",
        "refine": "refine_query",
    }
)

# Refinement loop (max 2 enforced)
graph.add_edge("refine_query", "retrieve_with_flashrank")

# Finalization
graph.add_edge("generate_response", "update_memory")
graph.add_edge("update_memory", END)

app = graph.compile()


In [40]:
from langchain_core.messages import HumanMessage

initial_state = {
    "messages": [HumanMessage(content="faculty working on computer vision at IIIT Naya Raipur")],
    "intent": "",
    "memory": [],
    "retrieved_docs": [],
    "relevance_score": 0.0,
    "iteration_count": 0
}


In [41]:
final_state = app.invoke(initial_state)


---ANALYZE QUERY---




KeyboardInterrupt: 

In [None]:
from langchain_core.messages import AIMessage

for msg in reversed(final_state["messages"]):
    if isinstance(msg, AIMessage):
        print(msg.content)
        break
