In [16]:
from typing import TypedDict, List, Annotated, Union
from langgraph.graph.message import add_messages
from google import genai
import dotenv, os
from langgraph.graph import StateGraph, END, START
from model.db import chroma_client
from google.genai import types
import requests, json
from langchain_ollama import ChatOllama
from langchain_core.messages import HumanMessage, SystemMessage


dotenv.load_dotenv()

True

In [17]:
# Gemini client used by the classifier and schema-definer nodes. The API key
# is read from the environment (dotenv.load_dotenv() is called in the imports cell).
client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
MODEL = "gemini-2.0-flash"

# Local Ollama chat model.
# NOTE(review): `llm` is not referenced by any cell visible here — confirm it is still needed.
llm = ChatOllama(model="llama3.2", temperature=0.7)

# Base URL of the retrieval service (queried via POST <server_url>/searchData).
# Overridable through the SEARCH_SERVER_URL environment variable so the notebook
# is not tied to a single hard-coded host; the default keeps the original address.
server_url = os.getenv("SEARCH_SERVER_URL", "http://147.93.29.19:9876")

In [18]:
class SessionState(TypedDict):
    """State dictionary passed to (and returned by) the graph nodes in this notebook."""

    id: str ## unique identifier for the session as well as collection name
    session_name: str ## name of the session
    
    # Node results; annotated with the `add_messages` reducer. The nodes read
    # the latest entry through `.content`, so entries are presumably message
    # objects at runtime rather than plain dicts — TODO confirm.
    feedback: Annotated[List[dict], add_messages]
    # Conversation messages; same reducer annotation as `feedback`.
    messages: Annotated[List[dict], add_messages]
    
    # Routing flag: interface_agent checks it for "message", "ExtractData" and
    # "done"; user_message_classifier stores the classification label in it and
    # schema_definer sets it to "done".
    invoke: str

In [19]:
def interface_agent(state: SessionState):
    """Decide which node should run next, based on ``state["invoke"]``.

    This function is registered as a graph node and is also called from the
    conditional-edge router, so it can run more than once for the same step.
    To keep repeated calls free of side effects on the caller's data, the
    incoming ``state`` is never mutated: when ``invoke`` has to advance, the
    new value is placed on a shallow copy.

    Args:
        state: Current session state; only the ``invoke`` key is read.

    Returns:
        A dict with two keys:

        * ``next_nodes`` - single-element list with the name of the next node
          (``"user_message_classifier"``, ``"schema_definer"``) or ``END``.
        * ``state`` - for ``invoke == "message"`` / ``"ExtractData"`` a copy of
          ``state`` whose ``invoke`` is ``"classify"`` / ``"schema_definer"``;
          for every other value (including ``"done"``) the incoming ``state``
          object itself.
    """
    invoke = state.get("invoke")
    print(invoke)  # trace of the routing decisions (printed on every call)

    if invoke == "message":
        return {
            "next_nodes": ["user_message_classifier"],  # Specify the next node(s)
            "state": {**state, "invoke": "classify"},  # Updated copy of the state
        }

    if invoke == "ExtractData":
        return {
            "next_nodes": ["schema_definer"],  # Specify the next node(s)
            "state": {**state, "invoke": "schema_definer"},  # Updated copy of the state
        }

    # "done" and every unrecognised value both terminate the graph.
    return {
        "next_nodes": [END],
        "state": state,  # Return the state as is
    }

In [20]:
def user_message_classifier(state: SessionState):
    """Classify the latest user message with Gemini and record the outcome.

    The content of the last entry in ``state["messages"]`` is sent to the model
    together with the classifier system prompt. The model is expected to answer
    with a JSON object (optionally wrapped in a Markdown code fence) holding a
    ``classification`` plus the field that belongs to it.

    Args:
        state: Current session state; ``messages`` must be non-empty and its
            last entry must expose ``.content``.

    Returns:
        A copy of ``state`` with ``invoke`` set to the returned classification
        and ``feedback`` set to a one-element list holding an assistant message
        whose content is:

        * ``update_context`` for ``"UpdateContext"``,
        * ``reason`` for ``"SuggestOrCreateSchema"`` and ``"SummarizeContext"``,
        * ``schema`` for ``"ExtractData"``,
        * a fixed placeholder prompt for any other classification.

    Raises:
        json.JSONDecodeError: If the model reply is not valid JSON once the
            optional code fence has been removed.
        KeyError: If ``classification`` or the field required by it is missing.
    """
    system_prompt = '''Role: You are a classifier agent in an agentic AI system. Your task is to analyze user queries and determine the correct action the system should take. You must classify the intent of each user message into one of the following categories:

    Classification Categories:
    
    SummarizeContext
    The user is asking for a summary of the current context or conversation. Mention in pointswise and easily understandable format. The summary should be mentioned in reason field
    ➤ Examples:
    "Can you summarize our conversation so far?"
    "What have we discussed in this session?"
    "Summarize the key points from our chat."
    
    UpdateContext
    The user wants to store information in the agent's internal state for future reference.
    ➤ Examples:
    "The client's name is John."
    "We're organizing a conference on June 5th."
    "Add this to our team knowledge."

    SuggestOrCreateSchema
    The user is requesting help designing a schema or structured format to extract information.
    Design a prompt for the next agent to create a schema based on the user's request. mention it in the reason field.
    ➤ Examples:
    "Can you create a schema for extracting details from job applications?"
    "Suggest a structure to capture meeting notes."
    "What format should I use to store bug report info?"

    ExtractData
    The user provides a schema and wants you to apply it to extract structured data from text.
    design the shcema in the form of json string based on the user request
    ➤ Examples:
    "Here's a schema: {name, date, location}. Extract this from the paragraph below."
    "Use this format and pull details from the email."
    "extract data with fileds including title, name and age"
    "Apply this structure: {title, author, summary} to the following.
    
    ## Output Format:
Return your response as a JSON object with the following fields:

{
  "classification": "UpdateContext" | "SuggestOrCreateSchema" | "ExtractData" | "SummarizeContext",
  "reason": "Brief explanation to use it as a prompt for the next agent",
  "schema": "The schema to be used for the next agent(used in ExtractData)",
  "update_context": "The context to be updated in the agent's internal state(used in UpdateContext)",
}
'''

    query = state["messages"][-1].content

    response = client.models.generate_content(
        model=MODEL,
        config=types.GenerateContentConfig(system_instruction=system_prompt),
        contents=query
    )

    # The reply is usually wrapped as ```json ... ```, but a fixed-offset slice
    # breaks when the fence is missing, untagged, or followed by whitespace.
    # Remove the fence only when it is really there.
    raw = (response.text or "").strip()
    if raw.startswith("```"):
        raw = raw[3:]
        if raw[:4].lower() == "json":
            raw = raw[4:]
        raw = raw.rsplit("```", 1)[0]

    parsed = json.loads(raw)
    classification = parsed["classification"]

    # Which field of the reply becomes the feedback text for each classification.
    field_for = {
        "UpdateContext": "update_context",
        "SuggestOrCreateSchema": "reason",
        "SummarizeContext": "reason",
        "ExtractData": "schema",
    }
    if classification in field_for:
        result = parsed[field_for[classification]]
    else:
        # NOTE(review): placeholder kept from the original code for
        # unrecognised classifications — confirm this fallback is intended.
        result = "create a schema to extract event name and date"

    # The model may return a nested JSON value instead of a string (e.g. for
    # "schema"); serialise it so the message content is always text.
    if not isinstance(result, str):
        result = json.dumps(result)

    # Build the update without mutating the incoming state.
    return {
        **state,
        "invoke": classification,
        "feedback": [{"role": "assistant", "content": result}],
    }

In [None]:
def schema_definer(state: SessionState):
    """Ask Gemini to design a schema for the data retrieved for this session.

    Steps:
      1. Take the content of the last ``state["feedback"]`` entry as the query.
      2. POST it to ``<server_url>/searchData`` for collection ``state["id"]``
         (requesting 3 results) and use the ``text`` of the first result as
         context.
      3. Send ``{"context": ..., "user": ...}`` (stringified) to the model with
         the schema-design system prompt.
      4. Strip an optional Markdown code fence from the reply and flatten
         newlines to spaces.

    Args:
        state: Current session state; ``id`` and a non-empty ``feedback`` list
            whose last entry exposes ``.content`` are required.

    Returns:
        A copy of ``state`` with ``invoke`` set to ``"done"`` and ``feedback``
        set to a one-element list holding the schema text as an assistant
        message.

    Raises:
        requests.HTTPError: If the search service answers with an error status.
        requests.Timeout: If the search service does not answer in time.
        IndexError: If the search service returns no results.
    """
    system_prompt = '''Role:
    You are a Schema Design Agent operating within an agentic AI flow. Your primary responsibility is to analyze markdown-formatted data parsed from diverse file types (e.g., CSVs, PDFs, JSON, PowerPoint slides, and others), understand its structure and semantics, and produce a well-formed structured schema that accurately represents the data model. Focus on designing industry standard schemas
    The user field specifies the query and the context field specifies the processed data from which data needs to be extracted
    Objective:
    From the given markdown content, extract entities, fields, relationships, and data types. Then output a structured schema in a clean, standardized format (preferably JSON Schema, Prisma model, or any custom internal format as instructed).

    Instructions:
    Parse for structure:
    Understand the data's tabular, hierarchical, or semantic layout. Recognize headings, bullet lists, tables, and key-value formats in the markdown.
    
    Identify Entities and Fields:
    Identify all possible entities (tables/objects/classes).
    For each entity, detect relevant fields/attributes, along with their data types and descriptions (if implied).
    Include primary keys, foreign keys, or any obvious relationships.

    Infer Data Types:
    Infer primitive types (string, number, boolean, date, enum, object, array) from the content or examples.

    Preserve Naming Semantics:
    Use semantic, human-readable names where available. Normalize naming conventions (e.g., camelCase, snake_case) depending on the final output format.

    Output Format:
    Return the schema in a structured format (e.g., JSON Schema, Prisma schema, or custom format as specified in the query context).

    Handle Ambiguities Gracefully:
    When uncertain, make an informed guess and add comments or flags for human review.

    No assumptions from external context:
    Only rely on what's present in the provided markdown. Don't hallucinate fields or metadata not evident in the data.

    Example Inputs:
    Tables from parsed CSVs embedded in markdown

    Bullet points representing properties of a system (from PDFs)

    Nested key-value blocks (from JSON)

    Slide notes formatted as markdown (from PowerPoint)

    Example Output:
    {
      "entities": [
        {
          "name": "User",
          "fields": [
            {"name": "id", "type": "string", "description": "Unique identifier"},
            {"name": "email", "type": "string"},
            {"name": "signupDate", "type": "date"}
          ]
        },
        {
          "name": "Order",
          "fields": [
            {"name": "orderId", "type": "string"},
            {"name": "userId", "type": "string", "relation": "User.id"},
            {"name": "amount", "type": "number"}
          ]
        }
      ]
    }'''

    query = state["feedback"][-1].content

    # Without a timeout a stalled server would hang the whole graph run;
    # raise_for_status surfaces HTTP errors instead of failing later on .json().
    search = requests.post(
        server_url + "/searchData",
        json={
            "collection_id": state["id"],
            "text": query,
            "n_results": 3,
        },
        timeout=30,
    )
    search.raise_for_status()

    hits = search.json().get("results") or []
    if not hits:
        raise IndexError(
            f"searchData returned no results for collection {state['id']!r}"
        )

    # Field names match the ones the system prompt describes.
    message = {
        "context": hits[0]["text"],
        "user": query,
    }

    reply = client.models.generate_content(
        model=MODEL,
        config=types.GenerateContentConfig(system_instruction=system_prompt),
        contents=str(message)
    )

    # Remove a ```json ... ``` wrapper only when present; a fixed-offset slice
    # corrupts replies that are unfenced or carry trailing whitespace.
    result = (reply.text or "").strip()
    if result.startswith("```"):
        result = result[3:]
        if result[:4].lower() == "json":
            result = result[4:]
        result = result.rsplit("```", 1)[0]
    result = result.strip().replace("\n", ' ')

    # Build the update without mutating the incoming state.
    return {
        **state,
        "invoke": "done",
        "feedback": [{"role": "assistant", "content": result}],
    }

In [22]:
# Graph builder whose state schema is SessionState; nodes and edges are added in the next cell.
graph_builder = StateGraph(SessionState)

In [23]:
# Register each node under the same name the routing table refers to.
for node_name, node_fn in (
    ("interface_agent", interface_agent),
    ("user_message_classifier", user_message_classifier),
    ("schema_definer", schema_definer),
):
    graph_builder.add_node(node_name, node_fn)

# Every run starts at the interface agent.
graph_builder.set_entry_point("interface_agent")

# After interface_agent, the router re-invokes it to read `next_nodes`; each
# returned name maps to the node of the same name (END maps to END).
graph_builder.add_conditional_edges(
    "interface_agent",
    lambda state: interface_agent(state)["next_nodes"],  # Extract next_nodes
    {
        target: target
        for target in ("user_message_classifier", "schema_definer", END)
    },
)

# Both worker nodes hand control back to the interface agent.
graph_builder.add_edge("user_message_classifier", "interface_agent")
graph_builder.add_edge("schema_definer", "interface_agent")

<langgraph.graph.state.StateGraph at 0x7323061d1ed0>

In [24]:
# Compile the wired builder into the graph object that is invoked below.
graph = graph_builder.compile()

In [25]:
# Example initial state: one user message and invoke="message", the value
# interface_agent routes to user_message_classifier.
sample_input = {
    "id": "33dc47d9b2ed42f4b769c3d225ea2d4c",
    "session_name": "Test Session",
    "feedback": [],
    "messages": [
        # Alternative test prompts, currently disabled:
        # {"role": "user", "content": "suggest me a schema to extract information from the given data"},
        # {'role': 'user', 'content': 'Can you add a new event with name "Annual Meeting", date "2023-10-15", and location "New York"?'},
                {"role": "user", "content": "extract the event details from the given data, including the event name, date, and location."},
    ],
    "invoke": "message"
}

In [26]:
# Run the graph on the sample state; interface_agent prints the `invoke` value on every call.
output = graph.invoke(sample_input)

message
message
ExtractData
ExtractData
done
done


In [30]:
# Content of the last feedback entry in the final state.
schema = output["feedback"][-1].content

In [34]:
# Parse the schema text as JSON (newlines replaced by spaces first) and display the result.
json.loads(schema.replace("\n", ' '))

{'entities': [{'name': 'EventRegistration',
   'description': 'Represents the registration and participation data for various events.',
   'fields': [{'name': 'eventName',
     'type': 'string',
     'description': 'Name of the event',
     'primaryKey': True},
    {'name': 'teamReg',
     'type': 'integer',
     'description': 'Number of team registrations'},
    {'name': 'teamPart',
     'type': 'integer',
     'description': 'Number of team participants'},
    {'name': 'externalReg',
     'type': 'integer',
     'description': 'Number of external registrations'},
    {'name': 'externalPart',
     'type': 'integer',
     'description': 'Number of external participants'},
    {'name': 'firstReg',
     'type': 'integer',
     'description': 'Number of registrations in the first round'},
    {'name': 'firstPart',
     'type': 'integer',
     'description': 'Number of participants in the first round'},
    {'name': 'secondReg',
     'type': 'integer',
     'description': 'Number of regis