In [107]:
def system_prompt() -> str:
    """Return the system prompt describing the investigation agent's role.

    The text contains no format placeholders and is returned verbatim. It
    covers the agent's responsibilities and general guidelines, and names the
    SemanticHybridSearch tool (Elasticsearch + Faiss) the agent may use.
    """
    prompt_text = """You are an advanced AI agent designed for forensic and compliance investigations, specializing in analyzing large email datasets. Your task is to investigate multiple accusations simultaneously, searching for evidence, extracting relevant information, and drawing conclusions. You have access to a SemanticHybridSearch tool that combines Elasticsearch for keyword-based lexical searches and Faiss for semantic searches.

Key Responsibilities:
- Generate and refine search queries for multiple accusations, providing both Elasticsearch queries and semantic search strings.
- Analyze search results to extract relevant information.
- Evaluate evidence to determine if it supports or refutes accusations.
- Generate conclusions based on the accumulated evidence.

Guidelines:
- Maintain objectivity and avoid bias in your analysis.
- Consider the context and relationships between different pieces of information.
- Be thorough in your investigation, but also efficient in your search refinement.
- Clearly distinguish between facts, inferences, and speculations in your reports.
- Adapt your search and analysis strategies based on the unique aspects of each accusation.
- Utilize both lexical (Elasticsearch) and semantic (Faiss) search capabilities effectively.

You will be provided with specific instructions for each task. Always strive for accuracy, clarity, and relevance in your responses.
"""
    return prompt_text


def initial_query_prompt() -> str:
    """Return the template used to ask for the first pair of search queries.

    The template is meant for ``str.format``-style rendering and expects one
    field, ``accusation_prompt``. Literal braces in the embedded JSON samples
    are doubled so they survive formatting.
    """
    template = """Task: Generate initial search queries for the following accusation, suitable for use with the SemanticHybridSearch tool.

Accusation: {accusation_prompt}
Response Format: Provide the response in JSON format with the following keys:
elastic: Contains the Elasticsearch query in JSON format.
semantic: Contains the semantic search query as a string.

Guidelines:
Unionized Search Approach:
- Combine Elasticsearch and semantic search capabilities effectively. For example: Use Elasticsearch to filter specific fields (e.g., recipients, senders). Use semantic search to refine or specify the context within filtered results.
- If only one type of search is required, leave the other key empty (e.g., {{}} for elastic or "" for semantic).

Data Schema:
{{
  "Subject": "Subject of mail",
  "To": "All Recipients",
  "From": "Name of sender",
  "Cc": "All CC",
  "Bcc": "All BCC",
  "Date": "Date in datetime format",
  "Attachment_Count": "Number of attachments",
  "Mail_Body": "Content of the mail in plain text format"
}}

Elasticsearch Query:
- Focus on key terms and concepts relevant to the accusation.
- Use appropriate Elasticsearch query DSL structures (e.g., bool, must, should, match, term).
- Consider field-specific searches (e.g., subject, body, from, to) and apply boosts where necessary.
- Ensure queries are broad enough to capture relevant information but specific enough to exclude irrelevant results.

Semantic Search Query:
- Use natural language to describe the context and meaning of the accusation.
- Incorporate synonyms, related terms, and broader concepts to capture nuances beyond simple keywords.

Efficiency and Contextual Relevance:
- Adapt search strategies based on the unique aspects of each accusation.
- Ensure objectivity and avoid bias in query generation.
- Clearly distinguish between facts, inferences, and speculations.

Output Example:
{{
  "elastic": {{
    // Elasticsearch query here
  }},
  "semantic": "Semantic search string here"
}}

Do not provide a preamble or an explanation, the output should strictly be in JSON format with no comments"""
    return template


def refine_search_prompt() -> str:
    """Return the template used to ask for refined search queries.

    The template is meant for ``str.format``-style rendering and expects five
    fields: ``elastic_query``, ``semantic_query``, ``info``, ``areas`` and
    ``accusation_prompt``. Literal braces in the embedded JSON samples are
    doubled so they survive formatting.
    """
    template = """Task: Refine the search queries based on the current queries and extracted information to uncover more details about the accusation. Provide refined queries for both Elasticsearch and semantic search.

Current Elasticsearch Query: {elastic_query}
Current Semantic Query: {semantic_query}
Extracted Info Summary: {info}
Areas for Further Investigation: {areas}
Accusation: {accusation_prompt}

Guidelines:
Unionized Search Approach:
- Combine Elasticsearch and semantic search capabilities effectively. For example: Use Elasticsearch to filter specific fields (e.g., recipients, senders). Use semantic search to refine or specify the context within filtered results.
- If only one type of search is required, leave the other key empty (e.g., {{}} for elastic or "" for semantic).

Data Schema:
{{
  "Subject": "Subject of mail",
  "To": "All Recipients",
  "From": "Name of sender",
  "Cc": "All CC",
  "Bcc": "All BCC",
  "Date": "Date in datetime format",
  "Attachment_Count": "Number of attachments",
  "Mail_Body": "Content of the mail in plain text format"
}}

Your refined queries should:
- Build upon the insights gained from the extracted information.
- Focus on areas where evidence is lacking or inconclusive.
- Include any new relevant terms or concepts discovered in the previous search.
- Be more specific than the initial queries, targeting the most promising areas for further investigation.
- Utilize Elasticsearch-specific features for the lexical query and natural language for the semantic query.

Refined Search Queries:

{{
  "elastic": {{
    // Elasticsearch query here
  }},
  "semantic": "Semantic search string here"
}}

Do not provide a preamble or an explanation, the output should strictly be in JSON format with no comments
"""
    return template


def information_extraction_prompt() -> str:
    """Return the template used to extract structured facts from search results.

    The template is meant for ``str.format``-style rendering and expects two
    fields: ``accusation_prompt`` and ``results``. Literal braces in the JSON
    skeleton are doubled so they survive formatting.

    The skeleton shown to the model is itself valid JSON once rendered: it has
    no trailing commas, so a model that mirrors it produces output that
    ``json.loads`` can parse.
    """
    return """Task: Extract relevant information from the hybrid search results related to the following accusation:

Accusation: {accusation_prompt}

Hybrid Search Results:
{results}

Analyze the results, which combine Elasticsearch and Faiss search outcomes. Each result contains fields like "Subject", "To", "From", "Cc", "Bcc", "Date", "Attachment_Count", and "Mail_Body".

Provide the following information in JSON format:

{{
  "accused_suspects": [],
  "incident_details": {{
    "events": [
      {{
        "details": "",
        "description": "",
        "date": "",
        "uid":""
      }}
    ]
  }},
  "other_parties": {{
    "name": {{
      "relationship": "",
      "role": "",
      "uid":"uid"
    }}
  }},
  "summary": ""
}}

Ensure all relevant information is included within this structure. Omit any explanations or additional text outside the JSON.
"""


def analyze_evidence_prompt() -> str:
    """Return the template used to judge whether the evidence is sufficient.

    The template is meant for ``str.format``-style rendering and expects three
    fields: ``accusation_prompt``, ``info`` and ``summary``. Literal braces in
    the JSON skeleton are doubled so they survive formatting.

    The skeleton asks the model for ``sufficiency.conclusion`` (one of
    "sufficient", "partial", "insufficient") and for
    ``areas_for_further_investigation``. The uid list under ``sufficiency`` is
    keyed ``"references"`` (previously misspelled ``"refrences"``).
    """
    return """Task: Analyze the extracted information and determine if it provides sufficient evidence for the accusation. If not, suggest areas for further investigation.

Accusation: {accusation_prompt}

Extracted Information:
{info}

Summary of Previous Information:
{summary}

Provide your analysis in the following JSON format:

{{
  "credibility_and_reliability": {{
    "events_analysis": [
      {{
        "event": "Description of the event",
        "credibility_score": "Score from 0-100",
        "reasoning": "Explanation for the credibility score",
        "uid": "The uid of the source where event is mentioned"
      }}
    ],
    "relationships_analysis": [
      {{
        "entity1": "Name of first entity",
        "entity2": "Name of second entity",
        "relationship": "Description of relationship",
        "credibility_impact": "How this relationship affects credibility",
        "uid": "The uid of the source where entities are mentioned"
      }}
    ],
    "overall_credibility_assessment": "Summary of overall credibility"
  }},
  "sufficiency": {{
    "conclusion": "One of: sufficient, partial, insufficient",
    "confidence_score": "Score from 0-100",
    "conclusion_statement": "Detailed explanation of the sufficiency conclusion",
    "references": ["List of the uids referenced"]
  }},
  "areas_for_further_investigation": [
    "List of specific areas or questions needing further investigation"
  ]
}}

Ensure all relevant analysis is included within this structure. Omit any explanations or additional text outside the JSON.
"""

In [147]:
import json
from typing import Dict, List, Annotated
from typing_extensions import TypedDict
from langchain_core.messages import HumanMessage, AIMessage
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langgraph.graph import StateGraph, END
from langgraph.prebuilt import ToolExecutor

class State(TypedDict):
    """Shared state passed between the nodes of the investigation workflow."""

    # The accusation text under investigation.
    accusation: str
    # Current search queries, keyed by string: the nodes read the "elastic"
    # entry (an Elasticsearch query dict) and the "semantic" entry (a string).
    # Keys are strings, not dicts; a dict could not be a mapping key anyway.
    queries: Dict[str, object]
    # Hits returned by the most recent search.
    search_results: List[Dict]
    # Parsed JSON produced by the information-extraction step.
    extracted_info: Dict
    # Parsed JSON produced by the evidence-analysis step.
    analysis: Dict
    # Number of analysis passes completed so far.
    search_count: int


class SemanticHybridSearch:
    """Stub search tool: echoes the queries it receives and returns canned data."""

    def search(self, elastic_query: Dict, semantic_query: str) -> List[Dict]:
        """Print both queries and return a single hard-coded hit.

        Args:
            elastic_query: Elasticsearch query; only printed.
            semantic_query: Semantic search string; only printed.

        Returns:
            A one-element list holding a placeholder result dict.
        """
        print("Queries:", elastic_query, semantic_query)
        placeholder_hit = {"Possible Data": "Content"}
        return [placeholder_hit]

class DummyLLM:
    """Stand-in for an LLM client that answers with canned JSON strings."""

    def invoke(self, code: str):
        """Return the canned JSON text registered for ``code``.

        Recognised codes are "init", "info" and "analyze"; any other value
        yields ``None``.
        """
        canned_replies = {
            "init": '{"elastic": {"elatic_query": "init"}, "semantic": "Semantic Init"}',
            "info": '{"info": "Extracted Info"}',
            "analyze": '{"sufficiency": {"conclusion" : "insufficient"}, "areas_for_further_investigation": []}',
        }
        return canned_replies.get(code)


class InvestigationAgent:
    """Drive one accusation through a search / extract / analyze / refine loop.

    The loop is a compiled graph with five nodes: ``initial_query`` ->
    ``search`` -> ``extract_info`` -> ``analyze``, after which
    ``should_continue_search`` either ends the run or routes through
    ``refine_query`` back to ``search``.

    Each node renders its prompt template, but the rendered message is not yet
    sent anywhere: the LLM is driven with fixed codes ("init", "info",
    "analyze"), which is what the default ``DummyLLM`` understands.
    """

    def __init__(self, llm=None, search_tool=None, max_searches: int = 3):
        """Create the agent and compile its workflow.

        Args:
            llm: Object exposing ``invoke(code)`` that returns JSON text.
                Defaults to a fresh ``DummyLLM``.
            search_tool: Object exposing ``search(elastic_query, semantic_query)``.
                Defaults to a fresh ``SemanticHybridSearch``.
            max_searches: Number of analysis passes after which the loop stops
                regardless of the sufficiency verdict.
        """
        self.llm = llm if llm is not None else DummyLLM()
        self.search_tool = search_tool if search_tool is not None else SemanticHybridSearch()
        self.max_searches = max_searches
        self.workflow = self._create_workflow()

    def _create_workflow(self) -> StateGraph:
        """Assemble the node graph and return its compiled form.

        NOTE(review): the value returned is the result of ``compile()``, not
        the ``StateGraph`` builder itself; the annotation is kept as-is because
        the compiled type is not imported in this file.
        """
        workflow = StateGraph(State)
        workflow.add_node("initial_query", self.initial_query_generation)
        workflow.add_node("search", self.perform_search)
        workflow.add_node("extract_info", self.information_extraction)
        workflow.add_node("analyze", self.evidence_analysis)
        workflow.add_node("refine_query", self.refine_query)

        workflow.add_edge("initial_query", "search")
        workflow.add_edge("search", "extract_info")
        workflow.add_edge("extract_info", "analyze")
        # After each analysis, decide between stopping and another refine/search round.
        workflow.add_conditional_edges("analyze", self.should_continue_search, {"end": END, "refine": "refine_query"})
        workflow.add_edge("refine_query", "search")
        workflow.set_entry_point("initial_query")

        return workflow.compile()

    def initial_query_generation(self, state: State) -> Dict:
        """Produce the first pair of queries for ``state['accusation']``.

        Returns:
            ``{"queries": <parsed JSON reply>}``.
        """
        print("Started Execution: Initial Query Node")
        prompt = PromptTemplate.from_template(self._initial_query_prompt())
        # Rendered for parity with a real LLM call; not consumed by the code-driven LLM below.
        human_message = HumanMessage(content=prompt.format(accusation_prompt=state['accusation']))
        ai_message = self.llm.invoke("init")
        queries = json.loads(ai_message)
        return {"queries": queries}

    def perform_search(self, state: State) -> Dict:
        """Run the search tool with the current "elastic" and "semantic" queries.

        Returns:
            ``{"search_results": <list returned by the search tool>}``.
        """
        print("Started Execution: Performing Search")
        results = self.search_tool.search(state['queries']['elastic'], state['queries']['semantic'])
        return {"search_results": results}

    def information_extraction(self, state: State) -> Dict:
        """Extract structured information from the latest search results.

        Returns:
            ``{"extracted_info": <parsed JSON reply>}``.
        """
        print("Started Execution: Extracting Info")
        prompt = PromptTemplate.from_template(self._information_extraction_prompt())
        # Rendered for parity with a real LLM call; not consumed by the code-driven LLM below.
        human_message = HumanMessage(content=prompt.format(
            accusation_prompt=state['accusation'],
            results=json.dumps(state['search_results'])
        ))
        ai_message = self.llm.invoke("info")
        extracted_info = json.loads(ai_message)
        return {"extracted_info": extracted_info}

    def evidence_analysis(self, state: State) -> Dict:
        """Analyze the extracted information and count this pass.

        Returns:
            ``{"analysis": <parsed JSON reply>, "search_count": <previous + 1>}``.
        """
        print("Started Execution: Analyzing Evidence")
        prompt = PromptTemplate.from_template(self._analyze_evidence_prompt())
        # Rendered for parity with a real LLM call; not consumed by the code-driven LLM below.
        human_message = HumanMessage(content=prompt.format(
            accusation_prompt=state['accusation'],
            info=json.dumps(state['extracted_info']),
            # No node writes a 'summary' key, so this falls back to the string 'None'.
            summary=state.get('summary', 'None')
        ))
        ai_message = self.llm.invoke("analyze")
        analysis = json.loads(ai_message)
        return {"analysis": analysis, "search_count": state["search_count"] + 1}

    def refine_query(self, state: State) -> Dict:
        """Produce refined queries from the current queries, info and analysis.

        Returns:
            ``{"queries": <parsed JSON reply>}``.
        """
        print("Started Execution: Refining Search")
        prompt = PromptTemplate.from_template(self._refine_search_prompt())
        # A reply that omits the list is treated as "no specific areas" rather than an error.
        areas = state['analysis'].get('areas_for_further_investigation', [])
        # Rendered for parity with a real LLM call; not consumed by the code-driven LLM below.
        human_message = HumanMessage(content=prompt.format(
            elastic_query=json.dumps(state['queries']['elastic']),
            semantic_query=state['queries']['semantic'],
            info=json.dumps(state['extracted_info']),
            areas=json.dumps(areas),
            accusation_prompt=state['accusation']
        ))
        ai_message = self.llm.invoke("init")
        refined_queries = json.loads(ai_message)
        return {"queries": refined_queries}

    def should_continue_search(self, state: State) -> str:
        """Route after an analysis pass.

        Returns:
            "end" once ``search_count`` has reached ``max_searches`` or the
            analysis concludes "sufficient" (compared case-insensitively,
            ignoring surrounding whitespace); otherwise "refine". A missing
            ``sufficiency`` or ``conclusion`` entry counts as not sufficient.
        """
        if state['search_count'] >= self.max_searches:
            return "end"
        sufficiency = state['analysis'].get('sufficiency') or {}
        conclusion = str(sufficiency.get('conclusion', '')).strip().lower()
        if conclusion == "sufficient":
            return "end"
        # TODO: also stop when consecutive analyses stop changing, once
        # _significant_difference is implemented and the previous analysis is tracked in state.
        return "refine"

    def _significant_difference(self, prev_analysis: Dict, current_analysis: Dict) -> bool:
        """Placeholder for comparing two analyses; not implemented.

        Currently returns ``None`` and is not called anywhere in this class.
        """
        # TODO: implement the comparison and return True when the analyses differ significantly.
        pass

    def run_investigation(self, accusation: str) -> Dict:
        """Run the workflow for ``accusation``, printing every streamed chunk.

        Each chunk is a dict keyed by the name of the node that just ran
        (for example ``{"analyze": {...}}``), so per-chunk bookkeeping on
        top-level "search_count"/"analysis" keys would never fire; counting is
        done inside ``evidence_analysis`` instead.

        Returns:
            The last chunk yielded by the stream, or ``{}`` if nothing was yielded.
        """
        inputs = {
            "accusation": accusation,
            "search_count": 0,
        }

        # Pre-bind so an empty stream returns {} instead of raising UnboundLocalError.
        output: Dict = {}
        for output in self.workflow.stream(inputs):
            print(f"Output: {json.dumps(output, indent=2)}")
            print("---------------------------------------------\n")

        return output

    @staticmethod
    def _initial_query_prompt() -> str:
        """Return the initial-query template text."""
        return initial_query_prompt()

    @staticmethod
    def _refine_search_prompt() -> str:
        """Return the refine-search template text."""
        return refine_search_prompt()

    @staticmethod
    def _information_extraction_prompt() -> str:
        """Return the information-extraction template text."""
        return information_extraction_prompt()

    @staticmethod
    def _analyze_evidence_prompt() -> str:
        """Return the evidence-analysis template text."""
        return analyze_evidence_prompt()

In [148]:
# Build the agent (dummy LLM + stub search tool) and compile its workflow graph.
x = InvestigationAgent()

In [149]:
# Run the investigation loop on a placeholder accusation; each node's output is printed as it streams.
x.run_investigation("Hello")

Started Execution: Initial Query Node
Output: {
  "initial_query": {
    "queries": {
      "elastic": {
        "elatic_query": "init"
      },
      "semantic": "Semantic Init"
    }
  }
}
---------------------------------------------

Started Execution: Performing Search
Queries: {'elatic_query': 'init'} Semantic Init
Output: {
  "search": {
    "search_results": [
      {
        "Possible Data": "Content"
      }
    ]
  }
}
---------------------------------------------

Started Execution: Extracting Info
Output: {
  "extract_info": {
    "extracted_info": {
      "info": "Extracted Info"
    }
  }
}
---------------------------------------------

Started Execution: Analyzing Evidence
Output: {
  "analyze": {
    "analysis": {
      "sufficiency": {
        "conclusion": "insufficient"
      },
      "areas_for_further_investigation": []
    },
    "search_count": 1
  }
}
---------------------------------------------

Started Execution: Refining Search
Output: {
  "refine_query": {

{'analyze': {'analysis': {'sufficiency': {'conclusion': 'insufficient'},
   'areas_for_further_investigation': []},
  'search_count': 3}}