In [1]:
!pip install elasticsearch
!pip install faiss-cpu sentence-transformers
!pip install langchain
!pip install -qU langchain-openai
!pip install langgraph

Collecting elasticsearch
  Downloading elasticsearch-8.17.0-py3-none-any.whl.metadata (8.8 kB)
Collecting elastic-transport<9,>=8.15.1 (from elasticsearch)
  Downloading elastic_transport-8.15.1-py3-none-any.whl.metadata (3.7 kB)
Downloading elasticsearch-8.17.0-py3-none-any.whl (571 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m571.2/571.2 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading elastic_transport-8.15.1-py3-none-any.whl (64 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.4/64.4 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: elastic-transport, elasticsearch
Successfully installed elastic-transport-8.15.1 elasticsearch-8.17.0
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Down

# Load Elastic Search

In [2]:
from elasticsearch import Elasticsearch, helpers
from kaggle_secrets import UserSecretsClient
import pandas as pd

# Initialize Elasticsearch client
#es = Elasticsearch("http://localhost:9200/")

user_secrets = UserSecretsClient() 
# Initialize Elasticsearch client
CLOUD_ID = user_secrets.get_secret("CLOUD_ID")
ELASTIC_PASSWORD = user_secrets.get_secret("ELASTIC_PASSWORD")

es = Elasticsearch(
    cloud_id=CLOUD_ID,
    basic_auth=("elastic", ELASTIC_PASSWORD)
)
if es.ping():
    print("Connected to Elasticsearch!")
    try:
        response = es.indices.delete(index="emails")
        print(f"Successfully deleted index: emails")
    except Exception as e:
        print(f"Error deleting index/Index not found: {e}")
else:
    print("Connection failed.")

Connected to Elasticsearch!
Error deleting index/Index not found: NotFoundError(404, 'index_not_found_exception', 'no such index [emails]', emails, index_or_alias)


# Load FAISS

In [3]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("dunzhang/stella_en_1.5B_v5", trust_remote_code=True, device="cuda")

modules.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/169k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

modeling_qwen.py:   0%|          | 0.00/65.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/dunzhang/stella_en_1.5B_v5:
- modeling_qwen.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/6.17G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

tokenization_qwen.py:   0%|          | 0.00/10.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/dunzhang/stella_en_1.5B_v5:
- tokenization_qwen.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/370 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

2_Dense_1024/config.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/6.30M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.30M [00:00<?, ?B/s]

# Semantic Hybrid Search

In [4]:
import faiss
import json
from elasticsearch.helpers import bulk
from difflib import SequenceMatcher
from typing import List, Dict, Any
import os
import re

class SemanticHybridSearch:
    """
    A class that combines Elasticsearch for keyword-based lexical searches and Faiss for semantic searches.

    This class provides methods to load and search both Elasticsearch and Faiss indices,
    as well as a hybrid search method that combines results from both search types.

    Attributes:
        data (list): The dataset used for searching.
        es_client (Elasticsearch): Elasticsearch client for performing lexical searches.
        embedding_model: Model used for encoding queries into embeddings.
        vector_index (faiss.Index): Faiss index for semantic searches.
        elastic_index_name (str): Name of the Elasticsearch index.
    """

    def __init__(
        self,
        es_client,
        embedding_model,
        data: list,
        elastic_index_path: str,
        vector_index_path: str,
    ):
        """
        Initialize the SemanticHybridSearch class.

        Args:
            es_client (Elasticsearch): Elasticsearch client.
            embedding_model: Model for encoding queries into embeddings.
            data (list): Dataset used for searching.
            elastic_index_path (str): Path to the Elasticsearch index file.
            vector_index_path (str): Path to the Faiss vector index file.
        """
        self.data = data
        self.es_client = es_client
        self.embedding_model = embedding_model
        self.vector_index = self.load_vector_index(vector_index_path)
        self.elastic_index = self.load_elastic_index(elastic_index_path)

        self.elastic_index_name = ""

    def load_elastic_index(self, elastic_index_path: str):
        """
        Load the Elasticsearch index from a file.

        Args:
            elastic_index_path (str): Path to the Elasticsearch index file.
        """
        with open(elastic_index_path) as f:
            documents = json.load(f)
            self.elastic_index_name = os.path.basename(elastic_index_path)
            print(f"Loading Index {self.elastic_index_name}")

            actions = [
                {
                    "_index": self.elastic_index_name,
                    "_id": doc["_id"],
                    "_source": doc["_source"],
                }
                for doc in documents
            ]
            bulk(self.es_client, actions)

    def load_vector_index(self, vector_index_path: str):
        """
        Load the Faiss vector index from a file.

        Args:
            vector_index_path (str): Path to the Faiss vector index file.

        Returns:
            faiss.Index: Loaded Faiss index.
        """
        print(f"Loading Index {os.path.basename(vector_index_path)}")
        index = faiss.read_index(vector_index_path)
        return index

    def elastic_search(self, query: dict, top_k: int = 3) -> list:
        """
        Perform a keyword-based search using Elasticsearch.

        Args:
            query (dict): Elasticsearch query.
            top_k (int): Number of top results to return. Defaults to 3.

        Returns:
            list: Top k search results.
        """
        try:
            results = self.es_client.search(index=self.elastic_index_name, body=query)
        except Exception as e:
            print("Invalid Query", query, e)
            return []
        return [result["_source"] for result in results["hits"]["hits"][:top_k]]

    def semantic_search(self, query: str, top_k: int = 3) -> list:
        """
        Perform a semantic search using Faiss.

        Args:
            query (str): Search query.
            top_k (int): Number of top results to return. Defaults to 3.

        Returns:
            list: Top k search results.
        """
        embedding = self.embedding_model.encode([query]).astype("float32")
        distances, idx = self.vector_index.search(embedding, top_k)
        results = [self.data[i] for i in idx[0]]

        return results

    def hybrid_search(
        self,
        elastic_query: dict,
        semantic_query: str,
        top_k: tuple = (3, 3),
        clean_overlap: bool = True,
    ) -> list:
        """
        Perform a hybrid search combining results from Elasticsearch and Faiss.

        Args:
            elastic_query (dict): Elasticsearch query for lexical search.
            semantic_query (str): Query string for semantic search.
            top_k (tuple): Tuple containing the number of top results to return for (elastic, semantic) searches. Defaults to 3.
            clean_overlap (bool): Whether to remove overlap in email threads results. Defaults to True.

        Returns:
            list: Combined and deduplicated search results.
        """
        elastic_results = self.elastic_search(elastic_query, top_k[0])
        semantic_results = self.semantic_search(semantic_query, top_k[1])

        hybrid_concat = pd.concat(
            [pd.DataFrame(elastic_results), pd.DataFrame(semantic_results)],
            ignore_index=True,
        ).drop_duplicates()
        hybrid_results = hybrid_concat.to_dict(orient="records")

        if clean_overlap:
            return self._extract_unique_content(hybrid_results)
        return hybrid_results

    def _clean_text(self, text: str) -> str:
        """
        Remove extra whitespace and newlines from the given text.

        Args:
            text (str): The input text to be cleaned.

        Returns:
            str: The cleaned text with extra whitespace removed.
        """
        return re.sub(r"\s+", " ", text).strip()

    def _find_overlap(self, text1: str, text2: str) -> str:
        """
        Find the longest common substring between two texts.

        Args:
            text1 (str): The first text to compare.
            text2 (str): The second text to compare.

        Returns:
            str: The longest common substring, or an empty string if no overlap is found.
        """
        matcher = SequenceMatcher(None, text1, text2)
        match = matcher.find_longest_match(0, len(text1), 0, len(text2))
        return text1[match.a : match.a + match.size] if match.size > 0 else ""

    def _extract_unique_content(
        self, emails: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Extract unique content from a list of email dictionaries by removing overlapping text.

        This function processes a list of email dictionaries, removing any overlapping content
        between emails to reduce redundancy. It preserves the original email structure and
        metadata while modifying only the 'Mail_Body' field.

        Args:
            emails (List[Dict[str, Any]]): A list of dictionaries, each representing an email
            keys for 'Origin', 'Subject', 'To', 'From', 'Cc', 'Bcc', 'Date', 'Attachment_Count',
            and 'Mail_Body'.

        Returns:
            List[Dict[str, Any]]: A list of dictionaries with the same structure as the input,
            but with overlapping content removed from the 'Mail_Body' field.

        Note:
            This function assumes that emails are ordered chronologically, with newer emails
            appearing later in the list.
        """
        unique_contents = []

        for i, email in enumerate(emails):
            current_email = self._clean_text(email["Mail_Body"])
            unique_content = current_email

            for j in range(i):
                previous_email = self._clean_text(emails[j]["Mail_Body"])
                overlap = self._find_overlap(previous_email, current_email)

                if len(overlap) > 10:
                    unique_content = unique_content.replace(overlap, "").strip()

            unique_contents.append(
                {
                    "Origin": email["Origin"],
                    "Subject": email["Subject"],
                    "To": email["To"],
                    "From": email["From"],
                    "Cc": email["Cc"],
                    "Bcc": email["Bcc"],
                    "Date": email["Date"],
                    "Attachment_Count": email["Attachment_Count"],
                    "Mail_Body": unique_content,
                }
            )

        return unique_contents

In [5]:
emails = pd.read_csv("/kaggle/input/llm-testing/set_tracked.csv").fillna("") # Index cannot parse nan
email_dict = emails.to_dict(orient='records')
email_dict[0]

{'Origin': '11-01-24 report simran.eml',
 'Subject': '11-01-24 report simran',
 'To': 'pushpamupadhyay.bavdhan@shapoorji.com, Pushpamupadhyay Bavdhan <pushpamupadhyay.bavdhan@shapoorji.com>',
 'From': 'Simran <simransinha6426@gmail.com>',
 'Cc': '',
 'Bcc': '',
 'Date': 'Thu, 11 Jan 2024 18:55:15 +0530',
 'Attachment_Count': 1,
 'Mail_Body': 'CAUTION: This email has originated outside of Shapoorji Pallonji. Do not click on any links or open any attachments, unless you recognize the sender and know the content is safe.',
 'UID': 0}

In [6]:
search_tool = SemanticHybridSearch(es, embedding_model, email_dict, "/kaggle/input/llm-testing/index_tracked_elastic.json", "/kaggle/input/llm-testing/index_tracked_semantic.index")

Loading Index index_tracked_semantic.index
Loading Index index_tracked_elastic.json


## Sanity Check Search Tool

In [7]:
elastic_search_query = {
  "query": {
    "bool": {
      "should": [
        {
          "match": {
            "To": {
              "query": "Pushpam",
              "fuzziness": "AUTO"
            }
          },
        }
      ]
    }
  }
} # Fuzzy search
search_tool.elastic_search(elastic_search_query, 3)
search_tool.hybrid_search(elastic_search_query, "I want to buy property", (3,3))

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[{'Origin': 'Fwd_ Spreadsheet shared with you_ ‘Vanaha x Akkhilesh Mane Call Sheet’.eml',
  'Subject': 'Fwd: Spreadsheet shared with you: ‘Vanaha x Akkhilesh Mane Call Sheet’',
  'To': 'Pushpam Upadhyay <pushpamupadhyay91@gmail.com>',
  'From': 'Pushpamupadhyay Bavdhan <pushpamupadhyay.bavdhan@shapoorji.com>',
  'Cc': '',
  'Bcc': '',
  'Date': 'Fri, 10 May 2024 06:14:31 +0000',
  'Attachment_Count': 0,
  'Mail_Body': 'Get Outlook for AndroidFrom: Akhilesh Mane (via Google Sheets) <drive-shares-dm-noreply@google.com>Sent: Thursday, May 2, 2024 1:00:45 amTo: Pushpamupadhyay Bavdhan <pushpamupadhyay.bavdhan@shapoorji.com>Subject: Spreadsheet shared with you: ‘Vanaha x Akkhilesh Mane Call Sheet’ CAUTION: This email has originated outside of Shapoorji Pallonji. Do not click on any links or open any attachments, unless you recognize the sender and know the content is safe.Akhilesh Mane shared a spreadsheetAkhilesh Mane (am@23estates.com) added you as an editor. Verify your email address to 

# Prompts

In [8]:
def system_prompt() -> str:
    return """You are an advanced AI agent designed for forensic and compliance investigations, specializing in analyzing large email datasets. Your task is to investigate multiple accusations simultaneously, searching for evidence, extracting relevant information, and drawing conclusions. You have access to a SemanticHybridSearch tool that combines Elasticsearch for keyword-based lexical searches and Faiss for semantic searches.

Key Responsibilities:
- Generate and refine search queries for multiple accusations, providing both Elasticsearch queries and semantic search strings.
- Analyze search results to extract relevant information.
- Evaluate evidence to determine if it supports or refutes accusations.
- Generate conclusions based on the accumulated evidence.

Guidelines:
- Maintain objectivity and avoid bias in your analysis.
- Consider the context and relationships between different pieces of information.
- Be thorough in your investigation, but also efficient in your search refinement.
- Clearly distinguish between facts, inferences, and speculations in your reports.
- Adapt your search and analysis strategies based on the unique aspects of each accusation.
- Utilize both lexical (Elasticsearch) and semantic (Faiss) search capabilities effectively.

You will be provided with specific instructions for each task. Always strive for accuracy, clarity, and relevance in your responses.
"""


def initial_query_prompt() -> str:
    return """Task: Generate initial search queries for the following accusation, suitable for use with the SemanticHybridSearch tool.

Accusation: {accusation_prompt}
Response Format: Provide the response in JSON format with the following keys:
elastic: Contains the Elasticsearch query in JSON format.
semantic: Contains the semantic search query as a string.

Guidelines:
Unionized Search Approach:
- Combine Elasticsearch and semantic search capabilities effectively. For example: Use Elasticsearch to filter specific fields (e.g., recipients, senders). Use semantic search to refine or specify the context within filtered results.
- If only one type of search is required, leave the other key empty (e.g., {{}} for elastic or "" for semantic).

Data Schema:
{{
  "Subject": "Subject of mail",
  "To": "All Recipients",
  "From": "Name of sender",
  "Cc": "All CC",
  "Bcc": "All BCC",
  "Date": "Date in datetime format",
  "Attachment_Count": "Number of attachments",
  "Mail_Body": "Content of the mail in plain text format"
}}

Elasticsearch Query:
- Focus on key terms and concepts relevant to the accusation.
- Use appropriate Elasticsearch query DSL structures (e.g., bool, must, should, match, term).
- Consider field-specific searches (e.g., subject, body, from, to) and apply boosts where necessary.
- Ensure queries are broad enough to capture relevant information but specific enough to exclude irrelevant results.

Semantic Search Query:
- Use natural language to describe the context and meaning of the accusation.
- Incorporate synonyms, related terms, and broader concepts to capture nuances beyond simple keywords.

Efficiency and Contextual Relevance:
- Adapt search strategies based on the unique aspects of each accusation.
- Ensure objectivity and avoid bias in query generation.
- Clearly distinguish between facts, inferences, and speculations.

Output Example:
{{
  "elastic": {{
    // Elasticsearch query here
  }},
  "semantic": "Semantic search string here"
}}

Do not provide a preamble or an explanation, the output should strictly be in JSON format with no comments"""  # Pass


def refine_search_prompt() -> str:
    return """Task: Refine the search queries based on the current queries and extracted information to uncover more details about the accusation. Provide refined queries for both Elasticsearch and semantic search.

Current Elasticsearch Query: {elastic_query}
Current Semantic Query: {semantic_query}
Extracted Info Summary: {info}
Areas for Further Investigation: {areas}
Accusation: {accusation_prompt}

Guidelines:
Unionized Search Approach:
- Combine Elasticsearch and semantic search capabilities effectively. For example: Use Elasticsearch to filter specific fields (e.g., recipients, senders). Use semantic search to refine or specify the context within filtered results.
- If only one type of search is required, leave the other key empty (e.g., {{}} for elastic or "" for semantic).

Data Schema:
{{
  "Subject": "Subject of mail",
  "To": "All Recipients",
  "From": "Name of sender",
  "Cc": "All CC",
  "Bcc": "All BCC",
  "Date": "Date in datetime format",
  "Attachment_Count": "Number of attachments",
  "Mail_Body": "Content of the mail in plain text format"
}}

Your refined queries should:
- Build upon the insights gained from the extracted information.
- Focus on areas where evidence is lacking or inconclusive.
- Include any new relevant terms or concepts discovered in the previous search.
- Be more specific than the initial queries, targeting the most promising areas for further investigation.
- Utilize Elasticsearch-specific features for the lexical query and natural language for the semantic query.

Refined Search Queries:

{{
  "elastic": {{
    // Elasticsearch query here
  }},
  "semantic": "Semantic search string here"
}}

Do not provide a preamble or an explanation, the output should strictly be in JSON format with no comments
"""  # Pass


def information_extraction_prompt() -> str:
    return """Task: Extract relevant information from the hybrid search results related to the following accusation:

Accusation: {accusation_prompt}

Hybrid Search Results:
{results}

Analyze the results, which combine Elasticsearch and Faiss search outcomes. Each result contains fields like "Subject", "To", "From", "Cc", "Bcc", "Date", "Attachment_Count", and "Mail_Body".

Provide the following information in JSON format:

{{
  "accused_suspects": [],
  "incident_details": {{
    "events": [
      {{
        "details": "",
        "description": "",
        "date": "",
        "uid":"",
      }}
    ]
  }},
  "other_parties": {{
    "name": {{
      "relationship": "",
      "role": "",
      "uid":"uid",
    }}
  }},
  "summary": ""
}}

Ensure all relevant information is included within this structure. Omit any explanations or additional text outside the JSON.
"""  # Pass


def analyze_evidence_prompt() -> str:
    return """Task: Analyze the extracted information and determine if it provides sufficient evidence for the accusation. If not, suggest areas for further investigation.

Accusation: {accusation_prompt}

Extracted Information:
{info}

Summary of Previous Information:
{summary}

Provide your analysis in the following JSON format:

{{
  "credibility_and_reliability": {{
    "events_analysis": [
      {{
        "event": "Description of the event",
        "credibility_score": "Score from 0-100",
        "reasoning": "Explanation for the credibility score",
        "uid": "The uid of the source where event is mentioned"
      }}
    ],
    "relationships_analysis": [
      {{
        "entity1": "Name of first entity",
        "entity2": "Name of second entity",
        "relationship": "Description of relationship",
        "credibility_impact": "How this relationship affects credibility",
        "uid": "The uid of the source where entities are mentioned"
      }}
    ],
    "overall_credibility_assessment": "Summary of overall credibility"
  }},
  "sufficiency": {{
    "conclusion": "One of: sufficient, partial, insufficient",
    "confidence_score": "Score from 0-100",
    "conclusion_statement": "Detailed explanation of the sufficiency conclusion",
    "refrences": ["List of the uids referenced"]
  }},
  "areas_for_further_investigation": [
    "List of specific areas or questions needing further investigation"
  ]
}}

Ensure all relevant analysis is included within this structure. Omit any explanations or additional text outside the JSON.
"""


# LLM Loading

In [35]:
from langchain_openai import AzureChatOpenAI
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
from langchain.prompts import PromptTemplate

llm = AzureChatOpenAI(
    azure_deployment="gpt-4o-mini",  # or your deployment
    api_version="2024-05-01-preview",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    api_key=user_secrets.get_secret("AZURE_OPENAI_API_KEY"),
    azure_endpoint=user_secrets.get_secret("AZURE_OPENAI_ENDPOINT"),
)

## LLM Sanity Check

In [36]:
messages = [
    SystemMessage(
        content="You are a helpful assistant! When someone says Hi you only say Bazoocar."
    ),
    HumanMessage(
        content="Hi"
    )
]

In [37]:
llm.invoke(messages)

AIMessage(content='Bazoocar.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 4, 'prompt_tokens': 29, 'total_tokens': 33, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_5154047bf2', 'prompt_filter_results': [{'prompt_index': 0, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'jailbreak': {'filtered': False, 'detected': False}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}}], 'finish_reason': 'stop', 'logprobs': None, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'protected_material_code': {'filtered': False, 'detected': False}, 'protected_material_text'

In [36]:
SystemMessage(
        content= system_prompt()
    ),

(SystemMessage(content='You are an advanced AI agent designed for forensic and compliance investigations, specializing in analyzing large email datasets. Your task is to investigate multiple accusations simultaneously, searching for evidence, extracting relevant information, and drawing conclusions. You have access to a SemanticHybridSearch tool that combines Elasticsearch for keyword-based lexical searches and Faiss for semantic searches.\n\nKey Responsibilities:\n- Generate and refine search queries for multiple accusations, providing both Elasticsearch queries and semantic search strings.\n- Analyze search results to extract relevant information.\n- Evaluate evidence to determine if it supports or refutes accusations.\n- Generate conclusions based on the accumulated evidence.\n\nGuidelines:\n- Maintain objectivity and avoid bias in your analysis.\n- Consider the context and relationships between different pieces of information.\n- Be thorough in your investigation, but also efficien

# Graph Code

In [45]:
import json
from typing import Dict, List, Annotated
from typing_extensions import TypedDict
from langchain_core.messages import HumanMessage, AIMessage
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langgraph.graph import StateGraph, END
from langgraph.prebuilt import ToolExecutor

class State(TypedDict):
    accusation: str
    queries:Dict[Dict, str]
    search_results: List[Dict]
    extracted_info: Dict
    analysis: Dict
    search_count: int

class InvestigationAgent:
    def __init__(self, llm):
        self.llm = llm
        self.search_tool = SemanticHybridSearch(es, embedding_model, email_dict, "/kaggle/input/llm-testing/index_tracked_elastic.json", "/kaggle/input/llm-testing/index_tracked_semantic.index")
        self.workflow = self._create_workflow()
        self.system_prompt = SystemMessage(content=system_prompt())

    def _create_workflow(self) -> StateGraph:
        workflow = StateGraph(State)
        workflow.add_node("initial_query", self.initial_query_generation)
        workflow.add_node("search", self.perform_search)
        workflow.add_node("extract_info", self.information_extraction)
        workflow.add_node("analyze", self.evidence_analysis)
        workflow.add_node("refine_query", self.refine_query)

        workflow.add_edge("initial_query", "search")
        workflow.add_edge("search", "extract_info")
        workflow.add_edge("extract_info", "analyze")
        workflow.add_conditional_edges("analyze", self.should_continue_search, {"end": END, "refine": "refine_query"})
        workflow.add_edge("refine_query", "search")
        workflow.set_entry_point("initial_query")

        return workflow.compile()

    def initial_query_generation(self, state: State) -> Dict:
        print("Started Execution: Initial Query Node")
        prompt = PromptTemplate.from_template(self._initial_query_prompt())
        human_message = HumanMessage(content=prompt.format(accusation_prompt=state['accusation']))
        ai_message = self.llm.invoke([self.system_prompt, human_message])
        print("Tokens Used")
        print(ai_message.usage_metadata['input_tokens'], ai_message.usage_metadata['output_tokens'])
        queries = json.loads(ai_message.content)
        return {"queries": queries}

    def perform_search(self, state: Dict) -> Dict:
        print("Started Execution: Performing Search")
        results = self.search_tool.hybrid_search(state['queries']['elastic'], state['queries']['semantic'], (3,3))
        return {"search_results": results}

    def information_extraction(self, state: State) -> Dict:
        print("Started Execution: Extracting Info")
        prompt = PromptTemplate.from_template(self._information_extraction_prompt())
        human_message = HumanMessage(content=prompt.format(
            accusation_prompt=state['accusation'],
            results=json.dumps(state['search_results'])
        ))
        ai_message = self.llm.invoke([self.system_prompt, human_message])
        print("Tokens Used")
        print(ai_message.usage_metadata['input_tokens'], ai_message.usage_metadata['output_tokens'])
        extracted_info = json.loads(ai_message.content)
        return {"extracted_info": extracted_info}

    def evidence_analysis(self, state: State) -> Dict:
        print("Started Execution: Analyzing Evidence")
        prompt = PromptTemplate.from_template(self._analyze_evidence_prompt())
        human_message = HumanMessage(content=prompt.format(
            accusation_prompt=state['accusation'],
            info=json.dumps(state['extracted_info']),
            summary=state["extracted_info"].get('summary', 'None')
        ))
        ai_message = self.llm.invoke([self.system_prompt, human_message])
        print("Tokens Used")
        print(ai_message.usage_metadata['input_tokens'], ai_message.usage_metadata['output_tokens'])
        analysis = json.loads(ai_message.content)
        return {"analysis": analysis, "search_count": state["search_count"] + 1}

    def refine_query(self, state: State) -> Dict:
        print("Started Execution: Refining Search")
        prompt = PromptTemplate.from_template(self._refine_search_prompt())
        human_message = HumanMessage(content=prompt.format(
            elastic_query=json.dumps(state['queries']['elastic']),
            semantic_query=state['queries']['semantic'],
            info=json.dumps(state['extracted_info']),
            areas=json.dumps(state['analysis']['areas_for_further_investigation']),
            accusation_prompt=state['accusation']
        ))
        ai_message = self.llm.invoke([self.system_prompt, human_message])
        print("Tokens Used")
        print(ai_message.usage_metadata['input_tokens'], ai_message.usage_metadata['output_tokens'])
        refined_queries = json.loads(ai_message.content)
        return {"queries": refined_queries}

    def should_continue_search(self, state: State) -> str:
        if state['search_count'] >= 2:
            return "end"
        if state['analysis']['sufficiency']['conclusion'] == "sufficient":
            return "end"
        # if state['search_count'] > 0 and not self._significant_difference(state['previous_analysis'], state['analysis']):
        #     return "end"
        return "refine"

    def _significant_difference(self, prev_analysis: Dict, current_analysis: Dict) -> bool:
        # Implement logic to compare previous and current analysis
        # Return True if there's a significant difference, False otherwise
        pass

    def run_investigation(self, accusation: str) -> Dict:
        inputs = {
            "accusation": accusation,
            "search_count": 0,
            "previous_analysis": None
        }
        
        for output in self.workflow.stream(inputs):
            start = time.time()
            if "search_count" in output:
                output["search_count"] += 1
            if "analysis" in output:
                output["previous_analysis"] = output["analysis"]
            end = time.time()
            print(f"Output: {json.dumps(output, indent=2)}")
            print(f"Took {end - start} seconds.")
            print("---------------------------------------------\n")
        
        return output

    @staticmethod
    def _initial_query_prompt() -> str:
        return initial_query_prompt()

    @staticmethod
    def _refine_search_prompt() -> str:
        return refine_search_prompt()

    @staticmethod
    def _information_extraction_prompt() -> str:
        return information_extraction_prompt()

    @staticmethod
    def _analyze_evidence_prompt() -> str:
        return analyze_evidence_prompt()

In [46]:
x = InvestigationAgent(llm)

Loading Index index_tracked_semantic.index
Loading Index index_tracked_elastic.json


In [44]:
import time

In [47]:
x.run_investigation("Employee is misbehaving, has been rude and indisciplined.")

Started Execution: Initial Query Node
Tokens Used
687 253
Output: {
  "initial_query": {
    "queries": {
      "elastic": {
        "query": {
          "bool": {
            "must": [
              {
                "bool": {
                  "should": [
                    {
                      "match": {
                        "Mail_Body": {
                          "query": "rude",
                          "boost": 2
                        }
                      }
                    },
                    {
                      "match": {
                        "Mail_Body": {
                          "query": "misbehaving",
                          "boost": 2
                        }
                      }
                    },
                    {
                      "match": {
                        "Mail_Body": {
                          "query": "indisciplined",
                          "boost": 2
                        }
                      }
        

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Output: {
  "search": {
    "search_results": [
      {
        "Origin": "Unprofessional Behavior & extension of leaves.eml",
        "Subject": "Unprofessional Behavior & extension of leaves",
        "To": "Pushpamupadhyay Bavdhan <pushpamupadhyay.bavdhan@shapoorji.com>",
        "From": "\"ronak.maheshwari\" <ronak.maheshwari@shapoorji.com>",
        "Cc": "\"shoubhik.bhattacharya\" <shoubhik.bhattacharya@shapoorji.com>, \"pritesh.jain\" <pritesh.jain@shapoorji.com>",
        "Bcc": "",
        "Date": "Sun, 31 Mar 2024 08:03:25 +0000",
        "Attachment_Count": 0,
        "Mail_Body": "Hi Pushpam, Hope you\u2019re doing well ! As you were on planned leave from 25th to 31st of March 2024 for 7 days which was pre-approved by us. You leaves was well planned by you & approved in advance verbally by Shoubhik back in February & in March by me & Pritesh Sir. During these days you have been unapproachable for some important office related update. Even to CPs also, this is very much unpr

{'analyze': {'analysis': {'credibility_and_reliability': {'events_analysis': [{'event': 'Complaints about unresponsiveness and aggressive behavior towards colleagues.',
      'credibility_score': 75,
      'reasoning': 'The event is supported by multiple complaints from colleagues, indicating a pattern of behavior.',
      'uid': 'Unprofessional Behavior & extension of leaves.eml'},
     {'event': 'Pushpam has been reported for shouting at a senior and not following team directions.',
      'credibility_score': 80,
      'reasoning': 'This incident involves a senior colleague, which adds weight to the seriousness of the behavior.',
      'uid': 'Behavioral and Disciplinary Concerns- Pushpam Updhyay.eml'},
      'credibility_score': 85,
      'uid': 'Unprofessional Behavior - Pushapm U.eml'}],
    'relationships_analysis': [{'entity1': 'Pushpamupadhyay Bavdhan',
      'entity2': 'Ronak Maheshwari',
      'relationship': "Colleague who sent complaints about Pushpam's behavior.",
      'c

In [18]:
x.system_prompt

SystemMessage(content='You are an advanced AI agent designed for forensic and compliance investigations, specializing in analyzing large email datasets. Your task is to investigate multiple accusations simultaneously, searching for evidence, extracting relevant information, and drawing conclusions. You have access to a SemanticHybridSearch tool that combines Elasticsearch for keyword-based lexical searches and Faiss for semantic searches.\n\nKey Responsibilities:\n- Generate and refine search queries for multiple accusations, providing both Elasticsearch queries and semantic search strings.\n- Analyze search results to extract relevant information.\n- Evaluate evidence to determine if it supports or refutes accusations.\n- Generate conclusions based on the accumulated evidence.\n\nGuidelines:\n- Maintain objectivity and avoid bias in your analysis.\n- Consider the context and relationships between different pieces of information.\n- Be thorough in your investigation, but also efficient