# Boilerplate

In [None]:
!pip install elasticsearch
!pip install faiss-cpu sentence-transformers

## Initialize Elasticsearch

In [None]:
import time

In [None]:
# Fetch the Elasticsearch 8.16.1 Linux tarball quietly (-q) and unpack it into
# the current working directory.
!wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-8.16.1-linux-x86_64.tar.gz -q
!tar -xzf elasticsearch-8.16.1-linux-x86_64.tar.gz
# Give the whole install tree to the `daemon` user, which is the account the
# server process is started under later in this notebook (uid 1).
!chown -R daemon:daemon elasticsearch-8.16.1

In [None]:
# Start Elasticsearch as a background child process of the notebook kernel.
import os
from subprocess import DEVNULL, PIPE, STDOUT, Popen

# Nothing in this notebook ever reads the child's output. An unread PIPE has a
# bounded buffer; once it fills, the server blocks on its next write. The
# output is therefore discarded instead of being piped.
es_server = Popen(
    ['elasticsearch-8.16.1/bin/elasticsearch'],
    stdout=DEVNULL,
    stderr=STDOUT,
    preexec_fn=lambda: os.setuid(1),  # run the server as uid 1 (daemon)
)

In [None]:
# List every process whose command line mentions "elasticsearch" to confirm the
# server is running (the grep process itself also appears in the output).
!ps -ef | grep elasticsearch

In [None]:
# Pause for a fixed 30 seconds before the configuration file is rewritten in
# the next cell.
STARTUP_GRACE_SECONDS = 30
time.sleep(STARTUP_GRACE_SECONDS)

In [None]:
# Replacement contents for config/elasticsearch.yml: the stock template (every
# setting commented out) followed by an auto-configuration section in which
# xpack security, enrollment and both TLS layers are switched off and the HTTP
# interface is bound to 0.0.0.0.
# NOTE(review): the embedded remarks "Connections are encrypted and require
# user authentication" no longer describe this file, because security is
# disabled a few lines above them.
# NOTE(review): cluster.initial_master_nodes names one specific node
# ("63410036aadd") - presumably the hostname of the session this text was
# copied from; confirm it still applies on a fresh machine.
new_yml = """# ======================== Elasticsearch Configuration =========================
#
# NOTE: Elasticsearch comes with reasonable defaults for most settings.
#       Before you set out to tweak and tune the configuration, make sure you
#       understand what are you trying to accomplish and the consequences.
#
# The primary way of configuring a node is via this file. This template lists
# the most important settings you may want to configure for a production cluster.
#
# Please consult the documentation for further information on configuration options:
# https://www.elastic.co/guide/en/elasticsearch/reference/index.html
#
# ---------------------------------- Cluster -----------------------------------
#
# Use a descriptive name for your cluster:
#
#cluster.name: my-application
#
# ------------------------------------ Node ------------------------------------
#
# Use a descriptive name for the node:
#
#node.name: node-1
#
# Add custom attributes to the node:
#
#node.attr.rack: r1
#
# ----------------------------------- Paths ------------------------------------
#
# Path to directory where to store the data (separate multiple locations by comma):
#
#path.data: /path/to/data
#
# Path to log files:
#
#path.logs: /path/to/logs
#
# ----------------------------------- Memory -----------------------------------
#
# Lock the memory on startup:
#
#bootstrap.memory_lock: true
#
# Make sure that the heap size is set to about half the memory available
# on the system and that the owner of the process is allowed to use this
# limit.
#
# Elasticsearch performs poorly when the system is swapping the memory.
#
# ---------------------------------- Network -----------------------------------
#
# By default Elasticsearch is only accessible on localhost. Set a different
# address here to expose this node on the network:
#
#network.host: 192.168.0.1
#
# By default Elasticsearch listens for HTTP traffic on the first free port it
# finds starting at 9200. Set a specific HTTP port here:
#
#http.port: 9200
#
# For more information, consult the network module documentation.
#
# --------------------------------- Discovery ----------------------------------
#
# Pass an initial list of hosts to perform discovery when this node is started:
# The default list of hosts is ["127.0.0.1", "[::1]"]
#
#discovery.seed_hosts: ["host1", "host2"]
#
# Bootstrap the cluster using an initial set of master-eligible nodes:
#
#cluster.initial_master_nodes: ["node-1", "node-2"]
#
# For more information, consult the discovery and cluster formation module documentation.
#
# ---------------------------------- Various -----------------------------------
#
# Allow wildcard deletion of indices:
#
#action.destructive_requires_name: false

#----------------------- BEGIN SECURITY AUTO CONFIGURATION -----------------------
#
# The following settings, TLS certificates, and keys have been automatically      
# generated to configure Elasticsearch security features on 11-12-2024 05:30:04
#
# --------------------------------------------------------------------------------

# Enable security features
xpack.security.enabled: false

xpack.security.enrollment.enabled: false

# Enable encryption for HTTP API client connections, such as Kibana, Logstash, and Agents
xpack.security.http.ssl:
  enabled: false
  keystore.path: certs/http.p12

# Enable encryption and mutual authentication between cluster nodes
xpack.security.transport.ssl:
  enabled: false
  verification_mode: certificate
  keystore.path: certs/transport.p12
  truststore.path: certs/transport.p12
# Create a new cluster with the current node only
# Additional nodes can still join the cluster later
cluster.initial_master_nodes: ["63410036aadd"]

# Allow HTTP API connections from anywhere
# Connections are encrypted and require user authentication
http.host: 0.0.0.0

# Allow other nodes to join the cluster from anywhere
# Connections are encrypted and mutually authenticated
#transport.host: 0.0.0.0

#----------------------- END SECURITY AUTO CONFIGURATION -------------------------"""

old_yml = open("/kaggle/working/elasticsearch-8.16.1/config/elasticsearch.yml", "w")
old_yml.write(new_yml)
old_yml.close()
!tail -n 25 "/kaggle/working/elasticsearch-8.16.1/config/elasticsearch.yml" | tac

In [None]:
# Ask the running server to shut down and wait until the process has really
# exited, so that a later restart does not race a still-running instance.
from subprocess import TimeoutExpired

es_server.terminate()
try:
    es_server.wait(timeout=60)
except TimeoutExpired:
    # Graceful shutdown did not finish in time; force the process down.
    es_server.kill()
    es_server.wait()

In [None]:
!kill -15 301

In [None]:
# List processes mentioning "elasticsearch" again to see what is still running
# after the shutdown attempts (the grep process itself also appears).
!ps -ef | grep elasticsearch

In [None]:
# Launch Elasticsearch again, after the configuration file has been rewritten.
from subprocess import DEVNULL

# The child's output is never read in this notebook. An unread PIPE can fill up
# and block the server on its next write, so the output is discarded instead.
es_server = Popen(
    ['elasticsearch-8.16.1/bin/elasticsearch'],
    stdout=DEVNULL,
    stderr=STDOUT,
    preexec_fn=lambda: os.setuid(1),  # run the server as uid 1 (daemon)
)

In [None]:
import time

from elasticsearch import Elasticsearch, helpers
import pandas as pd

# Initialize Elasticsearch client
es = Elasticsearch("http://localhost:9200/")

# The server was launched only moments ago and needs time before it answers on
# its HTTP port, so poll for up to two minutes instead of pinging exactly once.
ready_deadline = time.monotonic() + 120
connected = es.ping()
while not connected and time.monotonic() < ready_deadline:
    time.sleep(2)
    connected = es.ping()

if connected:
    print("Connected to Elasticsearch!")
    # Best-effort cleanup of a leftover "emails" index. A missing index is an
    # expected outcome, so any failure is reported rather than raised.
    try:
        es.indices.delete(index="emails")
        print("Successfully deleted index: emails")
    except Exception as e:
        print(f"Error deleting index/Index not found: {e}")
else:
    print("Connection failed.")

# Main Section

In [None]:
import faiss
import json
from elasticsearch.helpers import bulk
from difflib import SequenceMatcher
from typing import List, Dict, Any
import os
import re

class SemanticHybridSearch:
    """
    Combine Elasticsearch keyword (lexical) search with Faiss semantic search.

    The class loads a JSON dump of documents into Elasticsearch, reads a Faiss
    index from disk, and offers lexical, semantic and hybrid search methods.

    Attributes:
        data (list): The dataset used for searching; Faiss result ids are used
            as positions into this list.
        es_client (Elasticsearch): Elasticsearch client for performing lexical searches.
        embedding_model: Model used for encoding queries into embeddings.
        vector_index (faiss.Index): Faiss index for semantic searches.
        elastic_index_name (str): Name of the Elasticsearch index (the base
            name of the index file that was loaded).
        elastic_index: Value returned by ``load_elastic_index`` (the index name).
    """

    # Overlaps of this many characters or fewer are treated as coincidental
    # and left in place by _extract_unique_content.
    _MIN_OVERLAP_CHARS = 10

    def __init__(self, es_client, embedding_model, data: list, elastic_index_path: str, vector_index_path: str):
        """
        Initialize the SemanticHybridSearch class.

        Args:
            es_client (Elasticsearch): Elasticsearch client.
            embedding_model: Model for encoding queries into embeddings.
            data (list): Dataset used for searching.
            elastic_index_path (str): Path to the Elasticsearch index file.
            vector_index_path (str): Path to the Faiss vector index file.
        """
        self.data = data
        self.es_client = es_client
        self.embedding_model = embedding_model
        # Set the placeholder BEFORE loading: load_elastic_index stores the real
        # index name, and assigning "" afterwards would wipe it out again.
        self.elastic_index_name = ""
        self.vector_index = self.load_vector_index(vector_index_path)
        self.elastic_index = self.load_elastic_index(elastic_index_path)

    def load_elastic_index(self, elastic_index_path: str):
        """
        Load documents from a JSON file into Elasticsearch.

        The file must contain a list of objects that each carry an ``_id`` and
        a ``_source`` entry. The index is named after the file's base name,
        which is also stored in ``self.elastic_index_name``.

        Args:
            elastic_index_path (str): Path to the Elasticsearch index file.

        Returns:
            str: The name of the index the documents were written to.
        """
        with open(elastic_index_path, encoding="utf-8") as f:
            documents = json.load(f)

        self.elastic_index_name = os.path.basename(elastic_index_path)
        print(f"Loading Index {self.elastic_index_name}")

        actions = [
            {
                "_index": self.elastic_index_name,
                "_id": doc["_id"],
                "_source": doc["_source"],
            }
            for doc in documents
        ]
        bulk(self.es_client, actions)
        return self.elastic_index_name

    def load_vector_index(self, vector_index_path: str):
        """
        Load the Faiss vector index from a file.

        Args:
            vector_index_path (str): Path to the Faiss vector index file.

        Returns:
            faiss.Index: Loaded Faiss index.
        """
        print(f"Loading Index {os.path.basename(vector_index_path)}")
        return faiss.read_index(vector_index_path)

    def elastic_search(self, query: dict, top_k: int=3) -> list:
        """
        Perform a keyword-based search using Elasticsearch.

        Args:
            query (dict): Elasticsearch query body. It is not modified.
            top_k (int): Number of top results to return. Defaults to 3.

        Returns:
            list: The ``_source`` of at most ``top_k`` hits.
        """
        # Ask the server for top_k hits unless the caller already chose a size;
        # without this the server's default page size silently caps the result.
        request = dict(query)
        request.setdefault("size", top_k)
        results = self.es_client.search(index=self.elastic_index_name, body=request)
        return [hit["_source"] for hit in results["hits"]["hits"][:top_k]]

    def semantic_search(self, query: str, top_k: int=3) -> list:
        """
        Perform a semantic search using Faiss.

        Args:
            query (str): Search query.
            top_k (int): Number of top results to return. Defaults to 3.

        Returns:
            list: Up to ``top_k`` entries of ``self.data``, best match first.
        """
        embedding = self.embedding_model.encode([query]).astype('float32')
        _distances, neighbour_ids = self.vector_index.search(embedding, top_k)
        # Faiss pads the id row with -1 when it finds fewer than top_k
        # neighbours; skip those so they do not select self.data[-1].
        return [self.data[i] for i in neighbour_ids[0] if i >= 0]

    def hybrid_search(self, elastic_query: dict, semantic_query: str, top_k: tuple=(3,3), clean_overlap: bool=True) -> list:
        """
        Perform a hybrid search combining results from Elasticsearch and Faiss.

        Args:
            elastic_query (dict): Elasticsearch query for lexical search.
            semantic_query (str): Query string for semantic search.
            top_k (tuple): Number of results to request as (elastic, semantic).
                Defaults to (3, 3).
            clean_overlap (bool): Whether to remove overlap in email threads results. Defaults to True.

        Returns:
            list: Combined and deduplicated search results, lexical hits first.
        """
        elastic_results = self.elastic_search(elastic_query, top_k[0])
        semantic_results = self.semantic_search(semantic_query, top_k[1])

        # Stack both result sets and drop rows that are identical in every column.
        hybrid_concat = pd.concat(
            [pd.DataFrame(elastic_results), pd.DataFrame(semantic_results)],
            ignore_index=True,
        ).drop_duplicates()
        hybrid_results = hybrid_concat.to_dict(orient="records")

        if clean_overlap:
            return self._extract_unique_content(hybrid_results)
        return hybrid_results

    def _clean_text(self, text: str) -> str:
        """
        Remove extra whitespace and newlines from the given text.

        Args:
            text (str): The input text to be cleaned.

        Returns:
            str: The text with every whitespace run collapsed to a single space
            and leading/trailing whitespace removed.
        """
        return re.sub(r'\s+', ' ', text).strip()

    def _find_overlap(self, text1: str, text2: str) -> str:
        """
        Find the longest common substring between two texts.

        Args:
            text1 (str): The first text to compare.
            text2 (str): The second text to compare.

        Returns:
            str: The longest common substring, or an empty string if no overlap is found.
        """
        # autojunk=False: the default heuristic ignores characters that occur
        # often in texts of 200+ characters, which hides genuine overlaps in
        # ordinary prose. Exact matching costs more time on long inputs.
        matcher = SequenceMatcher(None, text1, text2, autojunk=False)
        match = matcher.find_longest_match(0, len(text1), 0, len(text2))
        return text1[match.a: match.a + match.size] if match.size > 0 else ""

    def _extract_unique_content(self, emails: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Extract unique content from a list of email dictionaries by removing overlapping text.

        For every email, the longest block of text shared with each earlier
        email in the list is removed from its whitespace-normalised body when
        that block is longer than 10 characters. The metadata fields are copied
        unchanged; only 'Mail_Body' is rewritten.

        Args:
            emails (List[Dict[str, Any]]): Email dictionaries with the keys
            'Origin', 'Subject', 'To', 'From', 'Cc', 'Bcc', 'Date',
            'Attachment_Count', and 'Mail_Body'.

        Returns:
            List[Dict[str, Any]]: One dictionary per input email containing
            exactly those keys, with overlapping content removed from 'Mail_Body'.

        Raises:
            KeyError: If an email lacks one of the expected keys.

        Note:
            This function assumes that emails are ordered chronologically, with newer emails
            appearing later in the list.
        """
        # Normalise every body once instead of re-cleaning earlier bodies for
        # each later email.
        cleaned_bodies = [self._clean_text(email['Mail_Body']) for email in emails]
        unique_contents = []

        for i, email in enumerate(emails):
            current_email = cleaned_bodies[i]
            unique_content = current_email

            for previous_email in cleaned_bodies[:i]:
                overlap = self._find_overlap(previous_email, current_email)

                if len(overlap) > self._MIN_OVERLAP_CHARS:
                    unique_content = unique_content.replace(overlap, "").strip()

            unique_contents.append({
                'Origin': email['Origin'],
                'Subject': email['Subject'],
                'To': email['To'],
                'From': email['From'],
                'Cc': email['Cc'],
                'Bcc': email['Bcc'],
                'Date': email['Date'],
                'Attachment_Count': email['Attachment_Count'],
                'Mail_Body': unique_content
            })

        return unique_contents

In [None]:
from sentence_transformers import SentenceTransformer

# Embedding model that is handed to the search class to encode query text.
# It is placed on the GPU; trust_remote_code=True allows Python code shipped
# with the model repository to run locally.
model = SentenceTransformer(
    "dunzhang/stella_en_1.5B_v5",
    device="cuda",
    trust_remote_code=True,
)

In [None]:
# Read the sample e-mail table and replace missing cells with empty strings,
# because the index cannot parse NaN.
raw_emails = pd.read_csv("/kaggle/input/esights-sample-interface/set1_better.csv")
emails = raw_emails.fillna("")
# One dictionary per CSV row, keyed by column name.
email_dict = emails.to_dict(orient='records')
email_dict[0]  # display the first record as the cell output

In [None]:
# Build the hybrid searcher: this bulk-loads the JSON documents into
# Elasticsearch and reads the Faiss index from disk.
search_tool = SemanticHybridSearch(
    es_client=es,
    embedding_model=model,
    data=email_dict,
    elastic_index_path="/kaggle/input/e-sights-index/index_set_1_elastic.json",
    vector_index_path="/kaggle/input/e-sights-index/index_set_1_semantic.index",
)

In [None]:
# Fuzzy search: match the "To" field against "Pushpam", letting Elasticsearch
# pick the allowed edit distance (fuzziness AUTO).
recipient_clause = {
    "match": {
        "To": {"query": "Pushpam", "fuzziness": "AUTO"},
    },
}
elastic_search_query = {"query": {"bool": {"should": [recipient_clause]}}}
search_tool.elastic_search(elastic_search_query, top_k=3)

In [None]:
# Semantic lookup: the three records closest in meaning to the query sentence.
search_tool.semantic_search(query="I want to buy property", top_k=3)

In [None]:
# Hybrid lookup: up to three lexical hits plus up to three semantic hits,
# merged, with overlapping thread text removed (clean_overlap defaults to True).
search_tool.hybrid_search(
    elastic_query=elastic_search_query,
    semantic_query="I want to buy property",
    top_k=(3, 3),
)