# Setting Up Elasticsearch

In [None]:
!pip install elasticsearch

## Installation

In [None]:
!wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-8.16.1-linux-x86_64.tar.gz -q
!tar -xzf elasticsearch-8.16.1-linux-x86_64.tar.gz
!chown -R daemon:daemon elasticsearch-8.16.1

## Starting Service

In [None]:
# Launch Elasticsearch as a background child process running under uid 1 (daemon)
import os
from subprocess import DEVNULL, PIPE, STDOUT, Popen  # PIPE stays imported for later cells

# Console output is discarded on purpose: nothing in this notebook reads the
# child's stdout, and a PIPE that is never drained eventually fills up and
# stalls the child process on its next write.
es_server = Popen(['elasticsearch-8.16.1/bin/elasticsearch'],
                  stdout=DEVNULL, stderr=STDOUT,
                  preexec_fn=lambda: os.setuid(1)  # as daemon
                 )

In [None]:
import time
# Fixed 30-second pause so the Elasticsearch process launched above has time to start
time.sleep(30)

In [None]:
!ps -ef | grep elasticsearch

## Elasticsearch YML

### Modify YML to turn off security features (authentication and TLS)

In [None]:
new_yml = """# ======================== Elasticsearch Configuration =========================
#
# NOTE: Elasticsearch comes with reasonable defaults for most settings.
#       Before you set out to tweak and tune the configuration, make sure you
#       understand what are you trying to accomplish and the consequences.
#
# The primary way of configuring a node is via this file. This template lists
# the most important settings you may want to configure for a production cluster.
#
# Please consult the documentation for further information on configuration options:
# https://www.elastic.co/guide/en/elasticsearch/reference/index.html
#
# ---------------------------------- Cluster -----------------------------------
#
# Use a descriptive name for your cluster:
#
#cluster.name: my-application
#
# ------------------------------------ Node ------------------------------------
#
# Use a descriptive name for the node:
#
#node.name: node-1
#
# Add custom attributes to the node:
#
#node.attr.rack: r1
#
# ----------------------------------- Paths ------------------------------------
#
# Path to directory where to store the data (separate multiple locations by comma):
#
#path.data: /path/to/data
#
# Path to log files:
#
#path.logs: /path/to/logs
#
# ----------------------------------- Memory -----------------------------------
#
# Lock the memory on startup:
#
#bootstrap.memory_lock: true
#
# Make sure that the heap size is set to about half the memory available
# on the system and that the owner of the process is allowed to use this
# limit.
#
# Elasticsearch performs poorly when the system is swapping the memory.
#
# ---------------------------------- Network -----------------------------------
#
# By default Elasticsearch is only accessible on localhost. Set a different
# address here to expose this node on the network:
#
#network.host: 192.168.0.1
#
# By default Elasticsearch listens for HTTP traffic on the first free port it
# finds starting at 9200. Set a specific HTTP port here:
#
#http.port: 9200
#
# For more information, consult the network module documentation.
#
# --------------------------------- Discovery ----------------------------------
#
# Pass an initial list of hosts to perform discovery when this node is started:
# The default list of hosts is ["127.0.0.1", "[::1]"]
#
#discovery.seed_hosts: ["host1", "host2"]
#
# Bootstrap the cluster using an initial set of master-eligible nodes:
#
#cluster.initial_master_nodes: ["node-1", "node-2"]
#
# For more information, consult the discovery and cluster formation module documentation.
#
# ---------------------------------- Various -----------------------------------
#
# Allow wildcard deletion of indices:
#
#action.destructive_requires_name: false

#----------------------- BEGIN SECURITY AUTO CONFIGURATION -----------------------
#
# The following settings, TLS certificates, and keys have been automatically      
# generated to configure Elasticsearch security features on 11-12-2024 05:30:04
#
# --------------------------------------------------------------------------------

# Enable security features
xpack.security.enabled: false

xpack.security.enrollment.enabled: false

# Enable encryption for HTTP API client connections, such as Kibana, Logstash, and Agents
xpack.security.http.ssl:
  enabled: false
  keystore.path: certs/http.p12

# Enable encryption and mutual authentication between cluster nodes
xpack.security.transport.ssl:
  enabled: false
  verification_mode: certificate
  keystore.path: certs/transport.p12
  truststore.path: certs/transport.p12
# Create a new cluster with the current node only
# Additional nodes can still join the cluster later
cluster.initial_master_nodes: ["63410036aadd"]

# Allow HTTP API connections from anywhere
# Connections are encrypted and require user authentication
http.host: 0.0.0.0

# Allow other nodes to join the cluster from anywhere
# Connections are encrypted and mutually authenticated
#transport.host: 0.0.0.0

#----------------------- END SECURITY AUTO CONFIGURATION -------------------------"""

# Overwrite the node's elasticsearch.yml with the configuration text built above.
# The context manager guarantees the handle is flushed and closed even if the write fails.
with open("/kaggle/working/elasticsearch-8.16.1/config/elasticsearch.yml", "w", encoding="utf-8") as config_file:
    config_file.write(new_yml)
!tail -n 25 "/kaggle/working/elasticsearch-8.16.1/config/elasticsearch.yml" | tac

In [None]:
!tail -n 25 "/kaggle/working/elasticsearch-8.16.1/config/elasticsearch.yml" | tac

## Restart Process After Modification

In [None]:
from subprocess import TimeoutExpired

# Ask the running node to shut down, then wait until it has really exited so the
# restarted node does not race a process that may still be holding the port or
# the data directory.
es_server.terminate()
try:
    es_server.wait(timeout=60)
except TimeoutExpired:
    # The node did not exit in time: force-kill it and reap the process.
    es_server.kill()
    es_server.wait()

In [None]:
from subprocess import DEVNULL

# Relaunch Elasticsearch so it picks up the rewritten elasticsearch.yml.
# Console output is discarded: a PIPE that is never read eventually fills up
# and stalls the child process.
es_server = Popen(['elasticsearch-8.16.1/bin/elasticsearch'],
                  stdout=DEVNULL, stderr=STDOUT,
                  preexec_fn=lambda: os.setuid(1)  # as daemon
                 )

In [None]:
# Fixed 30-second pause so the relaunched Elasticsearch process has time to start
time.sleep(30)

In [None]:
!ps -ef | grep elasticsearch

In [None]:
!curl localhost:9200 # Elasticsearch Started

# Keyword Search Section

In [None]:
from elasticsearch import Elasticsearch, helpers
import pandas as pd

# Connect to the local node over plain HTTP
es = Elasticsearch("http://localhost:9200/")

# When the node answers, try to drop any existing "emails" index.
# A failed delete (for example, the index does not exist) is reported, not raised.
if not es.ping():
    print("Connection failed.")
else:
    print("Connected to Elasticsearch!")
    try:
        response = es.indices.delete(index="emails")
    except Exception as err:
        print(f"Error deleting index/Index not found: {err}")
    else:
        print("Successfully deleted index: emails")

In [None]:
# Load the sample emails; NaN cells become "" because the index cannot parse NaN
emails = pd.read_csv("/kaggle/input/esights-sample-1/set1_better.csv")
emails = emails.fillna("")

# One dict per row, keyed by column name
email_dict = emails.to_dict(orient="records")
print(email_dict[0].keys(), len(email_dict))

In [None]:
def strip_email(email):
    """Strip leading and trailing whitespace from every string value of ``email``.

    The dict is modified in place and the same object is returned; values that
    are not strings are left untouched. The notebook applies this before
    indexing because stray whitespace may break the indexing.
    """
    for field, value in email.items():
        if isinstance(value, str):
            email[field] = value.strip()
    return email

## Creating Search Index

In [None]:
# Index emails into Elasticsearch
def index_emails(emails):
    """Index every email into the ``emails`` index, one request per document.

    Each email is passed through ``strip_email`` first. Document IDs are the
    1-based position of the email in ``emails``. One status line is printed
    per indexed document.

    Args:
        emails: Iterable of email dicts to index.

    Raises:
        ValueError: If indexing a document fails. The offending email is
            printed first and the underlying error is attached as the
            exception's ``__cause__``.
    """
    for doc_id, email in enumerate(emails, start=1):
        try:
            response = es.index(index="emails", id=doc_id, body=strip_email(email))
            print(f"Indexed document ID {response['_id']} with response: {response['result']}")
        except Exception as e:
            print("Error on", email)
            # Chain explicitly so the traceback shows the real failure as the cause
            raise ValueError(e) from e

index_emails(email_dict)

In [None]:
# Fuzzy search: a single optional ("should") match clause on the email body
search_query = {
    "query": {
        "bool": {
            "should": [
                {
                    "match": {
                        "Mail_Body": {
                            "query": "Pushpam isn't in office today",
                            "fuzziness": "AUTO",
                        },
                    },
                },
            ],
        },
    },
}

# Run the query against the "emails" index
response = es.search(index="emails", body=search_query)

In [None]:
len(response["hits"]["hits"])

In [None]:
# Keep the `_source` payload of the first three hits (fewer if the search returned fewer)
top_3 = [hit["_source"] for hit in response["hits"]["hits"][:3]]
top_3

## Save Index

In [None]:
from elasticsearch.helpers import scan
import json

index_name = "emails"
# Export every document with the scroll-based `scan` helper. A single search
# request capped at "size": 10000 would silently drop everything past the
# first 10,000 hits on a larger index.
results = list(scan(es, index=index_name, query={"query": {"match_all": {}}}))

# Persist the raw hits (each carries `_id` and `_source`) as JSON
with open("index_set_1_elastic.json", 'w') as f:
    json.dump(results, f)

## Load Index

In [None]:
from elasticsearch.helpers import bulk

# Re-create the index from the JSON export written by the "Save Index" cell.
# `json` and `index_name` are not defined here; they come from that earlier cell.
with open("/kaggle/working/index_set_1_elastic.json") as f:
    documents = json.load(f)

    # One bulk action per exported hit, reusing the hit's original `_id` and `_source`
    actions = [
        {
            "_index": index_name, # index name
            "_id": doc['_id'],
            "_source": doc['_source']
        }
        for doc in documents
    ]
    # Send all actions through the bulk helper
    bulk(es, actions)

# Semantic Search

In [None]:
!pip install faiss-cpu sentence-transformers

In [None]:
# Render each email dict as one text document for embedding: take the dict's
# string form, turn single quotes into double quotes, then collapse doubled
# double quotes.
email_docs = [
    str(record).replace("'", '"').replace('""', '"')
    for record in email_dict
]
email_docs[:3]

## Generating Embeddings

In [None]:
import faiss
from sentence_transformers import SentenceTransformer

# Load the embedding model onto the GPU (device="cuda").
# NOTE(review): trust_remote_code=True permits custom code shipped with the
# model repository to run locally — confirm this repository is trusted.
model = SentenceTransformer("dunzhang/stella_en_1.5B_v5", trust_remote_code=True, device="cuda")

In [None]:
email_embeddings = model.encode(email_docs)

In [None]:
print("Documents", len(email_embeddings), "Dimensions", len(email_embeddings[0]))

## Creating Vectorstore Index

In [None]:
# Embedding width, read from the second axis of the embedding matrix
dimension = email_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)  # Using L2 distance for similarity search

# Add all email embeddings; row i belongs to email_dict[i], which later cells rely on for lookup
index.add(email_embeddings)

In [None]:
query = "Employee bad conduct"
# Embed the query with the same model; the search call is given float32 input
query_embedding = model.encode([query]).astype('float32')

k = 3  # Number of nearest neighbors to retrieve
D, I = index.search(query_embedding, k)  # D: distances, I: indices of nearest neighbors

# Retrieve results based on indices. FAISS pads the index row with -1 when
# fewer than k vectors are available; those are skipped, because a plain
# email_dict[-1] lookup would silently return the last email instead.
results = [email_dict[i] for i in I[0] if i >= 0]
results

# Hybridizing Results

In [None]:
keyword_df = pd.DataFrame(top_3)
keyword_df

In [None]:
semantic_df = pd.DataFrame(results)
semantic_df

In [None]:
semantic_df.keys()

In [None]:
# Stack the semantic hits above the keyword hits with a fresh 0..n-1 index, then
# drop rows that repeat an earlier row in every column. The renumbering happens
# before the de-duplication, so the final index may contain gaps.
hybrid_df = pd.concat([semantic_df, keyword_df], ignore_index=True).drop_duplicates()
hybrid_df

## Handling Email Chains

In [None]:
import re
from difflib import SequenceMatcher
from typing import List, Dict, Any

def clean_text(text: str) -> str:
    """
    Collapse each run of whitespace (newlines and tabs included) into one space
    and trim whitespace from both ends.

    Args:
        text (str): The raw text to normalise.

    Returns:
        str: The text with single-space separators and no leading or trailing whitespace.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

def find_overlap(text1: str, text2: str) -> str:
    """
    Find the longest common substring between two texts.

    Args:
        text1 (str): The first text to compare.
        text2 (str): The second text to compare.

    Returns:
        str: The longest common substring, or an empty string if no overlap is found.
    """
    # autojunk must be off: with the default heuristic, SequenceMatcher ignores
    # characters that occur frequently in a second sequence of 200+ items, which
    # for ordinary prose means long texts report little or no overlap at all.
    matcher = SequenceMatcher(None, text1, text2, autojunk=False)
    match = matcher.find_longest_match(0, len(text1), 0, len(text2))
    # A zero-size match yields an empty slice, i.e. "".
    return text1[match.a: match.a + match.size]

def extract_unique_content(emails: List[Dict[str, Any]], min_overlap: int = 10) -> List[Dict[str, Any]]:
    """
    Extract unique content from a list of email dictionaries by removing overlapping text.

    Each 'Mail_Body' is whitespace-normalised with ``clean_text``. For every email,
    the longest substring it shares with each earlier email in the list (found with
    ``find_overlap``) is removed from its body when that substring is longer than
    ``min_overlap`` characters. The input dictionaries are not modified.

    Args:
        emails (List[Dict[str, Any]]): A list of dictionaries, each representing an email
                                       with keys for 'Origin', 'Subject', 'To', 'From', 'Cc',
                                       'Bcc', 'Date', 'Attachment_Count', and 'Mail_Body'.
        min_overlap (int): An overlap is removed only if it is strictly longer than this
                           many characters. Defaults to 10.

    Returns:
        List[Dict[str, Any]]: One new dictionary per input email containing exactly the nine
                              keys listed above; every value is copied unchanged except
                              'Mail_Body', which holds the de-duplicated text.

    Raises:
        KeyError: If an email is missing any of the nine expected keys.

    Note:
        This function assumes that emails are ordered chronologically, with newer emails
        appearing later in the list.
    """
    # Normalise every body once up front so earlier bodies are not re-cleaned
    # for each later email they are compared against.
    cleaned_bodies = [clean_text(email['Mail_Body']) for email in emails]

    unique_contents = []

    for i, (email, current_body) in enumerate(zip(emails, cleaned_bodies)):
        unique_content = current_body

        # Compare against every earlier email. The overlap is always located in
        # the full current body, then removed from whatever text remains.
        for previous_body in cleaned_bodies[:i]:
            overlap = find_overlap(previous_body, current_body)

            if len(overlap) > min_overlap:
                unique_content = unique_content.replace(overlap, "").strip()

        unique_contents.append({
            'Origin': email['Origin'],
            'Subject': email['Subject'],
            'To': email['To'],
            'From': email['From'],
            'Cc': email['Cc'],
            'Bcc': email['Bcc'],
            'Date': email['Date'],
            'Attachment_Count': email['Attachment_Count'],
            'Mail_Body': unique_content
        })

    return unique_contents

In [None]:
print(len(str(extract_unique_content(hybrid_df.to_dict(orient="records")))), len(str(hybrid_df.to_dict(orient="records"))))

In [None]:
# De-duplicate quoted text across the hybrid results.
# Despite its name, the result is a list of dicts (one per row), not a DataFrame.
hybrid_df_cleaned = extract_unique_content(hybrid_df.to_dict(orient="records"))
hybrid_df_cleaned