In [38]:
from elasticsearch import Elasticsearch, helpers
import pandas as pd

# Initialize Elasticsearch client
es = Elasticsearch("http://localhost:9200/")

# Start from a clean slate: drop any "emails" index left over from a previous run.
if es.ping():
    print("Connected to Elasticsearch!")
    try:
        # On a fresh cluster the index does not exist yet; check first so that
        # the expected "nothing to delete" case is not reported as an error.
        if es.indices.exists(index="emails"):
            es.indices.delete(index="emails")
            print("Successfully deleted index: emails")
        else:
            print("Index emails does not exist; nothing to delete.")
    except Exception as e:
        # Best-effort cleanup: report the problem but let the notebook continue.
        print(f"Error deleting index: {e}")
else:
    print("Connection failed.")

Connected to Elasticsearch!
Successfully deleted index: emails


In [39]:
# Load the sample emails and turn each CSV row into a plain dict (one per email)
emails = pd.read_csv("./data/sample.csv")
emails_formatted = emails.to_dict("records")
emails_formatted

[{'Subject': 'Project Update',
  'From': 'alice@orgA.com',
  'To': 'bob@orgB.com',
  'Date': '2024-01-02',
  'Body': "Hi Bob, I wanted to give you a quick update on our current project. Things are progressing well, and we're nearing the final stages of development. I think you'll be impressed with what we have in store. Let's catch up soon. Best, Alice"},
 {'Subject': 'Re: Project Update',
  'From': 'bob@orgB.com',
  'To': 'alice@orgA.com',
  'Date': '2024-01-03',
  'Body': "Hey Alice, Thanks for the update! I'm really looking forward to seeing the final product. Let's schedule a meeting next week to discuss potential collaborations. Cheers, Bob"},
 {'Subject': 'Confidential: Design Specs',
  'From': 'alice@orgA.com',
  'To': 'bob@orgB.com',
  'Date': '2024-01-04',
  'Body': "Bob, I've attached some preliminary design specs for the upcoming project. Please keep this under wraps as we are still finalizing details internally. Your insights would be invaluable, but let's keep this between

In [40]:
# Sanity check: number of email records produced from the CSV
len(emails_formatted)

27

In [41]:
def strip_email(email):
    """Return a copy of an email record with whitespace trimmed from string fields.

    Args:
        email: Dict mapping field name to value for one email record.

    Returns:
        A new dict with the same keys. Every ``str`` value has its leading and
        trailing whitespace removed; all other values are passed through
        unchanged. The input dict is not modified, so re-running a cell that
        calls this never alters the records it was given.
    """
    return {
        key: value.strip() if isinstance(value, str) else value
        for key, value in email.items()
    }

In [42]:
# Index emails into Elasticsearch
def index_emails(emails, index="emails"):
    """Index each email record as its own document, with sequential IDs from 1.

    Args:
        emails: Iterable of email records (dicts). Each record is passed
            through ``strip_email`` before being sent.
        index: Name of the target index. Defaults to ``"emails"``.

    Raises:
        ValueError: If preparing or indexing any record fails. The offending
            record is printed first and the original exception is attached
            as ``__cause__``.
    """
    for doc_id, email in enumerate(emails, start=1):
        try:
            response = es.index(index=index, id=doc_id, body=strip_email(email))
            print(f"Indexed document ID {response['_id']} with response: {response['result']}")
        except Exception as e:
            print("Error on", email)
            # Keep the ValueError type callers may catch, but chain the cause
            # explicitly so the underlying client error stays visible.
            raise ValueError(e) from e

# Index every loaded record; prints one status line per document.
index_emails(emails_formatted)

Indexed document ID 1 with response: created
Indexed document ID 2 with response: created
Indexed document ID 3 with response: created
Indexed document ID 4 with response: created
Indexed document ID 5 with response: created
Indexed document ID 6 with response: created
Indexed document ID 7 with response: created
Indexed document ID 8 with response: created
Indexed document ID 9 with response: created
Indexed document ID 10 with response: created
Indexed document ID 11 with response: created
Indexed document ID 12 with response: created
Indexed document ID 13 with response: created
Indexed document ID 14 with response: created
Indexed document ID 15 with response: created
Indexed document ID 16 with response: created
Indexed document ID 17 with response: created
Indexed document ID 18 with response: created
Indexed document ID 19 with response: created
Indexed document ID 20 with response: created
Indexed document ID 21 with response: created
Indexed document ID 22 with response: creat

In [43]:
import json
# Optional: Verify indexed documents
# Newly indexed documents only become searchable after a refresh, so force one
# here; otherwise running this right after indexing can show a partial result.
es.indices.refresh(index="emails")
search_response = es.search(index="emails", body={"query": {"match_all": {}}})
print("Indexed documents:")
# No "size" is given, so only the server's default first page of hits is printed.
for hit in search_response['hits']['hits']:
    print(json.dumps(hit["_source"], indent=2))

Indexed documents:
{
  "Subject": "Project Update",
  "From": "alice@orgA.com",
  "To": "bob@orgB.com",
  "Date": "2024-01-02",
  "Body": "Hi Bob, I wanted to give you a quick update on our current project. Things are progressing well, and we're nearing the final stages of development. I think you'll be impressed with what we have in store. Let's catch up soon. Best, Alice"
}
{
  "Subject": "Re: Project Update",
  "From": "bob@orgB.com",
  "To": "alice@orgA.com",
  "Date": "2024-01-03",
  "Body": "Hey Alice, Thanks for the update! I'm really looking forward to seeing the final product. Let's schedule a meeting next week to discuss potential collaborations. Cheers, Bob"
}
{
  "Subject": "Confidential: Design Specs",
  "From": "alice@orgA.com",
  "To": "bob@orgB.com",
  "Date": "2024-01-04",
  "Body": "Bob, I've attached some preliminary design specs for the upcoming project. Please keep this under wraps as we are still finalizing details internally. Your insights would be invaluable, bu

In [44]:
# Full-text "match" query: find emails whose Body field mentions "alice"
search_query = {"query": {"match": {"Body": "alice"}}}

# Run the query against the "emails" index and keep the raw response
response = es.search(index="emails", body=search_query)

In [45]:
# Number of hits contained in this response.
# NOTE(review): the query sets no "size", so this may be capped by the server's
# default page size rather than the total match count — confirm against
# response["hits"]["total"].
len(response["hits"]["hits"])

10

In [46]:
# Show the ID and stored fields of every hit returned for the query
print("Search Results:")
matched_hits = response['hits']['hits']
for hit in matched_hits:
    print(f"ID: {hit['_id']}, Source: {hit['_source']}")

Search Results:
ID: 14, Source: {'Subject': 'Re: Market Analysis Request', 'From': 'bob@orgB.com', 'To': 'alice@orgA.com', 'Date': '2024-01-15', 'Body': "Hi Alice, Absolutely! I'll send it over shortly. It contains some sensitive information, so please handle it with care. Best, Bob"}
ID: 22, Source: {'Subject': 'Re: Urgent: Need Your Thoughts', 'From': 'bob@orgB.com', 'To': 'alice@orgA.com', 'Date': '2024-01-23', 'Body': "Absolutely, Alice! Let's discuss this in detail later today; it's crucial that we approach this carefully given the stakes involved. bob"}
ID: 18, Source: {'Subject': 'Re: Confidential Insights Needed', 'From': 'bob@orgB.com', 'To': 'alice@orgA.com', 'Date': '2024-01-19', 'Body': "Of course, Alice! I'll compile some thoughts and send them your way shortly. Let's ensure we maintain discretion as always. Cheers, bob"}
ID: 26, Source: {'Subject': 'Re: Final Review Needed Before Launch', 'From': 'bob@orgB.com', 'To': 'alice@orgA.com', 'Date': '2024-01-27', 'Body': "Of co