First Step: Indexing Our Mail Dataset

In [1]:
from elasticsearch import Elasticsearch

from nltk.tokenize import word_tokenize
import string

In [None]:
# Connection settings for the Elasticsearch cluster -- fill these in before running.
es_host = ""                # host address with port
es_ca_certs = ""            # path to the certificate
es_credentials = ("", "")   # (username, password)

# Client object used by the indexing and search cells of this notebook.
es = Elasticsearch(es_host, ca_certs=es_ca_certs, basic_auth=es_credentials)

In [None]:
# Index name
index_name = ''

# Index mapping: every email field, including the date, is stored as
# analysed full-text ("text" type).
mapping = {
    "mappings": {
        "properties": {
            "subject": { "type": "text"},
            "from": { "type": "text" },
            "date": { "type": "text" },
            "to": {  "type": "text" },
            "cc": { "type": "text" },
            "reply-to": { "type": "text" },
            "body": { "type": "text" }
        }
    }
}

# Create the index only when it does not exist yet, so that re-running this
# cell (e.g. after a kernel restart) does not fail because the index is
# already there.
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body=mapping)

In [4]:
import os

# Path to the dataset folder with all the emails in txt format
dataset_path = ''

# Header names recognised at the top of an email file (compared in lower case,
# so "Subject:" and "SUBJECT:" are both accepted).
HEADER_FIELDS = ('subject', 'from', 'date', 'to', 'cc', 'reply-to')


def _parse_email_lines(lines):
    """Split the lines of one email file into header fields and a body.

    Leading lines of the form ``<Header>: <value>`` are consumed for as long
    as ``<Header>`` is one of ``HEADER_FIELDS`` (case-insensitive). Parsing
    stops at the first line that is not such a header; that line and
    everything after it become the body.

    Args:
        lines: list of text lines of the file, as returned by ``readlines()``.

    Returns:
        dict containing one entry per header found (lower-case header name ->
        value with surrounding whitespace removed) plus a ``'body'`` entry
        holding the remaining lines joined together.
    """
    email = {}
    body_start = len(lines)  # used when every line turned out to be a header
    for position, line in enumerate(lines):
        name, colon, value = line.partition(':')
        field = name.lower()
        if not colon or field not in HEADER_FIELDS:
            body_start = position
            break
        email[field] = value.strip()
    # The body starts where header parsing stopped, not at a fixed line
    # number, so files with fewer or more header lines lose no content.
    email['body'] = ''.join(lines[body_start:])
    return email


for file in os.listdir(dataset_path):
    file_path = os.path.join(dataset_path, file)
    if not os.path.isfile(file_path):
        # Sub-directories cannot be opened as emails.
        continue
    try:
        with open(file_path, 'r', encoding="utf-8") as f:
            lines = f.readlines()

        # Insert document
        es.index(index=index_name, body=_parse_email_lines(lines))
    except Exception as e:
        # Best effort: keep indexing the remaining files, but report the
        # failure instead of hiding it.
        print(f"Skipping {file!r}: {e}")
        continue

Now that the indexing is complete, let's run a search query.

In [5]:
# Write the text to search for in the email bodies here
query = {"query": {"match": {"body": ""}}}

# Run the search; `size` caps how many hits are returned (adjust as needed)
results = es.search(index=index_name, body=query, size=10)
hits = results['hits']

# Total number of matching documents
print("Total Hits:", hits['total']['value'])

# Document id followed by the full body of each returned hit
print("\nDocument IDs and content of top 10 hits:")
for hit in hits['hits']:
    print(hit['_id'], hit['_source']['body'], "\n", sep="\n")

Total Hits: 480

Document IDs and content of top 10 hits:
V-tafYsBXhn-9pfK8ENu
MA201_2022 ->Assignment ->Tutorial 7

You have submitted an assignment submission for 'Tutorial 7'.

You can see the status of your assignment submission.




petbfYsBXhn-9pfKG0iJ
CS/IT 333_2023 ->Assignment ->1

You have submitted an assignment submission for '1'.

You can see the status of your assignment submission.




M-tafYsBXhn-9pfK70Nc
MA201_2022 ->Assignment ->4. Joint Distribution

You have submitted an assignment submission for '4. Joint Distribution'.

You can see the status of your assignment submission.




NOtafYsBXhn-9pfK70Nk
MA201_2022 ->Assignment ->5. Covariance matrix

You have submitted an assignment submission for '5. Covariance matrix'.

You can see the status of your assignment submission.




WetafYsBXhn-9pfK6EKG
MA201_2022 ->Assignment ->3. Special Random variables

You have submitted an assignment submission for '3. Special Random variables'.

You can see the status of your assignm

Next, let's create bigram and trigram models.


In [6]:
from elasticsearch.helpers import scan
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
import random

In [7]:
# Language models, filled in by a later cell: each maps a prefix to the list
# of tokens observed right after it.
bigram_model, trigram_model = {}, {}

# Build the n-grams of a token sequence
def generate_ngrams(tokens, n):
    """Return the n-grams of `tokens` as a list.

    Thin wrapper around `nltk.util.ngrams` that materialises its result.

    Args:
        tokens: sequence of tokens.
        n: size of each n-gram.
    """
    return list(ngrams(tokens, n))

# Turn the text of one email into tokens, bigrams and trigrams
def process_email(email_text):
    """Tokenize `email_text` and derive its bigrams and trigrams.

    Args:
        email_text: text of the email as a string.

    Returns:
        A `(tokens, bigrams, trigrams)` tuple: the `word_tokenize` output
        followed by the lists produced by `generate_ngrams` for n=2 and n=3.
    """
    tokens = word_tokenize(email_text)
    bigrams, trigrams = (generate_ngrams(tokens, size) for size in (2, 3))
    return tokens, bigrams, trigrams

In [8]:
sender_name = ""  # Change to your desired sender name

# Match query on the "from" field, used to pick the emails the language
# models are built from.
sender_clause = {'from': sender_name}
query = {"query": {"match": sender_clause}}

In [9]:
# Use the scroll API (via the `scan` helper) to retrieve all matching emails.
# The index is taken from `index_name`, the same index the emails were
# written to, instead of a hard-coded name.
emails = scan(es, index=index_name, query=query)

for email in emails:
    # A document without a body contributes no n-grams.
    email_text = email["_source"].get("body", "")
    tokens, bigrams, trigrams = process_email(email_text)

    # Bigram model: first token -> list of tokens seen right after it.
    # Repeated entries are kept so frequent continuations are sampled more often.
    for prefix, suffix in bigrams:
        bigram_model.setdefault(prefix, []).append(suffix)

    # Trigram model: (first, second) token pair -> list of third tokens.
    for trigram in trigrams:
        prefix, suffix = trigram[:2], trigram[2]
        trigram_model.setdefault(prefix, []).append(suffix)

In [10]:
# Random text generation from the bigram model
def generate_bigram_text(bigram_model, seed_word, max_length=20):
    """Generate text by repeatedly sampling a successor of the last word.

    Args:
        bigram_model: dict mapping a word to the list of words that may follow it.
        seed_word: first word of the generated text.
        max_length: upper bound on the number of words produced; the seed
            word is always included.

    Returns:
        The generated words joined by single spaces. Generation stops early
        when the last word has no entry in `bigram_model`.
    """
    words = [seed_word]
    for _ in range(max_length - 1):
        last_word = words[-1]
        if last_word in bigram_model:
            words.append(random.choice(bigram_model[last_word]))
        else:
            break
    return " ".join(words)

# Function to generate text using trigram model

def generate_trigram_text(trigram_model, seed_prefix, max_length=20):
    """Generate text by sampling the next word from the last two words.

    Args:
        trigram_model: dict mapping a 2-tuple of words to the list of words
            that may follow that pair.
        seed_prefix: iterable of starting words (normally a pair). When it
            holds more than two words, only the last two are used as context.
        max_length: upper bound on the number of words produced; the seed
            words are always included.

    Returns:
        The generated words joined by single spaces. Generation stops early
        when the current two-word context has no entry in `trigram_model`.
    """
    text = list(seed_prefix)
    for _ in range(max_length - len(text)):
        # The model is keyed by word pairs, so the context is always the last
        # two words -- also for the very first lookup.
        current_prefix = tuple(text[-2:])
        if current_prefix not in trigram_model:
            break
        text.append(random.choice(trigram_model[current_prefix]))
    return " ".join(text)


In [None]:
# Show which sender the models were built for
print("\nSender Name:", sender_name)

# --- Bigram model ---
seed_word = ""   # Change to your desired seed word
max_length = 10  # Adjust the maximum length of the generated text

generated_bigram_text = generate_bigram_text(bigram_model, seed_word, max_length=max_length)
print("\nSeed Word:", seed_word)
print("Text Generated by Bigram Model:", generated_bigram_text, sep="\n")

# --- Trigram model ---
seed_prefix = ("", "")  # Change to your desired seed prefix
max_length = 20         # Adjust the maximum length of the generated text

generated_trigram_text = generate_trigram_text(trigram_model, seed_prefix, max_length=max_length)
print("\nSeed Prefix:", seed_prefix)
print("Text Generated by Trigram Model:", generated_trigram_text, sep="\n")