In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import csv

In [2]:
def get_body_from_message(msg):
    """
    Function to extract the body from an email message object.
    Handles plain text and HTML emails.
    """
    if msg.is_multipart():
        for part in msg.walk():
            if part.get_content_type() == 'text/plain':
                return extract_current_message(part.get_payload(decode=True).decode('utf-8', errors='ignore'))
            elif part.get_content_type() == 'text/html':
                html_body = part.get_payload(decode=True).decode('utf-8', errors='ignore')
                text = BeautifulSoup(html_body, 'html.parser').get_text()
                return extract_current_message(text)
    else:
        if msg.get_content_type() == 'text/plain':
            return extract_current_message(msg.get_payload(decode=True).decode('utf-8', errors='ignore'))
        elif msg.get_content_type() == 'text/html':
            html_body = msg.get_payload(decode=True).decode('utf-8', errors='ignore')
            text = BeautifulSoup(html_body, 'html.parser').get_text()
            return extract_current_message(text)

    return None

In [3]:
def extract_current_message(body):
    """
    Extract only the current message by identifying common email reply/forward markers.
    """
    # Splitting the email body by common reply/forward indicators
    reply_markers = [
        "On ",  # Common for replies like "On [date], [name] wrote:"
        "From:",  # Common in forwarded messages
        "Sent:",  # Often found in forwarded messages
        "--- Forwarded message ---",  # Explicit forward marker
        "Original Message"  # Another common forward marker
    ]
    
    # Go through markers and split the email at the first occurrence of any marker
    for marker in reply_markers:
        body = body.split(marker)[0]  # Keep only the portion before the marker
    
    return body.strip()

In [4]:
import mailbox
# Path to your .mbox file (from the extracted Gmail Takeout data)
mbox_path = "../../Downloads/Takeout/Mail/All mail Including Spam and Trash.mbox"

def search_emails():
    mbox = mailbox.mbox(mbox_path)
    msg_csv = pd.DataFrame(columns=['From','To','Date','Subject','Message'])
    from_li = []
    to_li = []
    date_li = []
    subject_li = []
    body_li = []
    for message in mbox:
        # Check if the email body contains the company name
        # if 'priyankaa' in message['From']:
            # print(f"Subject: {message['subject']}")
            # print(f"From: {message['from']}")
            # print(f"Date: {message['date']}")
            
        email_body = get_body_from_message(message)
        if email_body:
            # print("----- Email Body Start -----")
            # print(email_body.strip())  # Strip leading/trailing whitespaces
            # print("------ Email Body End ------\n")
            # print("-" * 50)
            from_li.append(message['from'])
            to_li.append(message['to'])
            date_li.append(message['date'])
            subject_li.append(message['subject'])
            body_li.append(email_body.strip())
                      

            # break
    msg_csv = pd.DataFrame({'From':from_li,
                            'To' : to_li,
                            'Date' : date_li,
                            'Subject':subject_li,
                            'Message':body_li})

    return msg_csv
# Search for emails related to a specific company (e.g., GFN)
df = search_emails()


In [5]:
df

Unnamed: 0,From,To,Date,Subject,Message
0,Capgemini Deutschland <noreply@msg.jobylon.com>,Priyankaa Elangovan <priyankaa.elangovanj@gmai...,"Wed, 11 Sep 2024 10:52:00 +0000",Your application at Capgemini,"Hello Priyankaa,\r\n\r\n\r\n\r\nThank you for ..."
1,EGYM GmbH <jobs@egym.com>,Priyankaa Elangovan <priyankaa.elangovanj@gmai...,"Mon, 9 Sep 2024 06:48:42 -0400",Thank you for your application at EGYM!,"Dear Priyankaa,\r\n\r\nThank you for your appl..."
2,Elizabeth Wasike <entrix-jobs@m.personio.de>,Priyankaa Elangovan <priyankaa.elangovanj@gmai...,"Mon, 9 Sep 2024 12:03:03 +0000",Regarding Your Application at Entrix,"Dear Priyankaa,Thank you very much for your ap..."
3,Ricardo Ghekiere <rghekiere@betterpic.io>,priyankaa.elangovanj@gmail.com,"Wed, 11 Sep 2024 11:44:40 +0000",You disappeared?,Image \r\nSupport ( https://support.betterpic....
4,Ryan Choi <workatastartup@ycombinator.com>,priyankaa.elangovanj@gmail.com,"Tue, 10 Sep 2024 11:12:59 +0000",Hello from Y Combinator's Work at a Startup!,Hi!\r\n\r\nThanks for applying to a YC company...
...,...,...,...,...,...
293,talent@auterion.com,priyankaa.elangovanj@gmail.com,"Wed, 14 Aug 2024 13:02:10 +0000",Thank you for applying to Auterion,"Hello Priyankaa,\r\n\r\nThank you for applying..."
294,=?utf-8?q?=22Rainer_M=C3=BCller=22?=\r\n <noti...,priyankaa.elangovanj@gmail.com,"Thu, 05 Sep 2024 07:38:16 +0000",Thank you for your interest in Korro AI -Senio...,Reply above to continue the conversation with ...
295,LinkedIn <jobs-noreply@linkedin.com>,Priyankaa Elangovan <priyankaa.elangovanj@gmai...,"Thu, 15 Aug 2024 12:43:39 +0000 (UTC)","Priyankaa, your application was sent to Berkel...",Your application was sent to Berkeley Square -...
296,LinkedIn <jobs-noreply@linkedin.com>,Priyankaa Elangovan <priyankaa.elangovanj@gmai...,"Wed, 28 Aug 2024 16:48:56 +0000 (UTC)","=?UTF-8?Q?Priyankaa,_your_application_was_sent...",Your application was sent to SR2 | Socially Re...


In [6]:
df.to_csv('Mails.csv', index=False, quotechar='"', quoting=csv.QUOTE_MINIMAL)
