In [1]:
import re
from email import policy
from email.parser import BytesParser

def parse_email(raw_email_bytes):
    """
    Parse raw email bytes and return the cleaned subject and body as one string.

    The subject header and the first non-attachment ``text/plain`` part (or the
    whole payload of a non-multipart text message) are cleaned: URLs and
    non-alphanumeric characters are removed, whitespace is collapsed to single
    spaces, and the result is lower-cased. The cleaned subject and the cleaned
    body are also printed, one per line.

    Args:
        raw_email_bytes (bytes): Raw email data from .eml file or email server.

    Returns:
        str: Cleaned subject and cleaned body joined by a single space. Empty
        pieces are left out, so a message without a usable text body yields
        just the cleaned subject (and an empty string if both are empty).
    """
    # Parse the raw email bytes
    msg = BytesParser(policy=policy.default).parsebytes(raw_email_bytes)

    # A missing Subject header comes back as None; normalise to a plain str.
    subject = str(msg['subject'] or '')
    body = ''

    # Extract plain text body
    if msg.is_multipart():
        for part in msg.walk():
            # Skip only real attachments: mail clients often mark the actual
            # body part as "inline", which must still count as the body.
            if (part.get_content_type() == 'text/plain'
                    and part.get_content_disposition() != 'attachment'):
                body = part.get_content()
                break
    elif msg.get_content_maintype() == 'text':
        # For non-text single-part messages get_content() returns bytes,
        # which the regex cleaning below cannot handle; leave body empty.
        body = msg.get_content()

    # Cleaning function
    def clean_text(text):
        text = re.sub(r'http\S+', '', text)            # Remove URLs
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)     # Remove special characters
        # Collapse whitespace last so gaps left by removed characters
        # (e.g. "a - b") do not survive as double spaces.
        text = re.sub(r'\s+', ' ', text)
        return text.strip().lower()

    cleaned_subject = clean_text(subject)
    cleaned_body = clean_text(body)

    # Kept for backward compatibility with the existing notebook output.
    print(cleaned_subject)
    print(cleaned_body)

    return ' '.join(piece for piece in (cleaned_subject, cleaned_body) if piece)


In [3]:
# Hand-written sample message used to exercise parse_email. Its headers declare
# multipart/alternative with boundary "abc123"; the body carries the same note
# twice, once as a text/plain part and once as a text/html part.
sample_email_bytes = b"""\
From: John Doe <john@example.com>
To: Jane Smith <jane@example.com>
Subject: Follow-up on Project Proposal
MIME-Version: 1.0
Content-Type: multipart/alternative; boundary="abc123"

--abc123
Content-Type: text/plain; charset="UTF-8"

Hi Jane,

Just checking in regarding the project proposal we discussed last Friday.
Let me know if you need anything from my side.

Thanks,
John

--abc123
Content-Type: text/html; charset="UTF-8"

<html>
  <body>
    <p>Hi Jane,</p>
    <p>Just checking in regarding the project proposal we discussed last Friday.</p>
    <p>Let me know if you need anything from my side.</p>
    <p>Thanks,<br>John</p>
  </body>
</html>

--abc123--
"""
# Example usage: run the parser on the sample message defined above.
parse_email(sample_email_bytes)

followup on project proposal
hi jane just checking in regarding the project proposal we discussed last friday let me know if you need anything from my side thanks john


In [None]:
    # NOTE(review): this cell has no execution count, its lines are indented,
    # and it uses `return` outside any function, so it cannot run as a
    # top-level cell. It reads names (cleaned_subject, cleaned_body) that are
    # only assigned inside parse_email, so it looks like the intended tail of
    # that function -- confirm, then fold it into the function or delete it.
    print(f"Cleaned mail: {cleaned_subject} {cleaned_body}")
    return f"{cleaned_subject} {cleaned_body}"