In [26]:
import email
import os
import re
from email import policy
from email.parser import BytesParser

import pandas as pd
from bs4 import BeautifulSoup

In [27]:
# Path of the sample message to parse, relative to the notebook's working
# directory. Built with os.path.join so the separators match the host OS;
# the previous literal used backslashes and therefore only resolved on Windows.
test = os.path.join("output_pst_file", "Outlook Data File", "Inbox", "31.eml")

In [28]:
def parse_email_file(filepath):
    """Parse a single email file and return a dictionary of data.

    Args:
        filepath: Path of the message file to read (opened in binary mode).

    Returns:
        dict holding the header values ``From``, ``To``, ``Subject``,
        ``Date``, ``Message-ID``, ``In-Reply-To``, ``References`` and
        ``Thread-Topic`` (``None`` for any header the message lacks), plus:

        * ``Body_Text`` - body text with runs of blank lines collapsed, taken
          from the first ``text/html`` part or, when there is none, from the
          first ``text/plain`` part; ``None`` if the message has neither.
        * ``Emails_Mentioned`` - comma-separated addresses found in
          ``Body_Text``, de-duplicated in order of first appearance, or
          ``None`` when nothing matched.

    Raises:
        OSError: If ``filepath`` cannot be opened.
    """
    # Read the file the caller asked for rather than a notebook-level global.
    with open(filepath, 'rb') as f:
        msg = BytesParser(policy=policy.default).parse(f)

    # Extract basic headers
    header_names = (
        'From',
        'To',
        'Subject',
        'Date',
        'Message-ID',
        'In-Reply-To',
        'References',
        'Thread-Topic',
    )
    data = {name: msg[name] for name in header_names}

    # Always emit the body keys so every parsed message has the same schema,
    # even when no usable body part is found.
    data['Body_Text'] = None
    data['Emails_Mentioned'] = None

    # Locate the body. walk() also yields the message itself, so this one loop
    # covers both multipart and single-part messages.
    body_html = None
    plain_part = None
    for part in msg.walk():
        content_type = part.get_content_type()
        if content_type == 'text/html':
            body_html = part.get_content()
            break
        if content_type == 'text/plain' and plain_part is None:
            # Keep the part, not its decoded text: it is only decoded if no
            # HTML part turns up.
            plain_part = part

    # Parse email body
    text_content = None
    if body_html:
        soup = BeautifulSoup(body_html, 'html.parser')
        text_content = soup.get_text(separator='\n', strip=True)
    elif plain_part is not None:
        text_content = plain_part.get_content().strip()

    if text_content is not None:
        # Get clean text (remove excessive whitespace)
        text_content = re.sub(r'\n\s*\n', '\n\n', text_content)
        data['Body_Text'] = text_content

        # Look for email addresses in the body; dict.fromkeys de-duplicates
        # while keeping first-seen order, so the result is reproducible.
        emails_in_body = re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', text_content)
        unique_emails = list(dict.fromkeys(emails_in_body))
        data['Emails_Mentioned'] = ', '.join(unique_emails) if unique_emails else None

    return data

# Parse the email at the configured path. The earlier literal '33.eml' did not
# match `test`, which is the file whose contents are actually shown below.
email_data = parse_email_file(test)
df = pd.DataFrame([email_data])

# Display results
print("Email Headers:")
print(f"From: {df['From'].iloc[0]}")
print(f"To: {df['To'].iloc[0]}")
print(f"Subject: {df['Subject'].iloc[0]}")
print(f"Date: {df['Date'].iloc[0]}")
print(f"\nIs Reply: {df['In-Reply-To'].iloc[0] is not None}")

# A message without a usable body has no 'Body_Text' value; fall back to an
# empty preview instead of failing on the lookup or the slice.
body_preview = email_data.get('Body_Text') or ''
print(f"\nBody Preview:\n{body_preview[:500]}...")

# Show full dataframe
print("\n" + "="*80)
print(df.T)  # Transpose for better viewing

Email Headers:
From: PGR Pro <pgrprodoctoralschool@aru.ac.uk>
To: pst118@pgr.aru.ac.uk
Subject: Booking confirmation for SE41: Advanced MS Word Training for PGRs - FSE Only
Date: Fri, 26 Sep 2025 05:10:00 +0000

Is Reply: False

Body Preview:
PGR Pro
Dear PATRICK
You are now successfully booked on the following event:
SE41: Advanced MS Word Training for PGRs - FSE Only
Please make sure you can attend all of the sessions listed below:
Session 1
Start date: 22-Oct-2025 14:00
End Date: 22-Oct-2025 15:00
Venue: Online course, Online

Directions: Details of how to connect to your training will be emailed to you shortly before your course.
Facilitator(s): Alex Mann, Andrea Packwood
--------------------------------------------
Etiquette: Ar...

                                                                  0
From                       PGR Pro <pgrprodoctoralschool@aru.ac.uk>
To                                             pst118@pgr.aru.ac.uk
Subject           Booking confirmation for SE41: