In [1]:
import os
import email
from email import policy
import pandas as pd
import numpy as np
import re

In [2]:
def extract_url_from_body(body):
    """Extract the first URL from email body, return np.nan if none found"""
    if not body or body.strip() == "":
        return np.nan
    
    url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+'
    
    match = re.search(url_pattern, body)
    if match:
        return match.group(0)
    else:
        return np.nan

def parse_email_file(file_path):
    """Parse a single email file and extract key information"""
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            msg = email.message_from_file(f)
        
        body = ""
        if msg.is_multipart():
            for part in msg.walk():
                if part.get_content_type() == "text/plain":
                    try:
                        payload = part.get_payload(decode=True)
                        if payload:
                            body += payload.decode('utf-8', errors='ignore')
                    except:
                        pass
        else:
            try:
                payload = msg.get_payload(decode=True)
                if payload:
                    body = payload.decode('utf-8', errors='ignore')
                else:
                    body = str(msg.get_payload())
            except:
                body = str(msg.get_payload())
        
        url = extract_url_from_body(body)
        
        return {
            'file_path': file_path,
            'from': msg.get('From', ''),
            'to': msg.get('To', ''),
            'subject': msg.get('Subject', ''),
            'date': msg.get('Date', ''),
            'cc': msg.get('Cc', ''),
            'bcc': msg.get('Bcc', ''),
            'body': body,
            'url': url
        }
    except Exception as e:
        print(f"Error parsing {file_path}: {e}")
        return None
            

def read_limited_emails(root_dir, limit):
    """Read limited number of emails"""
    emails = []
    count = 0
    
    for dirpath, dirnames, filenames in os.walk(root_dir):
        for filename in filenames:
            if not filename.startswith('.'):
                if count >= limit:
                    print(f"Reached limit of {limit} emails")
                    return emails
                
                file_path = os.path.join(dirpath, filename)
                email_data = parse_email_file(file_path)
                if email_data:
                    emails.append(email_data)
                    count += 1
                    if count % 10000 == 0:
                        print(f"Processed {count} emails...")
    
    return emails

In [3]:
emails_list = read_limited_emails("/kaggle/input/raw-enron-dataset/maildir", limit=1000000)
df = pd.DataFrame(emails_list)

Processed 10000 emails...
Processed 20000 emails...
Processed 30000 emails...
Processed 40000 emails...
Processed 50000 emails...
Processed 60000 emails...
Processed 70000 emails...
Processed 80000 emails...
Processed 90000 emails...
Processed 100000 emails...
Processed 110000 emails...
Processed 120000 emails...
Processed 130000 emails...
Processed 140000 emails...
Processed 150000 emails...
Processed 160000 emails...
Processed 170000 emails...
Processed 180000 emails...
Processed 190000 emails...
Processed 200000 emails...
Processed 210000 emails...
Processed 220000 emails...
Processed 230000 emails...
Processed 240000 emails...
Processed 250000 emails...
Processed 260000 emails...
Processed 270000 emails...
Processed 280000 emails...
Processed 290000 emails...
Processed 300000 emails...
Processed 310000 emails...
Processed 320000 emails...
Processed 330000 emails...
Processed 340000 emails...
Processed 350000 emails...
Processed 360000 emails...
Processed 370000 emails...
Processed 

In [4]:
print(f"\nTotal emails in list: {len(emails_list)}")
print(f"Total emails in dataframe: {len(df)}")
print("\nFirst few rows:")
print(df.head())


Total emails in list: 517401
Total emails in dataframe: 517401

First few rows:
                                           file_path  \
0  /kaggle/input/raw-enron-dataset/maildir/neal-s...   
1  /kaggle/input/raw-enron-dataset/maildir/neal-s...   
2  /kaggle/input/raw-enron-dataset/maildir/neal-s...   
3  /kaggle/input/raw-enron-dataset/maildir/neal-s...   
4  /kaggle/input/raw-enron-dataset/maildir/neal-s...   

                       from  \
0  matt.motsinger@enron.com   
1      scott.neal@enron.com   
2      scott.neal@enron.com   
3      scott.neal@enron.com   
4      scott.neal@enron.com   

                                                  to  \
0  hunter.shively@enron.com, phillip.allen@enron....   
1                                    kmurra1@lsu.edu   
2                          jana.giovannini@enron.com   
3                            oscar_harrison@msdw.com   
4                          john.shoobridge@enron.com   

                                 subject  \
0  Average EOL

In [6]:
summary = pd.DataFrame({
    "index" : range(len(df.columns)),
    "column" : df.columns,
    "length" : len(df),
    "nan_count" : df.isna().sum().values
})

print(summary)


   index     column  length  nan_count
0      0  file_path  517401          0
1      1       from  517401          0
2      2         to  517401          0
3      3    subject  517401          0
4      4       date  517401          0
5      5         cc  517401          0
6      6        bcc  517401          0
7      7       body  517401          0
8      8        url  517401     450273


In [8]:
output_file = "Enron_Emails.csv"
print(f"\nSaving to {output_file}")
df.to_csv("Enron_Emails.csv", index = False , encoding = "utf-8")
print(f"✓ Successfully saved to {output_file}")


Saving to Enron_Emails.csv
✓ Successfully saved to Enron_Emails.csv
