# Parsing Source Text From Enron Dataset
This notebook extracts the message content from the Enron dataset. The Enron dataset can be downloaded and extracted using <a href='gather_enron_dataset.ipynb'>gather_enron_dataset.ipynb</a>.

#### Imports

In [1]:
import os
from os import walk
import pandas as pd

### Helper functions

In [2]:
# Create some helper functions
def get_email_meta(file_path):
    """Extract header metadata and the top-level message body from one email file.

    The file is read as text with the platform default encoding. Lines are
    treated as header lines up to and including the ``X-FileName: `` line,
    which appears to be the last header line in these files. Everything after
    it is body text until a line containing "Original Message" is found; that
    line marks the start of quoted reply/forward content, so parsing stops
    there and the quoted part is not included.

    Parameters
    ----------
    file_path : str
        Path of the email file to parse.

    Returns
    -------
    dict
        Keys ``file_path``, ``message_id``, ``date``, ``from_address``,
        ``to_address``, ``org_filename``, ``is_reply_forward`` and
        ``message``. Header fields that are absent from the file are empty
        strings. ``message`` is the body with every line preceded by a
        newline. If the file cannot be decoded, every text field is the
        string ``"BAD_FILE"`` and ``is_reply_forward`` is ``False``.
    """
    # Header prefix -> key of the returned dict it populates.
    header_fields = (
        ("Message-ID: ", 'message_id'),
        ("Date: ", 'date'),
        ("From: ", 'from_address'),
        ("To: ", 'to_address'),
    )
    filename_prefix = "X-FileName: "

    try:
        with open(file_path) as file:
            # Decoding happens while iterating, so this is where bad bytes surface.
            lines = [line.rstrip() for line in file]
    except UnicodeDecodeError:
        return {
            'file_path' : file_path,
            'message_id' : "BAD_FILE",
            'date' : "BAD_FILE",
            'from_address' : "BAD_FILE",
            'to_address' : "BAD_FILE",
            'org_filename' : "BAD_FILE",
            'is_reply_forward' : False,
            'message' : "BAD_FILE",
        }

    # Every field starts with a default so a missing header line (including
    # Message-ID) yields an empty string instead of an unbound variable.
    email = {
        'file_path' : file_path,
        'message_id' : "",
        'date' : "",
        'from_address' : "",
        'to_address' : "",
        'org_filename' : "",
        'is_reply_forward' : False,
        'message' : "",
    }

    in_header = True
    body_lines = []

    for line in lines:
        if in_header:
            # Header prefixes are only honoured inside the header, so body text
            # such as "From: ..." can no longer overwrite the real metadata.
            if line.startswith(filename_prefix):
                email['org_filename'] = line[len(filename_prefix):]
                in_header = False
                continue

            for prefix, key in header_fields:
                if line.startswith(prefix):
                    # Strip only the leading prefix; str.replace would also
                    # remove later occurrences inside the value.
                    email[key] = line[len(prefix):]
                    break
            continue

        # "Original Message" marks the start of the replied-to / forwarded
        # content; nothing after it belongs to this email's own message.
        if "Original Message" in line:
            email['is_reply_forward'] = True
            break

        body_lines.append(line)

    email['message'] = "".join("\n" + body_line for body_line in body_lines)
    return email

## Find email files
If the email files were properly downloaded and extracted using the <a href='gather_enron_dataset.ipynb'>gather_enron_dataset notebook</a>, there should be over 500,000 files to process.

In [3]:
# Root folder of the extracted email files
mypath = "data/maildir/"

# Walk the whole tree under mypath and collect the full path of every file
files = [
    os.path.join(dir_path, file_name)
    for dir_path, _dir_names, file_names in os.walk(mypath)
    for file_name in file_names
]

print("{} files found.".format(len(files)))

517415 files found.


## Extract Data
Extract the metadata and messages from all the documents in the dataset.

In [4]:
%%time
# Parse every discovered file and build one DataFrame with a row per email.
# The list of dicts goes straight into the constructor; wrapping a single
# frame in pd.concat only made an extra copy of the data.
df = pd.DataFrame([get_email_meta(file_path) for file_path in files])

CPU times: user 24.3 s, sys: 3.31 s, total: 27.7 s
Wall time: 27.7 s


In [5]:
%%time
# Create the output folder first so a fresh run does not fail on a missing directory.
output_path = "data/enron_extracted/email_data.parquet"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_parquet(output_path)

CPU times: user 2.68 s, sys: 966 ms, total: 3.65 s
Wall time: 3.58 s
