In [None]:
%%html
<!-- 
If you can see this code, this cell's output is not trusted.
Please execute this cell and save the notebook, or click File -> Trust Notebook
-->
<script>
// Visibility flag read and flipped by toggle_cell_visibility on every call.
var shown = true;

/**
 * Collect every notebook cell whose metadata tags include `tag`.
 *
 * @param {string} tag - Tag name compared against each entry of a cell's
 *     `_metadata.tags` array.
 * @returns {Array} The matching cells from `Jupyter.notebook.get_cells()`,
 *     in notebook order; an empty array when nothing matches.
 */
function filter_cells_by_tag(tag) {
    // Declared with `var` so both stay function-local instead of becoming
    // implicit globals named `out` and `all_cells`.
    var out = [];
    var all_cells = Jupyter.notebook.get_cells();
    for (var i = 0; i < all_cells.length; i++) {
        var curr_cell = all_cells[i];
        var tags = curr_cell._metadata.tags;
        // Cells without a `tags` entry are skipped (`!= undefined` also
        // rejects null).
        if (tags != undefined) {
            for (var j = 0; j < tags.length; j++) {
                if (tags[j] == tag) {
                    out.push(curr_cell);
                    // Stop at the first match so a cell is never added twice.
                    break;
                }
            }
        }
    }
    return out;
}

/**
 * Show or hide every notebook cell carrying `tag`.
 *
 * @param {string} tag - Tag passed to filter_cells_by_tag to select cells.
 * @param {boolean} show - Truthy calls `.show()` on the target, falsy `.hide()`.
 * @param {boolean} input_only - Truthy targets only `cell.input`; otherwise
 *     the whole `cell.element` is targeted.
 */
function set_cell_visibility(tag, show, input_only) {
    var marked_cells = filter_cells_by_tag(tag);
    for (var i = 0; i < marked_cells.length; i++) {
        var curr_cell = marked_cells[i];
        // `var` keeps the target local rather than leaking a global `obj`.
        var obj = input_only ? curr_cell.input : curr_cell.element;
        if (show) {
            obj.show();
        } else {
            obj.hide();
        }
    }
}

/**
 * Flip the visibility of every cell carrying `tag`.
 *
 * The state lives in the script-level `shown` flag, which is shared by all
 * tags passed to this function.
 *
 * @param {string} tag - Tag of the cells to show or hide.
 */
function toggle_cell_visibility(tag) {
    // Flip first, then apply: `shown` starts out true, so applying the old
    // value would make the very first click a no-op.
    shown = !shown;
    set_cell_visibility(tag, shown, false);
}

// Runs as soon as the script loads: hide only the input area (input_only =
// true) of every cell tagged 'execution_cell'.
set_cell_visibility('execution_cell', false, true);
</script>
To toggle visibility of explanation cells click <a href="javascript:toggle_cell_visibility('explanatory_cell')">here</a>


# Email Preprocessing

This notebook defines the steps for extracting the different components (header, body, attachments, etc.) of an email (`.eml` file). To see how to create a generalized API for all documents, see the `pipeline-general` directory.

To demonstrate how off-the-shelf Unstructured Bricks extract meaningful data from complex source documents, we will apply a series of Bricks with explanations.

#### Table of Contents

1. [Take a Look at a Raw EML File](#explore)
1. [Custom Partitioning Bricks](#custom)
1. [Cleaning Bricks](#cleaning)
1. [Staging Bricks](#staging)

## Section 1: Take a Look at a Raw EML File <a id="explore"></a>

Let's take a look at an email with an attachment. As you will see below, there is metadata about the email at the top (sender, recipient, subject, etc.), and if you scroll down, you will see the different sections of the email and its metadata. There is one part, `X-MS-Has-Attach: yes`, which indicates this email has an attachment.

In [None]:
import os
import json


def get_filename(directory, filename):
    """Return the full path of ``filename`` inside a ``directory`` folder.

    Two locations are checked, in order: ``directory`` under the parent of
    the current working directory, then ``directory`` inside the current
    working directory itself.

    Args:
        directory: Name of the folder expected to hold the file.
        filename: Name of the file, matched against the folder's listing.

    Returns:
        The joined path in the first location whose listing contains
        ``filename``.

    Raises:
        FileNotFoundError: If neither location contains ``filename``. The
            message names the file and the folders that were searched.
    """
    cwd = os.getcwd()
    local_directory = os.path.join(os.path.split(cwd)[0], directory)
    ci_directory = os.path.join(cwd, directory)
    candidates = [local_directory, ci_directory]

    for candidate in candidates:
        # isdir rather than exists: a regular file that happens to share the
        # folder's name is skipped instead of making os.listdir raise.
        if os.path.isdir(candidate) and filename in os.listdir(candidate):
            return os.path.join(candidate, filename)

    raise FileNotFoundError(
        f"{filename!r} not found in any of: {', '.join(candidates)}"
    )

In [None]:
# Resolve the path of the sample email via get_filename.
filename = get_filename("sample-docs", "fake-email-attachment.eml")

In [None]:
import email

# Parse the .eml file into a message object with the standard-library email package.
with open(filename) as f:
    msg = email.message_from_file(f)

In [None]:
# Take a look at the eml file with all the metadata and content.
# msg.walk() yields the message itself first and then its subparts, so the
# first item printed already contains the whole email.
for part in msg.walk():
    print(part)

MIME-Version: 1.0
Date: Fri, 23 Dec 2022 12:08:48 -0600
Message-ID: <CAPgNNXSzLVJ-d1OCX_TjFgJU7ugtQrjFybPtAMmmYZzphxNFYg@mail.gmail.com>
Subject: Fake email with attachment
From: Mallori Harrell <mallori@unstructured.io>
To: Mallori Harrell <mallori@unstructured.io>
Content-Type: multipart/mixed; boundary="0000000000005d654405f082adb7"

--0000000000005d654405f082adb7
Content-Type: multipart/alternative; boundary="0000000000005d654205f082adb5"

--0000000000005d654205f082adb5
Content-Type: text/plain; charset="UTF-8"

Hello!

Here's the attachments!

It includes:

   - Lots of whitespace
   - Little to no content
   - and is a quick read

Best,

Mallori

--0000000000005d654205f082adb5
Content-Type: text/html; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable

<div dir=3D"ltr">Hello!=C2=A0<div><br></div><div>Here&#39;s the attachments=
!</div><div><br></div><div>It includes:</div><div><ul><li style=3D"margin-l=
eft:15px">Lots of whitespace</li><li style=3D"margin-left:15px">Litt

In [None]:
# Take a closer look at the header section of the eml file:
# raw_items() yields each header as a (name, value) tuple.
for part in msg.raw_items():
    print(part)

('MIME-Version', '1.0')
('Date', 'Fri, 23 Dec 2022 12:08:48 -0600')
('Message-ID', '<CAPgNNXSzLVJ-d1OCX_TjFgJU7ugtQrjFybPtAMmmYZzphxNFYg@mail.gmail.com>')
('Subject', 'Fake email with attachment')
('From', 'Mallori Harrell <mallori@unstructured.io>')
('To', 'Mallori Harrell <mallori@unstructured.io>')
('Content-Type', 'multipart/mixed; boundary="0000000000005d654405f082adb7"')


## Section 2: Custom Partitioning Bricks <a id="custom"></a>

Let's take a look at the body text of the eml file.

In [None]:
from unstructured.partition.email import partition_email

# Partition the email body into a list of document elements.
elements = partition_email(filename=filename)

In [None]:
# Display the element objects returned by partition_email.
elements

[<unstructured.documents.html.HTMLTitle>,
 <unstructured.documents.html.HTMLNarrativeText>,
 <unstructured.documents.html.HTMLNarrativeText>,
 <unstructured.documents.html.HTMLListItem>,
 <unstructured.documents.html.HTMLListItem>,
 <unstructured.documents.html.HTMLListItem>,
 <unstructured.documents.html.HTMLText>,
 <unstructured.documents.html.HTMLTitle>]

In [None]:
# Print the text of every partitioned element, one per line. Iterating over
# the list avoids hard-coded indices, which raise IndexError for emails with
# fewer elements and silently skip the extras for emails with more.
for element in elements:
    print(element.text)

Hello! 
Here's the attachments!
It includes:
Lots of whitespace
Little to no content
and is a quick read
Best,
Mallori


In [None]:
# Printing an element directly shows the same text as its .text attribute.
for element in elements:
    print(element)

Hello! 
Here's the attachments!
It includes:
Lots of whitespace
Little to no content
and is a quick read
Best,
Mallori


We can use the same code with extra parameters to also extract the header of the eml file

In [None]:
# Same partition call as before, now with include_headers=True so the
# email's header is extracted as well.
elements_with_header = partition_email(filename=filename, include_headers=True)

Let's also extract the attachment from the eml file. We can extract the file's metadata and payload. You can save the attachment to your local drive by specifying a directory for the `output_dir` parameter.

In [None]:
from unstructured.partition.email import extract_attachment_info
# Parse the email again, then extract its attachment info (filename and payload).
with open(filename) as f:
    msg = email.message_from_file(f)
    
attachments = extract_attachment_info(msg)

In [None]:
# Display the extracted attachment records.
attachments

[{'filename': 'fake-attachment.txt',
  'payload': b'Hey this is a fake attachment!'}]

## Section 3: Cleaning Bricks <a id="cleaning"></a>

In addition to partitioning bricks, the Unstructured library has
***cleaning*** bricks for removing unwanted content from text. In this
case, we'll solve our whitespace problem by using the
`clean_extra_whitespace` brick. Other uses for cleaning bricks include
cleaning out boilerplate, sentence fragments, and other segments
of text that could impact labeling tasks or the accuracy of
machine learning models. As with partitioning bricks, users can
include custom cleaning bricks in a pipeline.

In [None]:
# This element ends with a non-breaking space ("\xa0") carried over from the HTML body
elements[0].text

'Hello!\xa0'

In [None]:
from functools import partial
from unstructured.cleaners.core import clean_extra_whitespace, remove_punctuation

# Strip the surplus whitespace (here the trailing "\xa0") from the first element's text.
clean_extra_whitespace(elements[0].text)

'Hello!'

In [None]:
# This element's text ends with a colon.
elements[2].text

'It includes:'

In [None]:
# remove_punctuation returns the text without the trailing colon.
remove_punctuation(elements[2].text)

'It includes'

In [None]:
import copy

# Clean copies rather than the originals: `apply` rewrites an element's text
# in place, so cleaning the objects held in `elements` directly would also
# overwrite the raw text that earlier cells display.
cleaners = [clean_extra_whitespace, remove_punctuation]
clean_elements = []
for element in elements:
    cleaned = copy.deepcopy(element)
    cleaned.apply(*cleaners)
    clean_elements.append(cleaned)

## Section 4: Staging Bricks<a id="staging"></a>

In [None]:
# After cleaning, the first element has neither trailing whitespace nor punctuation.
clean_elements[0].text

'Hello'

In [None]:
# After cleaning, the trailing colon is gone from the third element.
clean_elements[2].text

'It includes'

In [None]:
from unstructured.staging.label_studio import stage_for_label_studio

# Convert the cleaned elements into Label Studio records (one dict per element
# with its text and a ref_id) and display them.
label_studio_data = stage_for_label_studio(clean_elements)
label_studio_data

[{'data': {'text': 'Hello', 'ref_id': '924fccbb252c7027c692cab39aa9c952'}},
 {'data': {'text': 'Heres the attachments',
   'ref_id': 'cfa6629abfb0222b3b4b89ed9333280a'}},
 {'data': {'text': 'It includes',
   'ref_id': '3fe3ec352ef9c8089ee5feb6ddf8d324'}},
 {'data': {'text': 'Lots of whitespace',
   'ref_id': '0cd12c1692d24b6ceaed1baaf82d6186'}},
 {'data': {'text': 'Little to no content',
   'ref_id': 'b11ebd7a352bca2b850f79cb100591de'}},
 {'data': {'text': 'and is a quick read',
   'ref_id': '1f4fcaa93cbb457d397235afcb380953'}},
 {'data': {'text': 'Best', 'ref_id': 'b69770ef35263fe11a6796b022b66698'}},
 {'data': {'text': 'Mallori', 'ref_id': 'b594cbd758a427db6c4a4a967e893e23'}}]