In [2]:
%%html
<!-- 
If you can see this code, this cell's output is not trusted.
Please execute this cell and save the notebook, or click File -> Trust Notebook
-->
<script>
var shown = true;

function filter_cells_by_tag(tag) {
    out = Array();
    all_cells = Jupyter.notebook.get_cells()
    for (var i=0; i<all_cells.length; i++) {
        var curr_cell = all_cells[i];
        var tags = curr_cell._metadata.tags;
        if (tags != undefined) {
            for (var j=0; j<tags.length; j++) {
                var curr_tag = tags[j];
                if (curr_tag == tag) {
                    out.push(curr_cell);
                    break;
                }
            }
        }
    }
    return out;
}

function set_cell_visibility(tag, show, input_only) {
    var cells = Jupyter.notebook.get_cells();
    var marked_cells = filter_cells_by_tag(tag);
    for (var i=0; i<marked_cells.length; i++) {
        var curr_cell = marked_cells[i];
        if (input_only) {
            obj = curr_cell.input
        } else {
            obj = curr_cell.element
        }
        if (show) {
            obj.show();
        } else {
            obj.hide();
        }
    }
}

function toggle_cell_visibility(tag) {
    set_cell_visibility(tag, shown, false)
    shown = ! shown;
}

set_cell_visibility('execution_cell', false, true);
</script>
To toggle visibility of explanation cells click <a href="javascript:toggle_cell_visibility('explanatory_cell')">here</a>


# SEC Filing Section Pipeline

This notebook defines the pipeline for extracting the narrative text sections
from the 10-K, 10-Q, and S-1 filings. This notebook contains both
exploration code and the code for defining the API. Code cells marked
with `#pipeline-api` are included in the API definition.

To demonstrate how off-the-shelf Unstructured Bricks extract
meaningful data from complex source documents, we will apply
a series of Bricks with explanations before defining the API.

#### Table of Contents

1. [Pulling in Raw Documents](#raw)
1. [Reading the Document](#reading)
1. [Custom Partitioning Bricks](#custom)
1. [Cleaning Bricks](#cleaning)
1. [Staging Bricks](#staging)
1. [Define the Pipeline API](#pipeline)

## Section 1: Open Raw Documents <a id="raw"></a>

First, let's pull in the `.eml` files from a local directory

In [1]:
directory = '../../sample_documents/sample-pdfs-from-breaches/'

In [2]:
from pathlib import Path
all_paths = []
files = Path("../../sample_documents/sample-eml-from-breaches/").glob('*.eml')
for file in files:
    all_paths.append(str(file))
print(all_paths)

['../../sample_documents/sample-eml-from-breaches/2093.eml', '../../sample_documents/sample-eml-from-breaches/768.eml', '../../sample_documents/sample-eml-from-breaches/2939.eml', '../../sample_documents/sample-eml-from-breaches/3399.eml', '../../sample_documents/sample-eml-from-breaches/2087.eml', '../../sample_documents/sample-eml-from-breaches/2911.eml', '../../sample_documents/sample-eml-from-breaches/754.eml', '../../sample_documents/sample-eml-from-breaches/998.eml', '../../sample_documents/sample-eml-from-breaches/740.eml', '../../sample_documents/sample-eml-from-breaches/2905.eml', '../../sample_documents/sample-eml-from-breaches/1559.eml', '../../sample_documents/sample-eml-from-breaches/2050.eml', '../../sample_documents/sample-eml-from-breaches/3428.eml', '../../sample_documents/sample-eml-from-breaches/2736.eml', '../../sample_documents/sample-eml-from-breaches/973.eml', '../../sample_documents/sample-eml-from-breaches/967.eml', '../../sample_documents/sample-eml-from-breac

In [3]:
from email import policy
from email.parser import BytesParser
from email.message import EmailMessage
from pathlib import Path
from typing import List, Text

def read_email(directory: Text) -> List[EmailMessage]:
    """Input: `.eml` file
        Return: `email.message.EmailMessage`
    """

    all_paths = []
    files = Path(directory).glob('*.eml')
    for file in files:
        all_paths.append(str(file))

    messages = []
    for path in all_paths:
        with open(path, 'rb') as fp:
            msg = BytesParser(policy=policy.default).parse(fp)
            messages.append(msg)
        fp.close()
        
    return messages

messages = read_email("../../sample_documents/sample-eml-from-breaches/")

In [4]:
messages[2]

<email.message.EmailMessage at 0x108226700>

In [5]:
#Header
for i in messages[2].raw_items():
    print(i)

('Received', 'from GOVEXCH.naoero.local (10.10.10.9) by NPFEXH01.NPF.local\n (10.2.2.30) with Microsoft SMTP Server (TLS) id 14.3.169.1; Wed, 15 Jul 2015\n 11:11:17 +1200')
('Received', 'from ADMJoanna (10.1.5.101) by GOVEXCH.naoero.local (10.10.10.9)\n with Microsoft SMTP Server id 14.1.218.12; Wed, 15 Jul 2015 11:17:29 +1200')
('From', 'Joanna olsson <joanna.olsson@naurugov.nr>')
('To', '"gio.nauru@gmail.com" <gio.nauru@gmail.com>')
('Subject', 'NRC: Quarterly Rent Payment')
('Thread-Topic', 'Quarterly Rent Payment')
('Thread-Index', 'AdC+i1RjhD++01plTTG3PAXz6Lzwhg==')
('X-MS-Exchange-MessageSentRepresentingType', '1')
('Date', 'Tue, 14 Jul 2015 23:18:10 +0000')
('Message-ID', '<005201d0be8b$59858ac0$0c90a040$@naurugov.nr>')
('Content-Language', 'en-US')
('X-MS-Exchange-Organization-AuthAs', 'Anonymous')
('X-MS-Exchange-Organization-AuthSource', 'NPFEXH01.NPF.local')
('X-MS-Has-Attach', 'yes')
('X-Auto-Response-Suppress', 'DR, OOF, AutoReply')
('X-MS-TNEF-Correlator', '')
('X-MS-Exch

In [6]:
messages[2].get_boundary()


'_004_005201d0be8b59858ac00c90a040naurugovnr_'

In [7]:
#Email body
messages[2].get_body(preferencelist=('plain')).get_content()

'On Behalf of Nauru Rehabilitation Corporation\r\n\r\nPublic Notice: Quarterly Rent Payout\r\n\r\n\r\n\r\nNauru Rehabilitation Corporation would like to announce to general public that it will be paying out quarterly rent from today Wednesday 15th July.\r\n\r\n\r\n\r\nPlace :  NRC Headquarters, Civic Centre Complex, Aiwo.\r\n\r\n\r\n\r\nTime 10:30am - 3:30pm\r\n\r\n\r\n----------------------------------------------------------------------------\r\nCirculated by the Nauru Government Information Office on behalf of:\r\nNauru Rehabilitation Corporation\r\nPhone: 5573223\r\n\r\nGIO, Yaren District, Government Offices, Republic of Nauru / Mobile: +674 557 3009 / Em: director.information@naurugov.nr<mailto:director.information@naurugov.nr> / gio.nauru@gmail.com<mailto:gio.nauru@gmail.com> / www.naurugov.nr<http://www.naurugov.nr>\r\n\r\n\r\n'

In [8]:
messages[2].get_all("Received")

['from GOVEXCH.naoero.local (10.10.10.9) by NPFEXH01.NPF.local (10.2.2.30) with Microsoft SMTP Server (TLS) id 14.3.169.1; Wed, 15 Jul 2015 11:11:17 +1200',
 'from ADMJoanna (10.1.5.101) by GOVEXCH.naoero.local (10.10.10.9) with Microsoft SMTP Server id 14.1.218.12; Wed, 15 Jul 2015 11:17:29 +1200']

In [9]:
#New Data Structure

from abc import ABC
import hashlib
from typing import Union


class NoID(ABC):
    """Class to indicate that an element do not have an ID."""

    pass


class EmailElement(ABC):
    """An element is a section of the email."""

    def __init__(self, element_id: Union[str, NoID] = NoID()):
        self.id: Union[str, NoID] = element_id


class Text(EmailElement):
    """Base element for capturing free text from within document."""

    category = "Uncategorized"

    def __init__(self, text: str, element_id: Union[str, NoID] = NoID()):
        self.text: str = text

        if isinstance(element_id, NoID):
            # NOTE(robinson) - Cut the SHA256 hex in half to get the first 128 bits
            element_id = hashlib.sha256(text.encode()).hexdigest()[:32]

        super().__init__(element_id=element_id)

    def __str__(self):
        return self.text

    def __eq__(self, other):
        return self.text == other.text
    
class Name(EmailElement):
    """Base element for capturing free text from within document."""

    category = "Uncategorized"

    def __init__(self, name: str, text: str, element_id: Union[str, NoID] = NoID()):
        self.text: str = text
        self.name: str = name

        if isinstance(element_id, NoID):
            # NOTE(robinson) - Cut the SHA256 hex in half to get the first 128 bits
            element_id = hashlib.sha256(text.encode()).hexdigest()[:32]

        super().__init__(element_id=element_id)
        

    def __str__(self):
        return f"{self.name}: {self.text}"

    def __eq__(self, other):
        return self.name == other.name and self.text == other.text


class BodyText(Text):
    """BodyText is an element consisting of multiple, well-formulated sentences. This
    excludes elements such titles, headers, footers, and captions. It is the body of an email."""

    category = "BodyText"

    pass


class ToHeader(Text):
    """A text element for capturing header information of an email (e.g. Subject, 
    To, From, etc)."""

    category = "ToHeader"

    pass

class FromHeader(Text):
    """A text element for capturing header information of an email (e.g. Subject, 
    To, From, etc)."""

    category = "FromHeader"

    pass

class SubjectHeader(Text):
    """A text element for capturing header information of an email (e.g. Subject, 
    To, From, etc)."""

    category = "SubjectHeader"

    pass

class ReceivedHeader(Text):
    """A text element for capturing header information of an email (e.g. Subject, 
    To, From, etc)."""

    category = "ReceivedHeader"

    pass

class MetaDataHeader(Name):
    """A text element for capturing header information of an email (e.g. Subject, 
    To, From, etc)."""

    category = "MetaDataHeader"

    pass

class Attachment(Text):
    """A text element for capturing header information of an email (e.g. Subject, 
    To, From, etc)."""

    category = "Attachment"

    pass

## Section 2: Parse Email

In [10]:
def eml_to_elements(message:[EmailMessage]) -> List[Text]:
    elements: List[Text] = list()
    
    for msg in message.raw_items():
        if msg[0] == "Received":
            elements.append(ReceivedHeader(text=msg[1]))
        elif msg[0] == "To":
            elements.append(ToHeader(text=msg[1]))
        elif msg[0] == "From":
            elements.append(FromHeader(text=msg[1]))
        elif msg[0] == "Subject":
            elements.append(SubjectHeader(text=msg[1]))
        else:
            elements.append(MetaDataHeader(name=msg[0], text=msg[1]))
    # Get email body
    elements.append(BodyText(text=message.get_body(preferencelist=('plain')).get_content()))
    return elements

In [11]:
elements = eml_to_elements(messages[105])

## Section 3: Cleaning Bricks

In [13]:
from functools import partial

from unstructured.cleaners.core import clean_prefix
from unstructured.cleaners.translate import translate_text

cleaners = [
        partial(clean_prefix, pattern=r"\[\d{1,2}\]"),
        partial(translate_text, target_lang="ru"),
    ]
name_element = Text(name="[2] Example docs", text="[1] A Textbook on Crocodile Habitats")
name_element.apply(*cleaners)

print(name_element)

ModuleNotFoundError: No module named 'unstructured.cleaners.translate'

In [736]:
print(elements[4].id)

5038dde0aea062286399d0e1e9763d17


In [693]:
elements[0].text

'from NPFEXH01.NPF.local ([fe80::49b9:5557:61d2:64a3]) by\n NPFEXH01.NPF.local ([fe80::49b9:5557:61d2:64a3%15]) with mapi id\n 14.03.0169.001; Thu, 31 Aug 2017 14:30:10 +1200'

In [671]:
from unstructured.cleaners.core import clean_extra_whitespace

rec_text = clean_extra_whitespace(elements[0].text)

In [672]:
def extract_ip_information(text):
    ip_info = re.findall(r"[a-zA-Z-0-9]*\.[a-zA-Z-0-9]*\.[a-zA-Z-0-9.]*\.?[0-9]*", text)
    if len(ip_info) < 5:
        ip_info += re.findall(r"[a-zA-Z0-9]*[::|:|.][a-zA-Z0-9]*[::|:|.][a-zA-Z0-9]*[::|:|.][a-zA-Z0-9]*[::|:|.][a-zA-Z0-9]*[::|:|.][a-zA-Z0-9]*", text)
   
    return ip_info

In [624]:
extract_ip_information(rec_text)

['NPFEXH01.NPF.local',
 'NPFEXH01.NPF.local',
 '14.03.0169.001',
 'fe80::49b9:5557:61d2:64a3',
 'fe80::49b9:5557:61d2:64a3']

In [656]:
def extract_received_date(text):
    date = re.findall(r"[a-zA-Z]*\,\s[0-9]{1,2}\s[a-zA-Z]*\s[0-9]{4}\s[0-9]{2}:[0-9]{2}:[0-9]{2}\s\+[0-9]{4}", text)[0]
    return date.replace(",", "").split(" ")

In [657]:
extract_received_date(rec_text)

['Thu', '31', 'Aug', '2017', '14:30:10', '+1200']