In [None]:
%%html
<!-- 
If you can see this code, this cell's output is not trusted.
Please execute this cell and save the notebook, or click File -> Trust Notebook
-->
<script>
// Global visibility flag read by toggle_cell_visibility and flipped on every toggle.
var shown = true;

/**
 * Collect every notebook cell whose metadata lists the given tag.
 *
 * @param {string} tag - Tag name to look for in each cell's `tags` metadata.
 * @returns {Array} Matching cells in notebook order; empty when none match.
 */
function filter_cells_by_tag(tag) {
    // Declared with `var` so these helpers stay local instead of leaking
    // into (or clobbering) same-named globals.
    var matches = [];
    var all_cells = Jupyter.notebook.get_cells();
    for (var i = 0; i < all_cells.length; i++) {
        var curr_cell = all_cells[i];
        var tags = curr_cell._metadata.tags;
        // Cells without a `tags` entry can never match.
        if (tags == undefined) {
            continue;
        }
        for (var j = 0; j < tags.length; j++) {
            if (tags[j] == tag) {
                matches.push(curr_cell);
                // One hit is enough; avoid adding the same cell twice.
                break;
            }
        }
    }
    return matches;
}

/**
 * Show or hide every cell carrying the given tag.
 *
 * @param {string} tag - Tag identifying the cells to act on.
 * @param {boolean} show - Truthy calls `.show()`, falsy calls `.hide()`.
 * @param {boolean} input_only - Truthy acts on the cell's `input` only;
 *     otherwise the whole cell `element` is shown or hidden.
 */
function set_cell_visibility(tag, show, input_only) {
    var marked_cells = filter_cells_by_tag(tag);
    for (var i = 0; i < marked_cells.length; i++) {
        var curr_cell = marked_cells[i];
        // Kept local (`var`) so no implicit global `obj` is created.
        var target = input_only ? curr_cell.input : curr_cell.element;
        if (show) {
            target.show();
        } else {
            target.hide();
        }
    }
}

/**
 * Flip the visibility of every cell tagged `tag` (the whole cell element,
 * not just its input) and record the new state in the global `shown` flag.
 *
 * @param {string} tag - Tag identifying the cells to toggle.
 */
function toggle_cell_visibility(tag) {
    // Flip first, then apply: the flag starts out true, so applying it before
    // flipping would make the first click a no-op ("show" on visible cells).
    shown = !shown;
    set_cell_visibility(tag, shown, false);
}

// When this script runs, hide only the `input` part of cells tagged 'execution_cell'.
set_cell_visibility('execution_cell', false, true);
</script>
To toggle visibility of explanation cells click <a href="javascript:toggle_cell_visibility('explanatory_cell')">here</a>


# Powerpoint Preprocessing

This notebook defines the steps for extracting information from a PowerPoint file. To see how to create a generalized API for all documents in the `pipeline-notebooks` directory, refer to the project documentation.

To demonstrate how off-the-shelf Unstructured Bricks extract meaningful data from complex source documents, we will apply a series of Bricks with explanations.

#### Table of Contents

1. [Take a Look at a Powerpoint File](#explore)
1. [Custom Partitioning Bricks](#custom)
1. [Cleaning Bricks](#cleaning)
1. [Staging Bricks](#staging)

## Section 1: Take a Look at a PowerPoint File <a id="explore"></a>

In [1]:
import os
import json


def get_filename(directory, filename):
    """Locate ``filename`` inside ``directory`` and return its full path.

    Two candidate locations are checked, in this order:

    1. ``directory`` under the parent of the current working directory.
    2. ``directory`` directly under the current working directory.

    Args:
        directory: Name of the directory expected to hold the file.
        filename: Name of the file to look for (matched against the
            directory listing).

    Returns:
        The path of ``filename`` inside the first candidate directory that
        lists it.

    Raises:
        FileNotFoundError: If neither candidate directory contains
            ``filename``. The message names the file and both locations.
    """
    cwd = os.getcwd()
    local_directory = os.path.join(os.path.split(cwd)[0], directory)
    ci_directory = os.path.join(cwd, directory)

    for candidate in (local_directory, ci_directory):
        # isdir (rather than exists) so a regular file that happens to share
        # the directory's name is skipped instead of crashing os.listdir.
        if os.path.isdir(candidate) and filename in os.listdir(candidate):
            return os.path.join(candidate, filename)

    raise FileNotFoundError(
        f"{filename!r} not found in {local_directory!r} or {ci_directory!r}"
    )

In [2]:
# Resolve the path of the sample .pptx file with the get_filename helper.
filename = get_filename("sample-docs", "fake-power-point-extra.pptx")

In [3]:
import pptx

# Open the sample file with the `pptx` package so its slides and shapes can be inspected.
presentation = pptx.Presentation(filename)

In [4]:
# First shape on the first slide.
shape = presentation.slides[0].shapes[0]

In [5]:
# Materialize every shape of the first slide into a list.
shapes = list(presentation.slides[0].shapes)

In [6]:
# Text of the first paragraph of each collected shape.
texts = [
    shp.text_frame.paragraphs[0].text
    for shp in shapes
]

In [7]:
# Display the first-paragraph texts gathered from the shapes.
print(texts)

['Bullet Slide', 'Find the bullet slide layout']


In [8]:
# Text of the third paragraph (index 2) of the second shape.
shapes[1].text_frame.paragraphs[2].text

'Use _TextFrame.add_paragraph() for subsequent BULLETS'

In [9]:
# Text of the fourth paragraph (index 3) of the second shape.
shapes[1].text_frame.paragraphs[3].text

'For any comment please send an email to comments@unstructured.io'

In [10]:
# Text of the fifth paragraph (index 4) of the second shape.
shapes[1].text_frame.paragraphs[4].text

'Also, phone assistance is available at +52 55151685'

In [11]:
# Rebind `shapes` to the shapes of the second slide.
shapes = list(presentation.slides[1].shapes)

In [12]:
# First-paragraph text of each shape on the second slide, then display it.
texts = [
    shp.text_frame.paragraphs[0].text
    for shp in shapes
]
print(texts)

['Another Bullet Slide', 'Find the bullet slide layout']


## Section 2: Custom Partitioning Bricks <a id="custom"></a>

In [13]:
from unstructured.partition.pptx import partition_pptx

# Partition the sample file; the returned elements are inspected in the cells below.
elements = partition_pptx(filename)

In [14]:
# Printing the list shows only the default object reprs, i.e. each element's class.
print(elements)

[<unstructured.documents.elements.Title object at 0x12fe56a00>, <unstructured.documents.elements.Title object at 0x11bb90250>, <unstructured.documents.elements.Title object at 0x12fe56c40>, <unstructured.documents.elements.Title object at 0x13b4868b0>, <unstructured.documents.elements.NarrativeText object at 0x13b486910>, <unstructured.documents.elements.NarrativeText object at 0x13b4869d0>, <unstructured.documents.elements.PageBreak object at 0x11bb90ac0>, <unstructured.documents.elements.Title object at 0x12fe56df0>, <unstructured.documents.elements.Title object at 0x12fe56b80>, <unstructured.documents.elements.Title object at 0x13b486ac0>, <unstructured.documents.elements.Title object at 0x13b486be0>]


In [15]:
# Print the text of each element, one per line.
for element in elements:
    print(element.text)

Bullet Slide
Find the bullet slide layout
Use _TextFrame.text for first bullet
Use _TextFrame.add_paragraph() for subsequent BULLETS
For any comment please send an email to comments@unstructured.io
Also, phone assistance is available at +52 55151685
<PAGE BREAK>
Another Bullet Slide
Find the bullet slide layout
Hello
There :)


## Section 3: Cleaning Bricks <a id="cleaning"></a>

In addition to partitioning bricks, the Unstructured library has
***cleaning*** bricks for removing unwanted content from text. In this
case, we'll solve our punctuation problem by using the
`remove_punctuation` brick. Other uses for cleaning bricks include
cleaning out boilerplate, sentence fragments, and other segments
of text that could impact labeling tasks or the accuracy of
machine learning models. As with partitioning bricks, users can
include custom cleaning bricks in a pipeline.

In [16]:
# This element contains punctuation (underscore, dot, parentheses) that the
# cleaning bricks in the next cell operate on.
elements[3].text

'Use _TextFrame.add_paragraph() for subsequent BULLETS'

In [17]:
from unstructured.cleaners.core import clean, remove_punctuation

text = elements[3].text

# Original text followed by the same text with punctuation removed.
print(f"{text} \n\t\t\t-> {remove_punctuation(text)}")

# Original text followed by the result of `clean` with lowercasing enabled.
print(f"{text} \n\t\t\t-> {clean(text, lowercase=True)}")


Use _TextFrame.add_paragraph() for subsequent BULLETS 
			-> Use TextFrameaddparagraph for subsequent BULLETS
Use _TextFrame.add_paragraph() for subsequent BULLETS 
			-> use _textframe.add_paragraph() for subsequent bullets


In [19]:
from unstructured.cleaners.extract import (
    extract_email_address,
    extract_us_phone_number,
)

# Pull the e-mail address out of the fifth element's text.
text = elements[4].text
print(f"{text} \n\t\t\t-> {extract_email_address(text)}")

# Pull the phone number out of the sixth element's text.
# NOTE(review): the sample number carries a +52 country code while the helper
# is named for US numbers -- confirm this is the intended demonstration.
text = elements[5].text
print(f"{text} \n\t\t\t-> {extract_us_phone_number(text)}")


For any comment please send an email to comments@unstructured.io 
			-> ['comments@unstructured.io']
Also, phone assistance is available at +52 55151685 
			-> 55151685


In [20]:
# Show the fifth element's text; it still contains the e-mail address.
print(elements[4].text)

For any comment please send an email to comments@unstructured.io


## Section 4: Staging Bricks <a id="staging"></a>

In [21]:
# Text of the third element (index 2).
elements[2].text

'Use _TextFrame.text for first bullet'

In [22]:
from unstructured.staging.label_studio import stage_for_label_studio

# Stage the elements for Label Studio; each entry in the result holds the
# element's text and a ref_id under a "data" key.
label_studio_data = stage_for_label_studio(elements)
label_studio_data

[{'data': {'text': 'Bullet Slide',
   'ref_id': 'a78cebecf85751415a1cddfc46ce30ee'}},
 {'data': {'text': 'Find the bullet slide layout',
   'ref_id': '3c0332d3515a039dee82e4f3388594c8'}},
 {'data': {'text': 'Use _TextFrame.text for first bullet',
   'ref_id': 'ca8d08c97f0eeb554cac4758c9229614'}},
 {'data': {'text': 'Use _TextFrame.add_paragraph() for subsequent BULLETS',
   'ref_id': '5819d910e92698fc191ba9f5ce1557c7'}},
 {'data': {'text': 'For any comment please send an email to comments@unstructured.io',
   'ref_id': '0aa0b1af34f8eea2e3b613d5292622c9'}},
 {'data': {'text': 'Also, phone assistance is available at +52 55151685',
   'ref_id': '5f845b8555c255f8718877e8ac4924b1'}},
 {'data': {'text': '<PAGE BREAK>',
   'ref_id': '5ea24028ea5addabb8f07dfff681501d'}},
 {'data': {'text': 'Another Bullet Slide',
   'ref_id': '08dde2abae4940a0de4610d133b048a7'}},
 {'data': {'text': 'Find the bullet slide layout',
   'ref_id': '3c0332d3515a039dee82e4f3388594c8'}},
 {'data': {'text': 'Hello', 'r