In [None]:
%%html
<!-- 
If you can see this code, this cell's output is not trusted.
Please execute this cell and save the notebook, or click File -> Trust Notebook
-->
<script>
// Visibility flag read and flipped by toggle_cell_visibility on every call.
var shown = true;

/**
 * Collect every notebook cell whose metadata tags include `tag`.
 *
 * @param {string} tag - Tag name to look for in each cell's `_metadata.tags`.
 * @returns {Array} Matching cells in notebook order; empty when none match.
 */
function filter_cells_by_tag(tag) {
    // Keep these local: assigning without `var` would create implicit
    // globals (and throws a ReferenceError under strict mode).
    var out = [];
    var all_cells = Jupyter.notebook.get_cells();
    for (var i = 0; i < all_cells.length; i++) {
        var curr_cell = all_cells[i];
        var tags = curr_cell._metadata.tags;
        // Loose `!=` skips both undefined and null tag lists.
        if (tags != undefined) {
            for (var j = 0; j < tags.length; j++) {
                if (tags[j] == tag) {
                    out.push(curr_cell);
                    // One match is enough; stop so the cell is pushed once.
                    break;
                }
            }
        }
    }
    return out;
}

/**
 * Show or hide every cell carrying `tag`.
 *
 * @param {string} tag - Tag identifying the cells to affect.
 * @param {boolean} show - Truthy to show the targets, falsy to hide them.
 * @param {boolean} input_only - Truthy to act on each cell's `input` only;
 *     otherwise the cell's whole `element` is shown/hidden.
 */
function set_cell_visibility(tag, show, input_only) {
    var marked_cells = filter_cells_by_tag(tag);
    for (var i = 0; i < marked_cells.length; i++) {
        var curr_cell = marked_cells[i];
        // Declared with `var` so the target does not leak as an implicit global.
        var obj = input_only ? curr_cell.input : curr_cell.element;
        if (show) {
            obj.show();
        } else {
            obj.hide();
        }
    }
}

/**
 * Flip the visibility of every cell carrying `tag`.
 *
 * @param {string} tag - Tag identifying the cells to toggle.
 */
function toggle_cell_visibility(tag) {
    // Flip the flag before applying it: `shown` starts out true while the
    // cells are visible, so applying the old value first would make the
    // first click a no-op.
    shown = !shown;
    set_cell_visibility(tag, shown, false);
}

// On load, hide only the input area of cells tagged 'execution_cell'.
set_cell_visibility('execution_cell', false, true);
</script>
To toggle visibility of explanation cells click <a href="javascript:toggle_cell_visibility('explanatory_cell')">here</a>


# Powerpoint Preprocessing

This notebook defines the steps for extracting information from a PowerPoint file. To see how to create a generalized API for all documents, refer to the `pipeline-notebooks` directory.

To demonstrate how off-the-shelf Unstructured Bricks extract meaningful data from complex source documents, we will apply a series of Bricks with explanations.

#### Table of Contents

1. [Take a Look at a Powerpoint File](#explore)
1. [Custom Partitioning Bricks](#custom)
1. [Cleaning Bricks](#cleaning)
1. [Staging Bricks](#staging)

## Section 1: Take a Look at a PowerPoint File <a id="explore"></a>

In [None]:
import os
import json


def get_filename(directory, filename):
    """Locate ``filename`` inside ``directory`` relative to the working directory.

    Two candidate locations are checked, in order: ``directory`` as a sibling
    of the current working directory, then ``directory`` inside the current
    working directory.

    Args:
        directory: Name of the directory expected to hold the file.
        filename: Name of the file to look for.

    Returns:
        The path to ``filename`` in the first candidate directory that lists it.

    Raises:
        FileNotFoundError: If no candidate directory exists and lists
            ``filename``; the message names the file and the searched paths.
    """
    cwd = os.getcwd()
    candidates = (
        os.path.join(os.path.split(cwd)[0], directory),
        os.path.join(cwd, directory),
    )
    for candidate in candidates:
        # isdir (not exists) so a plain file named like `directory` is skipped
        # instead of making os.listdir raise NotADirectoryError.
        if os.path.isdir(candidate) and filename in os.listdir(candidate):
            return os.path.join(candidate, filename)
    raise FileNotFoundError(
        "{!r} not found in any of: {}".format(filename, ", ".join(candidates))
    )

In [None]:
# Resolve the path of the sample deck inside the "sample-docs" directory.
filename = get_filename("sample-docs", "fake-power-point.pptx")

In [None]:
import pptx

# Load the file located above into a pptx.Presentation object.
presentation = pptx.Presentation(filename)

In [None]:
# Take the first shape on the first slide.
shape = presentation.slides[0].shapes[0]

In [None]:
# Text of the first paragraph in that shape's text frame.
text = shape.text_frame.paragraphs[0].text

In [None]:
# Show the paragraph text extracted above.
print(text)

Adding a Bullet Slide


## Section 2: Custom Partitioning Bricks <a id="custom"></a>

In [None]:
from unstructured.partition.pptx import partition_pptx

# Partition the same PowerPoint file into a list of document elements.
elements = partition_pptx(filename)

In [None]:
# Printing the list shows each element's type rather than its text.
print(elements)

[<unstructured.documents.elements.Title object>, <unstructured.documents.elements.Title object>, <unstructured.documents.elements.Title object>, <unstructured.documents.elements.Title object>]


In [None]:
# Print the text carried by each element, one per line.
for element in elements:
    print(element.text)

Adding a Bullet Slide
Find the bullet slide layout
Use _TextFrame.text for first bullet
Use _TextFrame.add_paragraph() for subsequent bullets


## Section 3: Cleaning Bricks <a id="cleaning"></a>

In addition to partitioning bricks, the Unstructured library has
***cleaning*** bricks for removing unwanted content from text. In this
case, we'll solve our punctuation problem by using the
`remove_punctuation` cleaning brick. Other uses for cleaning bricks include
cleaning out boilerplate, sentence fragments, and other segments
of text that could impact labeling tasks or the accuracy of
machine learning models. As with partitioning bricks, users can
include custom cleaning bricks in a pipeline.

In [None]:
# This element contains punctuation (underscore, period, parentheses) that we strip below
elements[3].text

'Use _TextFrame.add_paragraph() for subsequent bullets'

In [None]:
from unstructured.cleaners.core import remove_punctuation

# Apply the cleaning brick to the text of the element inspected above.
remove_punctuation(elements[3].text)

'Use TextFrameaddparagraph for subsequent bullets'

## Section 4: Staging Bricks<a id="staging"></a>

In [None]:
# Look at one element's text before staging the whole list.
elements[2].text

'Use _TextFrame.text for first bullet'

In [None]:
from unstructured.staging.label_studio import stage_for_label_studio

# Stage all elements for Label Studio, then display the resulting records.
label_studio_data = stage_for_label_studio(elements)
label_studio_data

[{'data': {'text': 'Adding a Bullet Slide',
   'ref_id': '50b70366a51804855c6dd48a3865cb87'}},
 {'data': {'text': 'Find the bullet slide layout',
   'ref_id': '3c0332d3515a039dee82e4f3388594c8'}},
 {'data': {'text': 'Use _TextFrame.text for first bullet',
   'ref_id': 'ca8d08c97f0eeb554cac4758c9229614'}},
 {'data': {'text': 'Use _TextFrame.add_paragraph() for subsequent bullets',
   'ref_id': '83d53564b64b558f77c7c33b5a029213'}}]