In [None]:
import os
import pandas as pd
import json
from unstructured.partition.auto import partition
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.doc import partition_doc
from unstructured.partition.docx import partition_docx
from unstructured.partition.html import partition_html
from unstructured.staging.base import elements_to_json
from unstructured.partition.pptx import partition_pptx
from unstructured.partition.msg import partition_msg
from unstructured.partition.image import partition_image

# Base folder of the sandbox, and the folder that receives the generated
# JSON / HTML files.
current_dir = '/workspaces/PubSec-Info-Assistant/sandbox'
output_dir = os.path.join(current_dir, 'output')

# Options handed to partition_pdf.
# strategy:   how PDFs are analysed; "hi_res" is used here to extract table structure.
# model_name: "yolox" is the best model for table extraction; detectron2_onnx and
#             chipper are alternatives depending on file layout.
# languages:  Tesseract language code (common items: eng / ara), see
#             https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html
strategy = "hi_res"
model_name = "yolox"
languages = "eng"

In [None]:
def process_elements(input_filename):
    """Render a JSON dump of partitioned elements as a standalone HTML preview.

    The JSON file must contain a list of element dictionaries. Each entry needs
    a ``type`` key; ``text`` and ``metadata`` are used when present. The page is
    written next to the input as ``input_filename + ".html"``.

    Rendering rules:
        * ``Table`` -> ``metadata["text_as_html"]`` verbatim; if that key is
          missing or empty, the table's plain text is emitted as a paragraph.
        * ``Title`` -> ``<h1>``
        * ``UncategorizedText`` / ``NarrativeText`` -> ``<p>``
        * ``FigureCaption`` -> ``<p>`` prefixed with ``FigureCaption - ``
        * any other type -> ``<p>`` flagged with ``*************`` and the type

    Element text is HTML-escaped so characters such as ``<`` or ``&`` in the
    source document cannot corrupt the generated markup.

    Args:
        input_filename: Path of the elements JSON file to read.

    Returns:
        None. The function prints ``input_filename`` and writes the HTML file.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        json.JSONDecodeError: If the input is not valid JSON.
        KeyError: If an entry has no ``type`` key.
    """
    # Imported locally so the function does not depend on the notebook's import cell.
    from html import escape

    # Explicit UTF-8 keeps non-ASCII documents (e.g. Arabic) working regardless
    # of the platform's default encoding.
    with open(input_filename, 'r', encoding='utf-8') as file:
        data = json.load(file)
    print(input_filename)

    prefix = '''<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<style>
table {
    border-collapse: collapse;
    border: 1px solid black;
}

th, td {
    border: 1px solid black;
    padding: 8px;
    text-align: left;
}
</style>
</head>
<body>
'''
    suffix = '</body>\n</html>\n'

    # Convert every JSON entry into one HTML fragment
    extracted_elements = []
    for entry in data:
        element_type = entry["type"]
        text = escape(entry.get("text") or "")
        table_html = (entry.get("metadata") or {}).get("text_as_html")

        if element_type == "Table" and table_html:
            # Already HTML produced by the partitioner; must not be escaped.
            extracted_elements.append(table_html)
        elif element_type == "Table":
            # No table structure was inferred; fall back to the plain text.
            extracted_elements.append('<p>' + text + '</p>')
        elif element_type == "Title":
            extracted_elements.append('<h1>' + text + '</h1>')
        elif element_type in ("UncategorizedText", "NarrativeText"):
            extracted_elements.append('<p>' + text + '</p>')
        elif element_type == "FigureCaption":
            extracted_elements.append('<p>FigureCaption - ' + text + '</p>')
        else:
            # Unhandled types are flagged so they stand out in the preview.
            extracted_elements.append(
                f'<p>************* {escape(str(element_type))} - ' + text + '</p>'
            )

    # Write a complete document: header, fragments, closing tags. The header is
    # written even for an empty element list so the output is always valid HTML.
    with open(input_filename + ".html", 'w', encoding='utf-8') as output_file:
        output_file.write(prefix)
        for element in extracted_elements:
            output_file.write(element + "<BR><BR>\n")  # two line breaks between fragments
        output_file.write(suffix)

In [None]:
# ------------------------------------------------------------------
# PDF samples: partition one of them with table-structure inference
# ------------------------------------------------------------------

file_pdf_1 = 'pdf/msft-10k_20190630.htm.pdf'
file_pdf_2 = 'example-docs/layout-parser-paper-fast.pdf'
file_pdf_3 = 'example-docs/multi-column.pdf'
file_pdf_5 = 'nvidia/Q2FY24-CFO-Commentary.pdf'
file_pdf_6 = 'pdf/Benefit_Options.pdf'
file_pdf_7 = 'pdf/arabic_1.pdf'

# Sample to process; point this at another file_pdf_* to switch documents.
filename = file_pdf_6

elements = partition_pdf(
    filename=filename,
    strategy=strategy,
    model_name=model_name,  # drop this argument to fall back to the default model
    infer_table_structure=True,
    ocr_languages=languages,
)

In [None]:
# ------------------------------------------
# HTML samples: partition one of the pages
# ------------------------------------------
file_html_1 = 'html/111.html'
file_html_2 = 'html/sample.html'

# Sample to process; point this at another file_html_* to switch documents.
filename = file_html_1
elements = partition_html(filename=filename)
print('done')


In [54]:
# ------------------------------------------
# PowerPoint sample: partition the deck
# ------------------------------------------
file_pptx_1 = 'powerpoint/IA_deck.pptx'

# Sample to process.
filename = file_pptx_1
elements = partition_pptx(filename=filename)
print('done')

done


In [None]:
# *******
# eml
# *******
# partition_email is not imported in the notebook's import cell, so it is
# imported here; without this line the call below raises NameError.
from unstructured.partition.email import partition_email

# NOTE: the name file_msg_3 is also assigned by the auto and msg cells (to a
# .msg sample); whichever cell ran last determines its value.
file_msg_3 = 'email/email-replace-mime-encodings-error-4.eml'

filename = file_msg_3
elements = partition_email(filename=filename)
print('done')



In [None]:
# -------------------------------------------------------------
# auto: run a .msg sample through the generic partition() call
# -------------------------------------------------------------
from unstructured.partition.msg import partition_msg

file_msg_1 = 'email/Build_hero.msg'
file_msg_2 = 'email/Test email - disregard.msg'
file_msg_3 = 'email/fake-email.msg'

# Sample to process; point this at another file_msg_* to switch documents.
filename = file_msg_2
elements = partition(filename=filename)

# Report completion, then dump the resulting element list.
print('done', elements, sep='\n')


In [None]:
# ------------------------------------------------
# msg: partition an Outlook .msg sample directly
# ------------------------------------------------
from unstructured.partition.msg import partition_msg

file_msg_1 = 'email/Build_hero.msg'
file_msg_2 = 'email/Test email - disregard.msg'
file_msg_3 = 'email/fake-email.msg'

# Sample to process; point this at another file_msg_* to switch documents.
filename = file_msg_2
elements = partition_msg(filename=filename)
print('done')


In [None]:
# ------------------------------------------
# Image samples: partition one of the images
# ------------------------------------------
file_image_1 = 'image/street.jpeg'
file_image_2 = 'image/example.jpg'
file_image_3 = 'image/layout-parser-paper-fast.jpg'

# Sample to process; point this at another file_image_* to switch documents.
filename = file_image_3
elements = partition_image(filename=filename)
print('done')

In [None]:
# ------------------------------------------
# Docx sample: partition the Word document
# ------------------------------------------
file_docx_1 = 'docs/war-and-peace.docx'

# Sample to process.
filename = file_docx_1
elements = partition_docx(filename=filename)
print('done')

In [55]:

# Export whichever `elements` list was produced most recently (every partition
# cell overwrites `filename` and `elements`) to JSON, then render that JSON as
# an HTML preview next to it.
file_path = os.path.join(current_dir, filename)
directory, file_component = os.path.split(file_path)
output_path = os.path.join(output_dir, file_component)

# Build the JSON path once so the writer and the renderer cannot disagree.
json_path = f"{output_path}.json"

# Make sure the output folder exists before writing into it (first run).
os.makedirs(output_dir, exist_ok=True)

elements_to_json(elements, filename=json_path)
process_elements(json_path)
print('done')


/workspaces/PubSec-Info-Assistant/sandbox/output/IA_deck.pptx.json
done


In [None]:
# Build a two-column frame: each element next to its category
data = [(element, element.category) for element in elements]
df = pd.DataFrame(data, columns=['text', 'category'])

# Distinct categories, in order of first appearance
unique_categories = pd.unique(df['category'])

print("Unique categories:")
for category in unique_categories:
    print(category)
print()


In [None]:
# Build a frame with one row per element: category, page number, section,
# and the element itself in the 'text' column
data = [
    (element.category, element.metadata.page_number, element.metadata.section, element)
    for element in elements
]
df = pd.DataFrame(data, columns=['category', 'page_number', 'section', 'text'])

In [None]:
# Distinct values of the 'category' column, in order of first appearance
unique_categories = pd.unique(df['category'])

print("Unique categories:")
for category in unique_categories:
    print(category)

In [None]:
# Filter expression for DataFrame.query; edit it to select other rows
query = "category == 'Table'"
reduced_df = df.query(expr=query)

# Lift the row/column display limits for this print only, so nothing is truncated
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(reduced_df)

In [None]:
# Write the DataFrame to dataframe.csv inside current_dir, without the index column
csv_filename = 'dataframe.csv'
csv_path = os.path.join(current_dir, csv_filename)
df.to_csv(path_or_buf=csv_path, index=False)