# Table Detection Demo
This demo presents two candidate table-extraction output formats to the client:
1. Table as a 2D JSON array, with each cell's text represented as a string
2. Table as a 1D JSON array, with each cell's text represented as a string together with the cell's bounding-box coordinates

# Install Requirements

In [13]:
# !sudo yum install ghostscript -y
# !sudo yum update -y
# !sudo yum install ImageMagick-devel -y
# !pip install pdfplumber

# Imports

In [14]:
import pandas as pd
import pdfplumber
import json
import layoutparser as lp
import os
pdfplumber.__version__

'0.6.0'

# Extract Tables from Sample SOPs
Here we try to extract tables from the following sample documents:
1. LAB-2352 (Andover) - suggested by the client
2. MCD-088122 (Kalamazoo) - suggested by us to show complex table examples

In [15]:
def cell_to_textblock(cell, text):
    """Wrap one table cell and its text in a layoutparser ``TextBlock``.

    Args:
        cell: Indexable holding the cell coordinates; positions 0-3 are used
            as ``x_1``, ``y_1``, ``x_2`` and ``y_2`` of the rectangle, in
            that order.
        text: Text to attach to the block.

    Returns:
        An ``lp.TextBlock`` of type ``'CELL'`` whose remaining optional
        fields (``id``, ``parent``, ``next``, ``score``) are ``None``.
    """
    x1, y1, x2, y2 = cell[0], cell[1], cell[2], cell[3]
    bounding_box = lp.Rectangle(x_1=x1, y_1=y1, x_2=x2, y_2=y2)
    return lp.TextBlock(
        block=bounding_box,
        text=text,
        id=None,
        type='CELL',
        parent=None,
        next=None,
        score=None,
    )

In [16]:
def table_to_layout(table, table_texts):
    """Build a layoutparser ``Layout`` holding one text block per table cell.

    Rows of ``table.rows`` are paired with rows of ``table_texts``, and the
    cells of each row with that row's texts. Falsy cells are skipped
    (presumably ``None`` placeholders for missing cells -- confirm against
    pdfplumber).

    Args:
        table: Object exposing ``rows``, each row exposing ``cells``.
        table_texts: Nested sequence of cell texts, row by row.

    Returns:
        An ``lp.Layout`` wrapping the collected cell text blocks.
    """
    cell_blocks = []
    for table_row, texts_in_row in zip(table.rows, table_texts):
        cell_blocks.extend(
            cell_to_textblock(cell_box, cell_str)
            for cell_box, cell_str in zip(table_row.cells, texts_in_row)
            if cell_box
        )
    return lp.Layout(cell_blocks)

In [17]:
def images_to_pdf(images, filename):
    """Write all images into a single multi-page PDF.

    Args:
        images: Iterable of per-page iterables of image objects. Each image
            must offer a ``save`` method accepting the arguments used below
            (presumably PIL images -- confirm against the caller).
        filename: Destination path of the PDF.

    Nothing is written when there are no images at all.
    """
    flattened = [img for page_images in images for img in page_images]
    if not flattened:
        return

    # The first image drives the save; the rest are appended as extra pages.
    first_image, *remaining = flattened
    first_image.save(
        filename,
        "PDF",
        resolution=100.0,
        save_all=True,
        append_images=remaining,
    )

In [18]:
def tables_to_csv(tables, dir):
    """Write every table to its own CSV file inside ``dir``.

    Args:
        tables: Iterable of per-page iterables of objects with a ``to_csv``
            method (the caller passes pandas DataFrames).
        dir: Output directory. It is created, together with any missing
            parent directories, if it does not exist yet.

    Files are named ``table_<page>_<table>.csv`` with both indices starting
    at 1.
    """
    # makedirs(exist_ok=True) also creates missing parents and avoids the
    # check-then-create race of os.path.exists() followed by os.mkdir().
    os.makedirs(dir, exist_ok=True)

    for page_idx, page_tables in enumerate(tables, start=1):
        for table_idx, table in enumerate(page_tables, start=1):
            table.to_csv(os.path.join(dir, f"table_{page_idx}_{table_idx}.csv"))

In [19]:
def tables_to_json(tables, dir):
    """Dump every table to its own JSON file inside ``dir``.

    Args:
        tables: Iterable of per-page iterables of JSON-serialisable objects.
        dir: Output directory. It is created, together with any missing
            parent directories, if it does not exist yet.

    Files are named ``table_<page>_<table>.json`` with both indices starting
    at 1.

    Raises:
        TypeError: Propagated from ``json.dump`` when a table is not
            JSON-serialisable.
    """
    # makedirs(exist_ok=True) also creates missing parents and avoids the
    # check-then-create race of os.path.exists() followed by os.mkdir().
    os.makedirs(dir, exist_ok=True)

    for page_idx, page_tables in enumerate(tables, start=1):
        for table_idx, table in enumerate(page_tables, start=1):
            path = os.path.join(dir, f"table_{page_idx}_{table_idx}.json")
            # Explicit encoding keeps the output independent of the locale.
            with open(path, 'w', encoding='utf-8') as file:
                json.dump(table, file)

In [20]:
# Settings handed to pdfplumber as ``table_settings`` whenever tables are
# located or extracted in this notebook.
TABLE_EXTRACTION_SETTINGS = {
    'snap_tolerance': 4,
    'edge_min_length': 30,
}

In [42]:
def detect_tables(filename):
    """Detect the tables on every page of a PDF and return them in four forms.

    The PDF is read twice: once through layoutparser to obtain the page
    images, and once through pdfplumber to locate the tables and extract
    their texts using ``TABLE_EXTRACTION_SETTINGS``.

    Args:
        filename: Path of the PDF to process.

    Returns:
        A 4-tuple of lists. Each list has one entry per processed page, and
        each entry is a list with one item per table found on that page:

        1. the result of ``lp.draw_box`` for the page image and the table's
           cell boxes (one result per table, not one per page),
        2. a ``pandas.DataFrame`` built from the table's texts,
        3. the table's texts as returned by ``page.extract_tables``,
        4. the table's cell layout as a dict (``Layout.to_dict()``).

        Pages are paired with images via ``zip``, so processing stops at the
        shorter of the two sequences.
    """
    # Only the page images are needed; the layout returned alongside them
    # is not used.
    _, pdf_images = lp.load_pdf(filename, load_images=True)

    pdf_images_with_table_boxes = []
    table_dfs_per_page = []
    table_texts_per_page = []
    table_jsons_bbox_per_page = []

    # Open the PDF as a context manager so the file handle is released even
    # if extraction fails part-way through.
    with pdfplumber.open(filename) as pdf:
        for page, page_img in zip(pdf.pages, pdf_images):
            page_table_finder = page.debug_tablefinder(table_settings=TABLE_EXTRACTION_SETTINGS)
            page_tables = page_table_finder.tables
            page_tables_texts = page.extract_tables(table_settings=TABLE_EXTRACTION_SETTINGS)

            _images_with_table_boxes = []
            _dfs_per_page = []
            _texts_per_page = []
            _jsons_bbox_per_page = []
            # NOTE(review): relies on both pdfplumber calls returning the
            # tables in the same order -- confirm against pdfplumber.
            for table, table_texts in zip(page_tables, page_tables_texts):
                # create layout from table to draw bounding boxes
                table_layout = table_to_layout(table, table_texts)
                table_layout.page_data.update(
                    {"width": page_img.size[0], "height": page_img.size[1]}
                )
                page_img_with_table_boxes = lp.draw_box(page_img, table_layout, box_width=2)
                _images_with_table_boxes.append(page_img_with_table_boxes)

                # convert to dataframe
                _dfs_per_page.append(pd.DataFrame(table_texts))

                # convert to json texts
                _texts_per_page.append(table_texts)

                # convert to json texts and bounding boxes
                _jsons_bbox_per_page.append(table_layout.to_dict())

            pdf_images_with_table_boxes.append(_images_with_table_boxes)
            table_dfs_per_page.append(_dfs_per_page)
            table_texts_per_page.append(_texts_per_page)
            table_jsons_bbox_per_page.append(_jsons_bbox_per_page)

    return pdf_images_with_table_boxes, table_dfs_per_page, table_texts_per_page, table_jsons_bbox_per_page

In [40]:
# Root directory of the input PDFs and root directory for all generated output.
SOP_DIR = "data/sop"
RESULTS_DIR = "Table Extraction Demo 1"

# Documents to process: "type" selects the sub-directory of SOP_DIR and
# "id" is the PDF file name without its extension.
SOPS_INFO = [
    {"type": "Andover", "id": "LAB-2352"},
    {"type": "Kalamazoo", "id": "MCD-088122"},
    {"type": "UNK", "id": "19897"},
]

In [43]:
# Run table detection on every configured document and write all outputs
# (annotated PDF, CSV files, JSON texts, JSON bounding boxes) under
# RESULTS_DIR/<type>_<id>/.
for sop in SOPS_INFO:
    _subdir = os.path.join(RESULTS_DIR, f"{sop['type']}_{sop['id']}")
    _sop_file_path = os.path.join(SOP_DIR, sop['type'], f"{sop['id']}.pdf")
    _sop_pdf_tables_file = os.path.join(_subdir, f"{sop['id']}.pdf")
    _sop_csv_tables_dir = os.path.join(_subdir, "csv")
    _sop_json_table_texts_dir = os.path.join(_subdir, "text")
    _sop_json_table_bbox_dir = os.path.join(_subdir, "bbox")
    # exist_ok=True makes re-running the cell safe without a separate
    # existence check.
    os.makedirs(_subdir, exist_ok=True)

    images_with_table_boxes, table_dfs, table_texts, table_bboxes = detect_tables(filename=_sop_file_path)

    images_to_pdf(images=images_with_table_boxes, filename=_sop_pdf_tables_file)
    tables_to_csv(tables=table_dfs, dir=_sop_csv_tables_dir)
    tables_to_json(tables=table_texts, dir=_sop_json_table_texts_dir)
    tables_to_json(tables=table_bboxes, dir=_sop_json_table_bbox_dir)

In [45]:
!zip -r demo.zip Table\ Extraction\ Demo\ 1 

  adding: Table Extraction Demo 1/ (stored 0%)
  adding: Table Extraction Demo 1/Kalamazoo_MCD-088122/ (stored 0%)
  adding: Table Extraction Demo 1/Kalamazoo_MCD-088122/csv/ (stored 0%)
  adding: Table Extraction Demo 1/Kalamazoo_MCD-088122/csv/table_5_2.csv (deflated 58%)
  adding: Table Extraction Demo 1/Kalamazoo_MCD-088122/csv/table_5_3.csv (stored 0%)
  adding: Table Extraction Demo 1/Kalamazoo_MCD-088122/csv/table_3_2.csv (deflated 53%)
  adding: Table Extraction Demo 1/Kalamazoo_MCD-088122/csv/table_14_2.csv (deflated 63%)
  adding: Table Extraction Demo 1/Kalamazoo_MCD-088122/csv/table_21_5.csv (deflated 23%)
  adding: Table Extraction Demo 1/Kalamazoo_MCD-088122/csv/table_5_1.csv (deflated 30%)
  adding: Table Extraction Demo 1/Kalamazoo_MCD-088122/csv/table_16_1.csv (deflated 30%)
  adding: Table Extraction Demo 1/Kalamazoo_MCD-088122/csv/table_17_1.csv (deflated 30%)
  adding: Table Extraction Demo 1/Kalamazoo_MCD-088122/csv/table_10_2.csv (deflated 64%)
  adding: Table Ext