# Text Extractor for Table

In [1]:
import os
import glob
import pandas
import numpy as np
from bs4 import BeautifulSoup

In [3]:
# Directory prefix for the annotated guidance pages;
# load_data_frames_from_html() reads '<guidance_dir><target>.html'.
guidance_dir = '../guidance_page_collection/guidance_pages_annotated/'
# Upper bound (inclusive) of the page numbers visited by the driver loop,
# which iterates range(1, NUM + 1).
NUM = 59

In [2]:
# Directory prefix for the '<guidance>_tab.txt' files written by analyze().
# NOTE(review): nothing in the visible code creates this directory, so it
# presumably has to exist before the driver loop runs -- confirm.
output_dir_path = 'extracted_texts/'

In [4]:
# Lower-cased annotation markers; a table whose header contains any of them is
# rendered in "header, value." form.  The trailing numbers are carried over
# from the original inline comments (presumably the guidance page numbers
# where each marker variant was seen -- TODO confirm).
_DATA_HEADER_MARKERS = (
    '[[[{dat}category]]]',
    '[[[{dat}data category]]]',  # 47
    '[[[{dat}play category]]]',  # 27
    '[[[{dat}data]]]',
    '[[[{dat}data type]]]',  # 9, 19, 37, 43
    '[[[{dat}data types]]]',
    '[[[{dat}datapoint]]]',
    '[[[{pra}collected]]]',
    '[[[{pur}purpose]]]',
    '[[[{pur}data purpose]]]',
    '[[[{pur}data collection purpose]]]',  # 44
)


def _finish_row(text):
    """Strip trailing spaces, then commas, then periods, and append '.\\n'."""
    return text.rstrip(' ').rstrip(',').rstrip('.') + '.\n'


def convert_table_to_text(data_frame):
    """Flatten one table into plain-text sentences, one line per table row.

    If the columns are integer-labelled (no header row was recognised when
    the table was parsed), the first row is taken as the header and skipped
    in the body.  Otherwise the column labels are the header.

    When any string header cell contains one of ``_DATA_HEADER_MARKERS``
    (case-insensitive), each row becomes ``"<header>, <cell>. "`` pairs, with
    empty-string and float cells (which is how NaN shows up) left out.
    Otherwise a notice is printed and the header plus every row is emitted
    as space-separated cell values.

    Args:
        data_frame: a ``pandas.DataFrame`` holding one table.

    Returns:
        The generated text; every emitted line ends with ``'.\\n'``.
    """
    n_rows, n_cols = data_frame.shape

    # Dtype-based check instead of isinstance(..., Int64Index): that class was
    # removed in pandas 2.0.  An empty frame has no first row to promote.
    headerless = n_rows > 0 and pandas.api.types.is_integer_dtype(
        data_frame.columns
    )
    if headerless:
        header = [data_frame.iloc[0, j] for j in range(n_cols)]
        body_offset = 1
    else:
        header = list(data_frame.columns)
        body_offset = 0

    # Only string cells can carry a marker.  Skipping everything else covers
    # NaN (numpy or plain float), MultiIndex tuples and numeric labels, which
    # would otherwise fail on .lower().
    data_header_found = any(
        marker in val.lower()
        for val in header
        if isinstance(val, str)
        for marker in _DATA_HEADER_MARKERS
    )

    lines = []
    if data_header_found:
        for i in range(body_offset, n_rows):
            pieces = []
            for j in range(n_cols):
                cell = data_frame.iloc[i, j]
                # Float cells are skipped wholesale (NaN placeholders for
                # missing values are floats), as are empty strings.
                if isinstance(cell, float) or cell == '':
                    continue
                pieces.append(f'{header[j]}, {cell}. ')
            lines.append(_finish_row(''.join(pieces)))
    else:
        print(f'Header not detected: {header = }')
        # The header line strips spaces and periods only (no comma strip).
        header_text = ''.join(f'{val} ' for val in header)
        lines.append(header_text.rstrip(' ').rstrip('.') + '.\n')
        for i in range(body_offset, n_rows):
            row_text = ''.join(
                f'{data_frame.iloc[i, j]} ' for j in range(n_cols)
            )
            lines.append(_finish_row(row_text))

    return ''.join(lines)

In [5]:
def load_data_frames_from_html(target):
    """Parse every table in one annotated guidance page into DataFrames.

    Reads ``f'{guidance_dir}{target}.html'``, normalises the markup with
    BeautifulSoup and hands the result to ``pandas.read_html``.

    Args:
        target: page identifier used to build the HTML file name.

    Returns:
        A list of ``pandas.DataFrame`` objects, or ``None`` when the file is
        missing or ``pandas.read_html`` raises ``ValueError`` (reported as
        "No tables found").
    """
    # Local import so this block does not depend on the notebook's import cell.
    from io import StringIO

    html_file_path = f'{guidance_dir}{target}.html'

    if not os.path.isfile(html_file_path):
        print(f'HTML source not found for {target = }')
        return None

    # Context manager closes the handle; the original left it open.
    with open(html_file_path, 'r') as html_file:
        html_content = html_file.read()
    print(f'{len(html_content) = }')
    soup = BeautifulSoup(html_content, 'html.parser')

    # A <th> whose direct parent is a <div> gets that <div> unwrapped (the
    # wrapper tag is removed, its children stay in place).
    # NOTE(review): presumably needed so the table parser sees the header
    # cells -- confirm.
    for th in soup.find_all('th'):
        if th.parent.name == 'div':
            th.parent.unwrap()

    # Replace each list item inside a table cell with its right-stripped text
    # plus a period, so items stay separated once the cell is flattened.
    for td in soup.find_all('td'):
        for li in td.find_all('li'):
            li.string = li.text.rstrip() + '.'

    try:
        # Passing literal HTML to read_html is deprecated since pandas 2.1;
        # a file-like object works on old and new versions alike.
        data_frames = pandas.read_html(StringIO(str(soup)))
    except ValueError:
        print('No tables found')
        return None

    return data_frames

In [6]:
def analyze(guidance):
    """Convert all tables of one guidance page to text and save the result.

    Loads the page's tables via ``load_data_frames_from_html``; if that
    yields nothing (``None`` or an empty list) the function returns without
    writing.  Otherwise it prints the table count, concatenates the output of
    ``convert_table_to_text`` for each table in order, and writes it to
    ``f'{output_dir_path}{guidance}_tab.txt'``.

    Args:
        guidance: page identifier, used for both the input and output names.

    Returns:
        None.
    """
    tables = load_data_frames_from_html(guidance)
    if not tables:
        return

    print(f'#tables = {len(tables)}')

    # All tables are converted before the output file is opened.
    text = ''.join(convert_table_to_text(table) for table in tables)

    out_path = f'{output_dir_path}{guidance}_tab.txt'
    with open(out_path, 'w') as out_file:
        out_file.write(text)

In [8]:
# Driver: run analyze() for every page number from 1 to NUM inclusive,
# echoing the current number first so the log lines below can be attributed.
for target in range(1, NUM + 1):
    print(f'{target = }')
    analyze(target)

print('Done')

target = 1
len(html_content) = 15109
No tables found
target = 2
len(html_content) = 81960
#tables = 14
target = 3
len(html_content) = 22284
No tables found
target = 4
len(html_content) = 253256
#tables = 2
Header not detected: header = ['Question', 'Developer Response']
target = 5
len(html_content) = 213377
#tables = 15
Header not detected: header = ['Does your app collect or share any of the required user data types?', 'Yes']
target = 6
len(html_content) = 127070
#tables = 1
target = 7
len(html_content) = 94173
#tables = 1
target = 8
len(html_content) = 90527
#tables = 1
target = 9
len(html_content) = 56877
#tables = 1
target = 10
len(html_content) = 256010
#tables = 2
target = 11
len(html_content) = 131390
#tables = 2
Header not detected: header = ['Action', 'Details']
target = 12
len(html_content) = 65625
#tables = 1
target = 13
len(html_content) = 407279
#tables = 13
Header not detected: header = ['Privacy Manager Flag', 'Features']
Header not detected: header = ['Data', 'Descripti