In [1]:
import fitz
import re
import pandas as pd
import fitz
import pdfplumber
import xml.etree.ElementTree as ET


def split_lines_and_extract_bboxes(page, text):
    """Yield ``(line, match)`` for every place a line of ``text`` is found on ``page``.

    ``text`` is split on newlines. Each non-blank line is stripped,
    regex-escaped and passed to ``page.search``; every match the search returns
    is yielded together with the original (unstripped) line.

    Args:
        page: Object exposing ``search(pattern)`` that returns an iterable of
            matches (presumably a pdfplumber page -- confirm against the caller).
        text: Possibly multi-line string to locate on the page.

    Yields:
        ``(line, match)`` tuples, one per match reported by ``page.search``.
    """
    for line in text.split('\n'):
        needle = line.strip()
        if not needle:
            # A blank line would become an empty search pattern, which has no
            # text to locate; skip it instead of querying the page.
            continue

        for bbox in page.search(re.escape(needle)) or ():
            yield line, bbox


def parse_xml_pdf(xml_file, pdf):
    """Locate the text of every page-tagged XML element on its PDF page.

    Every element of the XML tree that has a non-empty ``pNum`` attribute and
    non-empty text is searched for, line by line, on ``pdf.pages[pNum - 1]``
    via ``split_lines_and_extract_bboxes``. All matches of the same
    ``(tag, line)`` pair within one element are grouped into a single record.

    Args:
        xml_file: Path (or file object) accepted by ``ET.parse``.
        pdf: Object whose ``pages`` sequence holds pages exposing ``width`` and
            ``height`` (presumably a pdfplumber PDF -- confirm against caller).

    Returns:
        List of dicts with keys ``page`` (the raw ``pNum`` string), ``tag``,
        ``text``, ``bbox`` (list of ``(x0, top, x1, bottom)`` tuples),
        ``width`` and ``height``. Failures are reported with ``print`` and
        never raised; an unreadable XML file yields an empty list.
    """
    results = []
    keys_to_extract = ('x0', 'top', 'x1', 'bottom')

    try:
        root = ET.parse(xml_file).getroot()
    except Exception as e:
        # Best-effort: report which file failed (and why), return nothing.
        print("--->", xml_file, e)
        return results

    for element in root.iter():
        pg_num = element.attrib.get('pNum')
        text = element.text.strip() if element.text else ""
        if not pg_num or not text:
            continue

        tag = element.tag
        try:
            # The page lookup lives inside the per-element handler so that a
            # single bad page number skips that element only, instead of
            # discarding every element that follows it in the document.
            page_index = int(pg_num) - 1
            if page_index < 0:
                # A negative index would silently wrap around to the last page.
                raise IndexError(f'invalid page number {pg_num!r}')
            page = pdf.pages[page_index]

            line_bbox_dict = {}
            for line, bbox in split_lines_and_extract_bboxes(page, text):
                rect = tuple(bbox[key] for key in keys_to_extract)
                key = (tag, line)
                if key in line_bbox_dict:
                    line_bbox_dict[key]['bbox'].append(rect)
                else:
                    line_bbox_dict[key] = {
                        'page': pg_num,
                        'tag': tag,
                        'text': line,
                        'bbox': [rect],
                        'width': page.width,
                        'height': page.height,
                    }

            results.extend(line_bbox_dict.values())
        except Exception as e:
            print(f'Error: {e}')

    return results


def _bbox_contains(outer, inner):
    """Return True when ``inner`` lies entirely inside ``outer`` (both x1, y1, x2, y2)."""
    ox1, oy1, ox2, oy2 = outer
    ix1, iy1, ix2, iy2 = inner
    return ox1 <= ix1 and oy1 <= iy1 and ox2 >= ix2 and oy2 >= iy2


def remove_dup_bboxes(results):
    """Drop boxes of multi-match records that are covered by another record.

    For a record with more than one box, every other record on the same page
    with a different tag whose text contains this record's text is treated as
    a container: any of this record's boxes lying entirely inside one of the
    container's boxes is removed.

    Args:
        results: List of dicts with ``page``, ``tag``, ``text`` and ``bbox``
            (list of ``(x1, y1, x2, y2)`` tuples). The ``bbox`` lists are
            modified in place.

    Returns:
        The same ``results`` list.
    """
    for dup_item in results:
        for res in results:
            is_candidate = (
                len(dup_item['bbox']) > 1
                and dup_item['page'] == res['page']
                and dup_item['text'] in res['text']
                and dup_item['tag'] != res['tag']
            )
            if not is_candidate:
                continue

            containers = list(res['bbox'])
            # Rebuild the list instead of calling remove() while iterating it:
            # removing during iteration makes the loop skip the element that
            # follows each removed box. Slice assignment keeps the same list
            # object, so existing references to it stay valid.
            dup_item['bbox'][:] = [
                box for box in dup_item['bbox']
                if not any(_bbox_contains(outer, box) for outer in containers)
            ]
    return results
    


def remove_dup_text(cleaned_bboxes):
    """Keep the first record for each distinct ``(text, bbox)`` combination.

    Args:
        cleaned_bboxes: Iterable of dicts with a ``text`` entry and a ``bbox``
            entry (a list whose items are hashable).

    Returns:
        New list holding, in input order, the records whose text and boxes
        have not been seen on an earlier record.
    """
    fingerprints = set()
    kept = []

    for entry in cleaned_bboxes:
        # The box list is frozen into a tuple so the pair can live in a set.
        signature = (entry['text'], tuple(entry['bbox']))
        if signature in fingerprints:
            continue
        fingerprints.add(signature)
        kept.append(entry)

    return kept


def get_figure_boxes(xml_file, pdf):
    """Collect the bounding box of every ``Figure`` element in the XML file.

    Each figure's ``pNum`` attribute selects ``pdf.pages[pNum - 1]`` (used for
    the page size) and its ``bBox`` attribute -- four whitespace-separated
    numbers, optionally wrapped in ``[]`` -- becomes the box. The text of a
    child ``alt`` element, when present, is used as the record text.

    Args:
        xml_file: Path (or file object) accepted by ``ET.parse``.
        pdf: Object whose ``pages`` sequence holds pages exposing ``width`` and
            ``height`` (presumably a pdfplumber PDF -- confirm against caller).

    Returns:
        List of dicts with keys ``page`` (raw ``pNum`` string), ``tag``
        (always ``"Figure"``), ``text``, ``bbox`` (one-element list holding a
        4-tuple of floats), ``width`` and ``height``. Figures that cannot be
        parsed are reported with ``print`` and skipped.
    """
    figure_list = []
    try:
        root = ET.parse(xml_file).getroot()
    except Exception as e:
        # Best-effort: report the unreadable file (and why), return nothing.
        print(f'fig--->Error: {xml_file}: {e}')
        return figure_list

    for element in root.iter("Figure"):
        try:
            # Everything that can fail for one figure stays inside this
            # handler, so a figure with a missing/invalid pNum or bBox is
            # skipped without discarding the figures that follow it.
            page_num = element.get('pNum')
            page_index = int(page_num) - 1
            if page_index < 0:
                # A negative index would silently wrap around to the last page.
                raise IndexError(f'invalid page number {page_num!r}')
            page = pdf.pages[page_index]

            bbox = tuple(
                float(value) for value in element.get('bBox').strip('[]').split()
            )
            if len(bbox) != 4:
                # Later stages unpack the box into exactly four coordinates.
                raise ValueError(f'expected 4 bBox values, got {len(bbox)}')

            alt = element.find('alt')
            figure_list.append({
                'page': page_num,
                'tag': "Figure",
                'text': alt.text if alt is not None else None,
                'bbox': [bbox],
                'width': page.width,
                'height': page.height,
            })
        except Exception as e:
            print(f'Error: {e}')

    return figure_list

# XML figure boxes and PDF page boxes measure y from opposite edges of the page, so flip the y-axis using the page height.
def convert_figure_coordinates_to_pdf(itext7_coordinates, pdf_height):
    """Mirror the first box of each figure record vertically within the page.

    Args:
        itext7_coordinates: Iterable of dicts with ``page``, ``tag``, ``text``,
            ``width``, ``height`` and ``bbox`` (only ``bbox[0]``, a 4-item
            ``(x1, y1, x2, y2)`` sequence, is read).
        pdf_height: Page height the y values are subtracted from.

    Returns:
        New list of dicts carrying the same six keys, where ``bbox`` is a
        one-element list holding ``(x1, pdf_height - y2, x2, pdf_height - y1)``.
    """
    converted = []
    for figure in itext7_coordinates:
        left, lower, right, upper = figure['bbox'][0]
        converted.append(dict(
            page=figure['page'],
            tag=figure['tag'],
            text=figure['text'],
            # The two y values swap roles once the axis is flipped.
            bbox=[(left, pdf_height - upper, right, pdf_height - lower)],
            width=figure['width'],
            height=figure['height'],
        ))
    return converted


def process_page(pdf_path, fig_results, text_unique_coordinates):
    """Flip figure boxes into page coordinates and merge them with the text boxes.

    Each figure is flipped with the media-box height of *its own* page.
    (Previously every figure was flipped with the height of the last page in
    the document, and a document with no pages raised ``UnboundLocalError``.)

    Args:
        pdf_path: Path passed to ``fitz.open``.
        fig_results: Figure records with ``page`` (1-based page number as a
            string), ``bbox`` and ``height`` keys.
        text_unique_coordinates: List of text records, appended unchanged.

    Returns:
        New list: the flipped figure records followed by the text records.
    """
    doc = fitz.open(pdf_path)
    try:
        # Keys mirror the 'page' field of the records: 1-based numbers as strings.
        page_heights = {
            str(page.number + 1): page.mediabox.height for page in doc.pages()
        }
    finally:
        # The document is only needed for the page heights; release it at once.
        doc.close()

    fitz_figure_coordinates = []
    for figure in fig_results:
        height = page_heights.get(str(figure['page']))
        if height is None:
            # Page not present in the opened document: fall back to the page
            # height recorded on the figure record itself.
            height = figure['height']
        fitz_figure_coordinates.extend(
            convert_figure_coordinates_to_pdf([figure], height)
        )

    return fitz_figure_coordinates + text_unique_coordinates

#starts here
def get_pdf_coords_for_xml_tags(xml_file_path, pdf,  pdf_path):
    """Run the full XML-to-PDF coordinate pipeline for one document.

    Text records come from ``parse_xml_pdf`` and are passed through
    ``remove_dup_bboxes`` and then ``remove_dup_text``. Figure records come
    from ``get_figure_boxes``. ``process_page`` combines the two.

    Args:
        xml_file_path: Path of the tagged XML file.
        pdf: Opened PDF object handed to the XML/figure parsers.
        pdf_path: Path of the same PDF, handed to ``process_page``.

    Returns:
        Whatever ``process_page`` returns for the figure and text records.
    """
    unique_data = remove_dup_text(
        remove_dup_bboxes(parse_xml_pdf(xml_file_path, pdf))
    )
    print("len of unique data:", len(unique_data))

    return process_page(
        pdf_path,
        get_figure_boxes(xml_file_path, pdf),
        unique_data,
    )




In [18]:
import os
import shutil
import fitz
import pdfplumber
from pdf2image import convert_from_path
import json
from copy import deepcopy
from collections import defaultdict
# from map_xml_tags_pdf_coords import get_pdf_coords_for_xml_tags

# Convert PDF coordinates (x0, y0, x1, y1) to image-space [x, y, width, height], the box layout the COCO format expects.
def convert_pdf_to_image_cords(img_height, img_width, pdf_height, pdf_width, img_cc):
    """Scale a PDF-space box to image space and express it as x, y, width, height.

    Args:
        img_height: Height of the rendered image.
        img_width: Width of the rendered image.
        pdf_height: Height of the PDF page.
        pdf_width: Width of the PDF page.
        img_cc: Box as an indexable ``(x0, y0, x1, y1)`` in PDF units.

    Returns:
        ``[x, y, width, height]`` in image units.
    """
    scale_y = img_height / pdf_height
    scale_x = img_width / pdf_width

    left = img_cc[0] * scale_x
    top = img_cc[1] * scale_y
    right = img_cc[2] * scale_x
    bottom = img_cc[3] * scale_y

    # Corner form -> origin plus extent.
    return [left, top, right - left, bottom - top]

def updated_json_results(pdf_path, pdf, json_data, saved_images_path, image_size=(1025, 1025)):
    """Render the PDF pages to PNG files and rescale every box to image space.

    Each page is rendered with ``convert_from_path`` and saved as
    ``<pdf stem>_<page index>.png`` under ``saved_images_path``. Every box of
    every record in ``json_data`` is then converted with
    ``convert_pdf_to_image_cords`` and emitted as one flat annotation dict.

    Args:
        pdf_path: Path of the PDF to render.
        pdf: Object whose ``pages`` expose ``page_number``; used to map a
            record's ``page`` value to the index of its rendered image.
        json_data: Records with ``page``, ``tag``, ``text``, ``bbox`` (list of
            ``(x0, y0, x1, y1)``), ``width`` and ``height`` (page size).
        saved_images_path: Existing directory that receives the PNG files.
        image_size: Size passed to ``convert_from_path``; defaults to the
            previously hard-coded ``(1025, 1025)``.

    Returns:
        List with one dict per box: ``bbox`` as ``[x, y, w, h]``,
        ``segmentation`` as the rectangle polygon, ``area``, the image size
        and file name, plus fixed collection/category metadata.
    """
    filename = os.path.basename(pdf_path)
    # splitext keeps dots inside the name ("report.v1.pdf" -> "report.v1"), so
    # differently named PDFs cannot collapse onto the same image file name.
    stem = os.path.splitext(filename)[0]

    images = convert_from_path(pdf_path, size=image_size)
    for i, image in enumerate(images):
        image.save(os.path.join(saved_images_path, '{}_{}.png'.format(stem, i)))

    # One lookup table instead of rescanning every page for every record.
    image_index_by_page = {
        str(page.page_number): i for i, page in enumerate(pdf.pages)
    }

    new_json_update = []
    for data in json_data:
        i = image_index_by_page.get(str(data['page']))
        if i is None or i >= len(images):
            # No rendered image for this page: nothing to annotate.
            continue
        image = images[i]

        for bbox in data['bbox']:
            new_bbox = convert_pdf_to_image_cords(
                image.height, image.width, data['height'], data['width'], bbox
            )
            x, y, w, h = new_bbox

            new_json_update.append({
                'page': data['page'],
                'tag': data['tag'],
                'text': data['text'],
                'bbox': new_bbox,
                # Rectangle corners as a polygon. The lower edge is y + h;
                # the earlier code used w + h there, which is not a coordinate.
                'segmentation': [[x, y, x, y + h, x + w, y + h, x + w, y]],
                'area': w * h,
                'width': image.width,
                'height': image.height,
                'file_name': '{}/{}_{}.png'.format(saved_images_path, stem, i),
                # NOTE(review): 'apex_repors' looks like a typo for
                # 'apex_reports'; left unchanged in case consumers match on it.
                "collection": 'apex_repors',
                'doc_name': filename,
                "precedence": 0,
                'doc_category': 'reports'
            })

    return new_json_update

#convert to final coco_format for training
def convert_to_coco_format(input_json):
    """Assemble categories, images and annotations from per-document records.

    Args:
        input_json: Iterable of per-document lists; each item of an inner list
            is a dict with ``tag``, ``file_name``, ``width``, ``height``,
            ``doc_name``, ``collection``, ``page``, ``precedence``, ``bbox``,
            ``segmentation`` and ``area``.

    Returns:
        A 1-tuple ``(coco,)`` where ``coco`` has the keys ``categories``,
        ``images`` and ``annotaions``. Both the tuple wrapper and the
        misspelled ``annotaions`` key are kept on purpose: the cell that
        splits the dumped JSON iterates the outer list and reads that key.
        NOTE(review): the COCO format spells the key ``annotations``; rename
        it here and in the consumers together.
    """
    coco = {
        'categories': [],
        'images': [],
        'annotaions': []
    }

    category_ids = {}
    # file_name -> image id. Annotations must point at the image they were
    # drawn on; deriving the id from a running counter attached them to the
    # most recently added image whenever their own image was added earlier.
    image_ids = {}

    for data in input_json:
        for record in data:

            # Categories creation
            tag_name = record["tag"]
            if tag_name not in category_ids:
                category_ids[tag_name] = len(category_ids) + 1
                coco["categories"].append({
                    "supercategory": tag_name,
                    "id": category_ids[tag_name],
                    "name": f"{tag_name}"
                })

            # Images creation (one entry per distinct file name)
            file_name = record['file_name']
            if file_name not in image_ids:
                image_ids[file_name] = len(image_ids) + 1
                coco['images'].append({
                    'id': image_ids[file_name],
                    'width': record['width'],
                    'height': record['height'],
                    'file_name': file_name,
                    'doc_name': record['doc_name'],
                    'collection': record["collection"],
                    'page_no': int(record["page"]),
                    'precedence': record["precedence"]
                })

            # Annotations section
            coco['annotaions'].append({
                "id": len(coco['annotaions']) + 1,
                "image_id": image_ids[file_name],
                "category_id": category_ids[tag_name],
                "bbox": record['bbox'],
                "segmentation": record['segmentation'],
                "area": record['area'],
                "file_name": file_name,
                "precedence": record['precedence']
            })

    return coco,
# category_ids,category_names

#for testing on single document
def single_file_main(saved_images_path):
    """Run the pipeline on one hard-coded PDF/XML pair (manual testing helper).

    Args:
        saved_images_path: Directory that receives the rendered page images.

    Returns:
        The result of ``convert_to_coco_format`` for this single document.
    """
    # NOTE(review): the PDF is Doc_4 but the XML is Doc_3 -- confirm that this
    # pairing is intentional.
    pdf_path = '/home/roufa/pdf_accessibity_notebook/Top_50_pdfs/output/Doc_4.pdf'
    xml_file_path = '/home/roufa/pdf_accessibity_notebook/Top_50_pdfs/output/Doc_3.xml'

    # One handle serves both steps and is closed on exit; the PDF used to be
    # opened a second time and that handle was never closed.
    with pdfplumber.open(pdf_path) as pdf:
        all_coordinates = get_pdf_coords_for_xml_tags(xml_file_path, pdf, pdf_path)
        updated_json_data = updated_json_results(pdf_path, pdf, all_coordinates, saved_images_path)

    # convert_to_coco_format iterates a list of per-document record lists, so
    # the single document has to be wrapped in a list.
    return convert_to_coco_format([updated_json_data])


def get_coco_format_data(input_data_path, saved_images_path):
    """Process every ``*.pdf`` in a folder together with its sibling ``.xml``.

    Files are visited in sorted name order. For each PDF the XML path is the
    same path with the extension replaced by ``.xml``. Documents that produce
    no coordinates or no annotations are left out.

    Args:
        input_data_path: Directory holding the PDF/XML pairs.
        saved_images_path: Directory that receives the rendered page images.

    Returns:
        The result of ``convert_to_coco_format`` over all processed documents.
    """
    final_comp_json = []

    for file in sorted(os.listdir(input_data_path)):
        if not file.endswith(".pdf"):
            continue

        pdf_path = os.path.join(input_data_path, file)
        # Swap only the trailing extension; str.replace would also rewrite a
        # ".pdf" that happens to appear earlier in the file name.
        xml_file_path = os.path.splitext(pdf_path)[0] + ".xml"
        print()
        print("pdf:", pdf_path)
        print("xml:", xml_file_path)

        with pdfplumber.open(pdf_path) as pdf:
            all_coordinates = get_pdf_coords_for_xml_tags(xml_file_path, pdf, pdf_path)

            # Still inside the `with`: the PDF object is read again here, so it
            # must not have been closed yet.
            if all_coordinates:
                updated_json_data = updated_json_results(pdf_path, pdf, all_coordinates, saved_images_path)
                if updated_json_data:
                    final_comp_json.append(updated_json_data)

    return convert_to_coco_format(final_comp_json)


#starts here
# input_data_path
def main(input_data_path, saved_images_path=None):
    """Recreate the image folder and build the COCO data for a whole folder.

    Args:
        input_data_path: Directory holding the PDF/XML pairs.
        saved_images_path: Directory for the rendered page images. It is
            DELETED (with all its contents) and recreated empty on every call.
            Defaults to the location this notebook has always used.

    Returns:
        The result of ``get_coco_format_data``.
    """
    if saved_images_path is None:
        # Historical default; an alternative considered earlier was an
        # 'Images' folder under the current working directory.
        saved_images_path = '/home/roufa/pdf_accessibity_notebook/top_30_pdfs/final_test_Data/pdf_Accessiblity/images_folder'

    # Start from an empty folder so stale renders from earlier runs cannot
    # leak into the new dataset.
    if os.path.exists(saved_images_path):
        shutil.rmtree(saved_images_path)

    os.makedirs(saved_images_path, exist_ok=True)

    # For a single-document smoke test use single_file_main(saved_images_path).
    coco_format_data = get_coco_format_data(input_data_path, saved_images_path)

    return coco_format_data


input_data_path = '/home/roufa/pdf_accessibity_notebook/top_30_pdfs/final_test_Data/output_xml'
coco_format_data = main(input_data_path)
# coco_format_data = main()


output_json_file = '/home/roufa/pdf_accessibity_notebook/top_30_pdfs/final_test_Data/pdf_Accessiblity/json_file/final_coco.json'

# Create the destination folder first so that opening the file for writing
# cannot fail on a fresh checkout where it does not exist yet.
os.makedirs(os.path.dirname(output_json_file), exist_ok=True)

# Write the JSON data to the file. convert_to_coco_format returns a 1-tuple,
# so the file holds a one-element JSON list wrapping the COCO dict.
with open(output_json_file, 'w') as json_file:
    json.dump(coco_format_data, json_file, indent=2)


pdf: /home/roufa/pdf_accessibity_notebook/top_30_pdfs/final_test_Data/output_xml/Doc_1.pdf
xml: /home/roufa/pdf_accessibity_notebook/top_30_pdfs/final_test_Data/output_xml/Doc_1.xml
len of unique data: 514
Error: could not convert string to float: 'R'
Error: could not convert string to float: 'R'
Error: could not convert string to float: 'R'
Error: could not convert string to float: 'R'
Error: could not convert string to float: 'R'
Error: could not convert string to float: 'R'
Error: could not convert string to float: 'R'
Error: could not convert string to float: 'R'
Error: could not convert string to float: 'R'
Error: could not convert string to float: 'R'
Error: could not convert string to float: 'R'
Error: could not convert string to float: 'R'
Error: could not convert string to float: 'R'
Error: could not convert string to float: 'R'
Error: could not convert string to float: 'R'
Error: could not convert string to float: 'R'
Error: could not convert string to float: 'R'
Error: coul

In [21]:
# Show the contents of `data` (assigned in the cell that reads final_coco.json with json.load).
data

[{'categories': [{'supercategory': 'P', 'id': 1, 'name': 'P'},
   {'supercategory': 'H2', 'id': 2, 'name': 'H2'},
   {'supercategory': 'H3', 'id': 3, 'name': 'H3'},
   {'supercategory': 'TH', 'id': 4, 'name': 'TH'},
   {'supercategory': 'TD', 'id': 5, 'name': 'TD'},
   {'supercategory': 'Lbl', 'id': 6, 'name': 'Lbl'},
   {'supercategory': 'LBody', 'id': 7, 'name': 'LBody'},
   {'supercategory': 'Span', 'id': 8, 'name': 'Span'},
   {'supercategory': 'H1', 'id': 9, 'name': 'H1'},
   {'supercategory': 'H4', 'id': 10, 'name': 'H4'}],
  'images': [{'id': 1,
    'width': 1025,
    'height': 1025,
    'file_name': '/home/roufa/pdf_accessibity_notebook/top_30_pdfs/final_test_Data/pdf_Accessiblity/images_folder/Doc_1_0.png',
    'doc_name': 'Doc_1.pdf',
    'collection': 'apex_repors',
    'page_no': 1,
    'precedence': 0},
   {'id': 2,
    'width': 1025,
    'height': 1025,
    'file_name': '/home/roufa/pdf_accessibity_notebook/top_30_pdfs/final_test_Data/pdf_Accessiblity/images_folder/Doc_1_

In [31]:
import json
json_path = '/home/roufa/pdf_accessibity_notebook/top_30_pdfs/final_test_Data/pdf_Accessiblity/json_file/final_coco.json'
# Close the input file as soon as it has been parsed.
with open(json_path) as f:
    data = json.load(f)

# print(data['images'])


# `data` is a list of COCO dicts (the dump wraps the dict in a one-element list).
# Sort the ids so the train/test split is the same on every run; it is taken in
# id order, not shuffled.
files = sorted({item["id"] for item_1 in data for item in item_1['images']})
print(files)
files_len = len(files)
percentage = 0.8
train_files_count = int(files_len* percentage)
trian_files = files[:train_files_count]
test_files = files[train_files_count:]

# Set membership instead of scanning the id list once per image/annotation.
train_ids = set(trian_files)
test_ids = set(test_files)

train_images = [content_dict for item_1 in data for content_dict in item_1['images'] if content_dict['id'] in train_ids]
# train_images

# NOTE(review): 'annotaions' matches the key written by convert_to_coco_format;
# the COCO format spells it 'annotations' -- rename producer and consumers together.
train_images_annotations = [content_dict for item_1 in data for content_dict in item_1['annotaions'] if content_dict['image_id'] in train_ids]

test_images = [content_dict for item_1 in data for content_dict in item_1['images'] if content_dict['id'] in test_ids]


test_images_annotations = [content_dict for item_1 in data for content_dict in item_1['annotaions'] if content_dict['image_id'] in test_ids]


# Flatten into a single list of category dicts; collecting each document's list
# as-is produced a nested list-of-lists.
categories = [category for item_1 in data for category in item_1['categories']]

train_coco = {
    'categories': categories,
    'images':train_images,
    'annotaions':train_images_annotations
}

test_coco = {
    'categories': categories,
    'images':test_images,
    'annotaions':test_images_annotations
}


with open('/home/roufa/pdf_accessibity_notebook/top_30_pdfs/final_test_Data/pdf_Accessiblity/json_file/check_me_train.json','w') as f:
    json.dump(train_coco,f,indent=4)
    
    
with open('/home/roufa/pdf_accessibity_notebook/top_30_pdfs/final_test_Data/pdf_Accessiblity/json_file/check_me_test.json','w') as f:
    json.dump(test_coco,f,indent=4)

# train_images_annotations
# for content_dict in data:
    
#     print(data[content_dict])


[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]


In [32]:
!pwd

/home/roufa/pdf_accessibity_notebook/top_30_pdfs/final_test_Data/pdf_Accessiblity/preprocessing_notebooks
