In [66]:
from operator import itemgetter
import fitz
import json
import xlwt

def text_sizes_count(doc):
    """Count how often each font size occurs in the text spans of a PDF.

    :param doc <class 'fitz.fitz.Document'>: PDF document to iterate through

    :rtype: [(text_size, count), (text_size, count), ...]
    :return: (size, count) pairs ordered by count, most frequent first;
        sizes with equal counts keep the order in which they were first seen
    :raises ValueError: if no text span is found in the document
    """
    size_counts = {}

    for page in doc:
        # Refer - https://pymupdf.readthedocs.io/en/latest/app1.html#dict-or-json
        for block in page.get_text("dict")["blocks"]:
            if block['type'] != 0:
                # Only type-0 blocks are read for "lines"; others are skipped.
                continue
            for line in block["lines"]:
                for span in line["spans"]:
                    size = span['size']
                    size_counts[size] = size_counts.get(size, 0) + 1

    # An empty tally means there was no text at all, so there is nothing to
    # rank and no paragraph size could be derived from the result.
    if not size_counts:
        raise ValueError("No text found in the PDF document")

    size_counts = sorted(size_counts.items(), key=itemgetter(1), reverse=True)

    print("font_count- {0}: ".format(size_counts))

    return size_counts


def text_tags(size_counts):
    """Returns dictionary with text sizes as keys and tags as value.

    The most frequent size (the first entry of ``size_counts``) is taken to
    be the paragraph size and tagged ``'p'``. Every larger size is a heading
    and is tagged ``'h1'``, ``'h2'``, ... from the largest size downwards.
    Sizes smaller than the paragraph size are tagged ``'p'`` as well, so that
    every size in the input has an entry in the result.

    :param size_counts<list>: (font_size, count) for all fonts occurring in
        the document, most frequent first; the list is not modified

    :rtype: dict
    :raises IndexError: if ``size_counts`` is empty
    """
    para_size = float(size_counts[0][0])  # extract the paragraph's size

    # Heading levels follow font size, not frequency: walk the distinct sizes
    # from largest to smallest. A sorted copy is used so the caller's list
    # keeps its order.
    sizes_descending = sorted({size for size, _count in size_counts}, reverse=True)

    size_tag = {}
    level = 0
    for size in sizes_descending:
        if size > para_size:  # if size > para, it should be a heading
            level += 1
            size_tag[size] = 'h{0}'.format(level)
        else:
            # Paragraph size itself, or smaller text, which is treated as
            # body text rather than being left without a tag.
            size_tag[size] = 'p'

    print("size tag - {0}: ".format(size_tag))
    return size_tag


def headers_para(doc, size_tags):
    """Splits headers & paragraphs from PDF

    Text spans are visited in document order. A span whose tag is not 'p'
    starts a new group (unless it continues the previous span's tag, in which
    case its text is appended to the current header). Spans tagged 'p' are
    collected as the paragraph text of the current group.

    :param doc <class 'fitz.fitz.Document'>: PDF document
    :param size_tags<dict>: element tags and size mapping; sizes missing from
        the mapping are treated as paragraph text ('p')

    :rtype: dict
    :return: mapping of group index (as a string) to a dict holding a
        'header' and/or 'para' entry. Paragraph text found before the first
        header is stored under group "0", which then has no 'header' entry.
    """
    header_para = {}

    idx = 0
    current_group = "{0}".format(idx)
    previous_tag = None  # tag of the last non-blank span; None before the first

    for page in doc:
        for block in page.get_text("dict")["blocks"]:
            if block['type'] != 0:  # only text blocks carry lines/spans
                continue
            for line in block["lines"]:
                for span in line["spans"]:
                    text = span['text']
                    if not text.strip():  # ignore whitespace-only spans
                        continue

                    # Unknown sizes fall back to paragraph text instead of
                    # aborting the whole extraction with a KeyError.
                    tag = size_tags.get(span['size'], 'p')
                    is_header = tag != 'p'

                    if tag == previous_tag:
                        # Same kind of text as the previous span: extend it.
                        part = 'header' if is_header else 'para'
                        header_para[current_group][part] += text
                    elif is_header:
                        # create new pair(header,para)
                        idx += 1
                        current_group = "{0}".format(idx)
                        header_para[current_group] = {'header': text}
                    else:
                        # First paragraph span of the group. The group may
                        # not exist yet when the document opens with body
                        # text, so create it on demand.
                        header_para.setdefault(current_group, {})['para'] = text

                    previous_tag = tag
    return header_para

def main(document='sampleText.pdf', output='headerParasMapping.xls'):
    """Extract header/paragraph groups from a PDF and save them to an .xls file.

    Each group becomes one spreadsheet row: the header text goes into
    column 0 and the paragraph text into column 1. A group that lacks one of
    the two parts leaves the corresponding cell empty.

    :param document<str>: path of the PDF file to read
    :param output<str>: path of the .xls workbook to write
    """
    doc = fitz.open(document)
    try:
        size_counts = text_sizes_count(doc)
        size_tag = text_tags(size_counts)
        header_para_groups = headers_para(doc, size_tag)
    finally:
        # Release the PDF even if extraction fails part-way through.
        doc.close()

    # saving to xls workbook
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet("Sheet Name")

    # One shared style that wraps text in the cell; it is identical for every
    # row, so it is built once rather than per iteration.
    style = xlwt.XFStyle()
    style.alignment.wrap = 1

    for row, group in enumerate(header_para_groups.values()):
        if 'header' in group:
            sheet.write(row, 0, group['header'], style)
        if 'para' in group:
            sheet.write(row, 1, group['para'], style)

    workbook.save(output)
    print("done")

# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()

font_count- [(10.5, 80), (11.5, 6), (18.0, 4), (52.5, 1)]: 
size tag - {52.5: 'h1', 18.0: 'h2', 11.5: 'h3', 10.5: 'p'}: 
done
