In [2]:
from operator import itemgetter
import fitz
import json

In [3]:
def fonts(doc, granularity=False):
    """Tally how often each font style is used by the text spans of a PDF.

    Every span of every text block is assigned an identifier. With
    ``granularity=False`` the identifier is just the span's size rendered as a
    string; with ``granularity=True`` it is ``"size_flags_font_color"``.

    :param doc: PDF document to iterate through (an iterable of pages)
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: also use 'flags', 'font' and 'color' to tell styles apart
    :type granularity: bool

    :rtype: [(identifier, count), (identifier, count)], dict
    :return: identifiers with their span counts, most used first, and a dict
        mapping each identifier to the style fields of the last span seen with it
    :raises ValueError: if the document contains no text spans at all
    """
    # Fields copied from a span into its style record, in the recorded order.
    style_fields = ('size', 'flags', 'font', 'color') if granularity else ('size', 'font')
    style_by_id = {}
    usage = {}

    for page in doc:
        for block in page.get_text("dict")["blocks"]:
            if block['type'] != 0:  # only type-0 blocks carry text
                continue
            for line in block["lines"]:
                for span in line["spans"]:
                    if granularity:
                        key = f"{span['size']}_{span['flags']}_{span['font']}_{span['color']}"
                    else:
                        key = f"{span['size']}"
                    # Later spans overwrite earlier ones that share the same key.
                    style_by_id[key] = {field: span[field] for field in style_fields}
                    usage[key] = usage.get(key, 0) + 1

    # Stable sort: identifiers with equal counts stay in first-seen order.
    ranked = sorted(usage.items(), key=itemgetter(1), reverse=True)
    if not ranked:
        raise ValueError("Zero discriminating fonts found!")

    return ranked, style_by_id

In [4]:
def font_tags(font_counts, styles):
    """Returns dictionary with font sizes as keys and tags as value.

    The most used style (first entry of ``font_counts``) is treated as the
    paragraph style and tagged ``<p>``. Larger sizes are tagged ``<h1>``,
    ``<h2>``, ... from largest to smallest; smaller sizes are tagged ``<s1>``,
    ``<s2>``, ... from largest to smallest.

    Sizes are read from ``styles`` so that identifiers of any form work,
    including the ``"size_flags_font_color"`` identifiers produced with
    granularity enabled. Several styles sharing one size yield a single tag.

    :param font_counts: (identifier, count) for all fonts occurring in document,
        sorted by count with the most used first
    :type font_counts: list
    :param styles: all styles found in the document, keyed by identifier
    :type styles: dict

    :rtype: dict
    :return: all element tags based on font-sizes (float size -> tag string)
    :raises IndexError: if ``font_counts`` is empty
    :raises KeyError: if the most used identifier is missing from ``styles``
    """
    # The list is sorted by usage, so the first entry is the paragraph style.
    p_size = float(styles[font_counts[0][0]]['size'])

    # Collect each distinct size once. Prefer the size recorded in the style;
    # fall back to parsing the identifier itself when it has no style entry.
    distinct_sizes = set()
    for identifier, _count in font_counts:
        style = styles.get(identifier)
        if style is not None:
            distinct_sizes.add(float(style['size']))
        else:
            distinct_sizes.add(float(identifier))

    # Walk sizes from largest to smallest so the running index numbers the tags.
    idx = 0
    size_tag = {}
    for size in sorted(distinct_sizes, reverse=True):
        idx += 1
        if size == p_size:
            idx = 0  # restart numbering for the sizes below the paragraph size
            size_tag[size] = '<p>'
        elif size > p_size:
            size_tag[size] = '<h{0}>'.format(idx)
        else:
            size_tag[size] = '<s{0}>'.format(idx)

    return size_tag

In [5]:
def headers_para(doc, size_tag, pageCount, output_dir="C:/Users/DELL/Desktop/Asha/text-as-page"):
    """Pairs up consecutive non-blank text spans and writes each pair to a text file.

    Spans are visited in document order. The first non-blank span is held back
    as the "header"; the next non-blank span is written together with it
    (header line, newline, second span), each prefixed by the tag for its font
    size. The held header is then cleared and the cycle repeats.

    Files are named ``page_<page>_para_<n>.txt`` inside ``output_dir``. The
    counter ``n`` restarts on every page and is incremented before each write,
    so the first file written for a page is ``..._para_2.txt``.

    NOTE: a held header carries over to the following block or page, and a span
    still held when processing stops is never written.

    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param size_tag: textual element tags for each size
    :type size_tag: dict
    :param pageCount: the maximum number of pages to process
    :type pageCount: int
    :param output_dir: existing directory that receives the text files
    :type output_dir: str

    :rtype: None
    :raises KeyError: if a span's size has no entry in ``size_tag``
    """
    pages_done = 0  # number of pages processed so far
    header_text = ""  # tagged span waiting for its partner

    for page_num, page in enumerate(doc, start=1):
        if pages_done >= pageCount:
            break
        pages_done += 1
        blocks = page.get_text("dict")["blocks"]
        paraCount = 1
        for b in blocks:  # iterate through the blocks
            if b['type'] != 0:  # skip blocks that do not contain text
                continue
            # REMEMBER: multiple fonts and sizes are possible IN one block
            for line in b["lines"]:  # iterate through the text lines
                for s in line["spans"]:  # iterate through the text spans
                    if not s['text'].strip():  # ignore whitespace-only spans
                        continue
                    tagged = size_tag[s['size']] + s['text']
                    if not header_text:
                        header_text = tagged
                        continue
                    paraCount += 1
                    filename = f"{output_dir}/page_{page_num}_para_{paraCount}.txt"
                    with open(filename, 'w', encoding='utf-8') as text_file:
                        text_file.write(header_text + "\n" + tagged)
                    header_text = ""

    return

# Example usage:
# Replace 'doc' and 'size_tag' with your PDF document and size tag dictionary
# Replace 'pageCount' with the desired maximum number of pages to process
# headers_para(doc, size_tag, pageCount)


In [6]:

# Path of the PDF to process (machine-specific absolute path).
document = 'C:/Users/DELL/Desktop/Asha/pdfs/book-no-1.pdf'
doc = fitz.open(document)
page = doc[0]  # first page; not referenced again in this cell

# Count span usage per font size (granularity=False keys styles by size only).
font_counts, styles = fonts(doc, granularity=False)

# Map every font size to a tag relative to the most used size.
size_tag = font_tags(font_counts, styles)

# Process at most 65 pages; the text files are written as a side effect.
# headers_para ends with a bare `return`, so `elements` is always None.
elements = headers_para(doc, size_tag, 65)

In [7]:
# with open("C:/Users/DELL/Desktop/Asha/text-as-page/test.txt", 'w') as json_out:
#     json.dump(elements, json_out)

In [8]:
size_tag

{63.321998596191406: '<h1>',
 40.0: '<h2>',
 36.0: '<h3>',
 32.0: '<h4>',
 23.95800018310547: '<h5>',
 22.0: '<h6>',
 13.0: '<h7>',
 12.0: '<h8>',
 11.0: '<p>',
 10.0: '<s1>',
 9.0: '<s2>',
 8.0: '<s3>',
 7.155417442321777: '<s4>',
 6.0: '<s5>',
 3.6989998817443848: '<s6>'}

In [10]:
# Clear the output folder
import os

folder_path = "C:/Users/DELL/Desktop/Asha/text-as-page"  # Replace with the path to your folder

# Delete every regular file directly inside the folder (sub-directories are left alone).
# Failures are remembered so the success message is only printed when nothing went wrong.
failed_paths = []
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    try:
        if os.path.isfile(file_path):
            os.remove(file_path)
    except OSError as e:  # e.g. file locked or permission denied; keep going with the rest
        failed_paths.append(file_path)
        print(f"Error deleting {file_path}: {e}")

if failed_paths:
    print(f"{len(failed_paths)} file(s) could not be deleted.")
else:
    print("All files in the folder have been deleted.")

All files in the folder have been deleted.
