In [3]:
from operator import itemgetter
import fitz
import json

In [4]:
def fonts(doc, granularity=False):
    """Tally how often each font style occurs across the text spans of a PDF.

    :param doc: PDF document whose pages are iterated
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: when true, spans are keyed by size, flags, font and
        color; otherwise by size alone
    :type granularity: bool

    :rtype: [(identifier, count), (identifier, count)], dict
    :return: style identifiers sorted by descending span count, and a mapping
        from identifier to the style attributes of the last span seen with it
    :raises ValueError: if no text span was found in the document
    """
    style_by_key = {}
    usage = {}

    # Lazily walk every block of every page, keeping only text blocks (type 0).
    text_blocks = (
        block
        for page in doc
        for block in page.get_text("dict")["blocks"]
        if block['type'] == 0
    )

    for block in text_blocks:
        for line in block["lines"]:
            for span in line["spans"]:
                size = span['size']
                if granularity:
                    key = f"{size}_{span['flags']}_{span['font']}_{span['color']}"
                    info = {
                        'size': size,
                        'flags': span['flags'],
                        'font': span['font'],
                        'color': span['color'],
                    }
                else:
                    # Size-only key: spans with different fonts share one entry,
                    # and the most recent span's font overwrites the previous one.
                    key = f"{size}"
                    info = {'size': size, 'font': span['font']}

                style_by_key[key] = info
                usage[key] = usage.get(key, 0) + 1

    # Stable sort: identifiers with equal counts keep first-seen order.
    ranked = list(usage.items())
    ranked.sort(key=itemgetter(1), reverse=True)

    if not ranked:
        raise ValueError("Zero discriminating fonts found!")

    return ranked, style_by_key

In [5]:
def font_tags(font_counts, styles):
    """Returns dictionary with font sizes as keys and tags as value.

    The style with the highest count is treated as body text and tagged
    '<p>'. Sizes larger than it are tagged '<h1>', '<h2>', ... from the
    largest downwards; sizes smaller than it are tagged '<s1>', '<s2>', ...
    from the largest of the small sizes downwards.

    :param font_counts: (identifier, count) for all fonts occurring in the
        document, sorted by descending count; the identifier is either a bare
        size string or a composite "size_flags_font_color" string
    :type font_counts: list
    :param styles: all styles found in the document, keyed by identifier,
        each with at least a 'size' entry
    :type styles: dict

    :rtype: dict
    :return: all element tags based on font-sizes
    """
    p_style = styles[font_counts[0][0]]  # style of the most used font (paragraph)
    p_size = p_style['size']

    def size_of(identifier):
        # Prefer the recorded style: composite identifiers cannot be parsed
        # with float(). Fall back to parsing a bare size string when the
        # identifier has no entry in styles.
        style = styles.get(identifier)
        if style is not None:
            return float(style['size'])
        return float(identifier)

    # Unique sizes, high to low, so each distinct size consumes exactly one
    # heading/sub-text index even when several styles share a size.
    font_sizes = sorted({size_of(identifier) for identifier, _ in font_counts}, reverse=True)

    idx = 0
    size_tag = {}
    for size in font_sizes:
        idx += 1
        if size == p_size:
            idx = 0  # restart numbering for the sizes below paragraph size
            size_tag[size] = '<p>'
        elif size > p_size:
            size_tag[size] = '<h{0}>'.format(idx)
        elif size < p_size:
            size_tag[size] = '<s{0}>'.format(idx)

    return size_tag

In [6]:
def _write_paragraph(out_dir, name, page_no, par_no, size, text):
    """Write one paragraph file: the font size immediately followed by the text."""
    path = f"{out_dir}/{name}_page{page_no}_para{par_no}.txt"
    with open(path, 'w', encoding='utf-8') as text_file:
        text_file.write(str(size))
        text_file.write(text)


def headers_para(doc, name, out_dir="C:/Users/DELL/Desktop/Asha/text-as-page"):
    """Split a PDF's text into runs of equal font size and write each run to a file.

    Spans are read in document order. Consecutive spans with the same font
    size are accumulated into one run; when the size changes, the finished
    run is written to ``{out_dir}/{name}_page{P}_para{K}.txt``, where P is the
    1-based number of the page on which the size change was detected and K
    is a 1-based counter that restarts on every page. Runs of at most one
    character are discarded. The run still buffered when the document ends
    is written as well.

    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param name: prefix used for the output file names
    :type name: str
    :param out_dir: existing directory that receives the output files
    :type out_dir: str

    :rtype: list
    :return: list reserved for headers and paragraphs; nothing is appended
        to it, so it is always empty
    """
    header_para = []  # list with headers and paragraphs (never populated)
    curr_size = -1  # font size of the run being accumulated; -1 before any span
    total_text = ""  # text of the run being accumulated
    par_num = 0  # number of files written for the current page
    page_index = None  # stays None when the document has no pages

    for page_index, page in enumerate(doc):
        print(f"ENTERING PAGE {page_index}")
        par_num = 0
        for b in page.get_text("dict")['blocks']:
            if b['type'] != 0:  # skip blocks that do not contain text
                continue
            for line in b['lines']:
                for span in line['spans']:
                    if curr_size == span['size']:
                        # Same size: the span extends the current run.
                        total_text += " " + span['text'] + "\n"
                        continue

                    # Size changed: emit the finished run unless it is trivial.
                    if len(total_text) > 1:
                        _write_paragraph(out_dir, name, page_index + 1, par_num + 1,
                                         curr_size, total_text)
                        par_num += 1
                    curr_size = span['size']
                    total_text = span['text']

    # Flush the trailing run; without this the last paragraph of the document
    # would be dropped because no later size change triggers a write.
    if page_index is not None and len(total_text) > 1:
        _write_paragraph(out_dir, name, page_index + 1, par_num + 1, curr_size, total_text)

    return header_para

In [7]:
# Driver cell: pick a PDF by base name, open it, and split it into
# per-paragraph text files.
# `name` is both the PDF's file stem and the prefix of every output file.
name = "book-no-4"

# Absolute path of the input PDF (machine-specific location).
document = f'C:/Users/DELL/Desktop/Asha/pdfs/{name}.pdf'

# document = "C:/Users/DELL/Desktop/Sem 5 Text Books/DS Elective/Distributed Systems.pdf"
# name = "DS"

# Open the document with fitz; the handle is not closed in this cell.
doc = fitz.open(document)

# Font statistics for the document. The results are not used below while the
# font_tags call stays commented out; fonts() still raises ValueError if the
# document contains no text spans.
font_counts, styles = fonts(doc, granularity=False)

# size_tag = font_tags(font_counts, styles)

# Writes the paragraph files as a side effect; the returned list is empty
# (see the printed `[]`).
elements = headers_para(doc, name)
print(elements)

ENTERING PAGE 0
ENTERING PAGE 1
ENTERING PAGE 2
ENTERING PAGE 3
ENTERING PAGE 4
ENTERING PAGE 5
ENTERING PAGE 6
ENTERING PAGE 7
ENTERING PAGE 8
ENTERING PAGE 9
ENTERING PAGE 10
ENTERING PAGE 11
ENTERING PAGE 12
ENTERING PAGE 13
ENTERING PAGE 14
ENTERING PAGE 15
ENTERING PAGE 16
ENTERING PAGE 17
ENTERING PAGE 18
ENTERING PAGE 19
ENTERING PAGE 20
ENTERING PAGE 21
ENTERING PAGE 22
ENTERING PAGE 23
ENTERING PAGE 24
ENTERING PAGE 25
ENTERING PAGE 26
ENTERING PAGE 27
ENTERING PAGE 28
ENTERING PAGE 29
ENTERING PAGE 30
ENTERING PAGE 31
ENTERING PAGE 32
[]


In [8]:
# with open("C:/Users/DELL/Desktop/Asha/text-as-page/test.txt", 'w') as json_out:
#     json.dump(elements, json_out)

In [9]:
# size_tag

In [1]:
# # # # # Clear the folder
# import os

# folder_path = "C:/Users/DELL/Desktop/Asha/text-as-page"  # Replace with the path to your folder

# # Iterate over the files in the folder and delete them
# for filename in os.listdir(folder_path):
#     file_path = os.path.join(folder_path, filename)
#     try:
#         if os.path.isfile(file_path):
#             os.remove(file_path)
#     except Exception as e:
#         print(f"Error deleting {file_path}: {e}")

# print("All files in the folder have been deleted.")

All files in the folder have been deleted.
