# Colab init

In [1]:
# Mount Google Drive so files stored there are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Project folder on the mounted drive; later cells build file paths from it.
folder_path = '/content/drive/My Drive/Colab Notebooks/IR_Project/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install pdfplumber



# Test: extract a heading hierarchy from a PDF

In [74]:
from collections import Counter

def font_beautify(font_sizes):
    """Map each distinct font size to a level name.

    The most frequent size is named "Body". Every other size is named
    "Header 1", "Header 2", ... in descending size order, so the largest
    remaining size is "Header 1".

    Args:
        font_sizes: Iterable of font sizes, one entry per text line. It may
            be a one-shot iterator; it is materialised once internally.

    Returns:
        Dict mapping size -> level name, headers first (largest to smallest)
        and the body size last. Empty input yields an empty dict.
    """
    # Materialise once so generators work and the input is only walked once.
    sizes = list(font_sizes)
    if not sizes:
        # Nothing to classify; most_common(1)[0] would raise IndexError here.
        return {}

    # On ties, Counter.most_common keeps the size that was encountered first.
    body_font_size = Counter(sizes).most_common(1)[0][0]

    # Every non-body size becomes a header level, ranked largest first.
    header_sizes = sorted(set(sizes) - {body_font_size}, reverse=True)
    fonts = {size: f"Header {rank}" for rank, size in enumerate(header_sizes, start=1)}

    fonts[body_font_size] = "Body"
    return fonts


In [75]:

import pdfplumber

def extract_paragraphs(pdf_path):
    """Read every text line of a PDF together with its leading font info.

    Each line returned by pdfplumber's ``page.extract_text_lines()`` is
    described by the attributes of its first character only.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        A tuple ``(paragraphs, font_sizes)``:
          * paragraphs: list of dicts with keys 'text' (the line text),
            'size' (first character's size rounded to one decimal),
            'font' (first character's fontname) and 'position' (first
            character's ``top`` value as reported by pdfplumber).
          * font_sizes: the 'size' of every kept line, in the same order.
    """
    paragraphs = []
    font_sizes = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for block in page.extract_text_lines():
                chars = block['chars']
                if not chars:
                    # No character metadata to read size/font/top from;
                    # skip instead of failing on chars[0].
                    continue

                content = str(block['text'])
                if content.isdigit():
                    # Digit-only lines are left out (presumably page
                    # numbers - confirm against real documents).
                    continue

                first_char = chars[0]
                font_size = round(first_char['size'], 1)
                font_sizes.append(font_size)
                paragraphs.append({
                    'text': content,
                    'size': font_size,
                    'font': first_char['fontname'],
                    'position': first_char['top'],
                })

    return paragraphs, font_sizes

# Parse the sample PDF, then name each distinct font size "Body" or "Header N".
pdf_path = folder_path +'sample.pdf'
paragraphs ,font_sizes = extract_paragraphs(pdf_path)
fonts = font_beautify(font_sizes)
# Despite the label text, this prints the size -> level-name mapping, not a bare list of sizes.
print("Unique font sizes:", fonts)
# Debug aid: uncomment to dump every extracted line with its size and font.
# for paragraph in paragraphs:
#     print(f"Text: {paragraph['text']}, Size: {paragraph['size']}, Font: {paragraph['font']}")

Unique font sizes: {36.0: 'Header 1', 18.0: 'Header 2', 12.0: 'Body'}


In [76]:
import json
from collections import defaultdict


def group_by_size_recursively(paragraphs, current_size, fonts, labels, min_position=50):
    """Nest a flat list of text lines into layers keyed by heading text.

    The list is consumed front to back. Lines whose size equals
    ``current_size`` belong to this layer, a smaller size opens a nested
    layer (built by a recursive call at that size), and a larger size closes
    this layer and is handed back to the caller.

    Args:
        paragraphs: List of dicts with "text", "size" and "position" keys, in
            reading order. Mutated in place: entries are popped as they are
            processed, so pass a copy to keep the original list.
        current_size: Font size of the layer being built.
        fonts: Mapping from font size to a level name. The name "Body" marks
            body text; every other name is treated as a heading level.
        labels: Shared list used as a stack to carry heading dicts across
            the recursion; pass an empty list on the outermost call.
        min_position: Lines whose "position" is below this value are
            skipped. Defaults to 50, the cutoff that used to be hard-coded.

    Returns:
        A list with one ``{label: nested_layer}`` dict per nested layer, in
        order, followed by the (possibly empty) list of body-text lines
        collected at this level as the final element. ``label`` is the text
        of the heading popped from ``labels``, or "subLayer" when the stack
        is empty.
    """
    # NOTE(review): a heading only reaches the result as the key of a nested
    # layer, so a heading that is not followed by smaller text before its
    # layer closes appears to be dropped - confirm this is intended.
    result = []
    lines = []
    current_obj = None  # heading currently being assembled, if any
    while paragraphs:
        paragraph = paragraphs.pop(0)
        if paragraph["position"] < min_position:
            continue

        size = paragraph["size"]

        # A larger size closes this layer: put the line back for the caller.
        if size > current_size:
            result.append(lines)
            paragraphs.insert(0, paragraph)
            return result

        level = fonts[size]
        if level != "Body":
            if current_obj is None:
                current_obj = {
                    "content": level,
                    "text": paragraph["text"],
                    "size": size,
                    "position": paragraph["position"],
                }
            elif level == fonts[current_obj["size"]]:
                # Same heading level again: treat it as a wrapped heading line.
                current_obj["text"] += " " + paragraph["text"]
            else:
                # A heading of another level ends the one being assembled.
                if not any(label["text"] == current_obj["text"] for label in labels):
                    labels.append(current_obj)
                current_obj = None

        if size == current_size:
            if level == "Body":
                lines.append(paragraph["text"])
        else:
            # Smaller size: stash the pending heading as this layer's label,
            # then let a recursive call consume the nested layer.
            if current_obj is not None:
                labels.append(current_obj)
                current_obj = None
            paragraphs.insert(0, paragraph)
            sub_layer = group_by_size_recursively(
                paragraphs, size, fonts, labels, min_position
            )

            if labels:
                label_text = labels.pop()["text"]
            else:
                label_text = "subLayer"

            result.append({label_text: sub_layer})

    result.append(lines)
    return result

labels = []
# group_by_size_recursively pops entries off the list it is given. Pass a copy
# so `paragraphs` stays intact and this cell can be re-run without failing on
# paragraphs[0].
layered_structure = group_by_size_recursively(list(paragraphs), paragraphs[0]["size"], fonts, labels)

json_output = json.dumps(layered_structure, indent=4)
print(json_output)


[
    {
        "Sample PDF": [
            {
                "This is a simple PDF file. Fun fun fun.": [
                    [
                        "Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Phasellus facilisis odio sed mi.",
                        "Curabitur suscipit. Nullam vel nisi. Etiam semper ipsum ut lectus. Proin aliquam, erat eget",
                        "pharetra commodo, eros mi condimentum quam, sed commodo justo quam ut velit.",
                        "Integer a erat. Cras laoreet ligula cursus enim. Aenean scelerisque velit et tellus.",
                        "Vestibulum dictum aliquet sem. Nulla facilisi. Vestibulum accumsan ante vitae elit. Nulla",
                        "erat dolor, blandit in, rutrum quis, semper pulvinar, enim. Nullam varius congue risus.",
                        "Vivamus sollicitudin, metus ut interdum eleifend, nisi tellus pellentesque elit, tristique",
                        "accumsan eros quam et risus. Suspendisse li