%%capture
!pip install --upgrade pymupdf

In [4]:
import requests
import pymupdf
from pprint import pprint
from collections import deque

In [15]:
doc_url = "https://dl.mitsubishielectric.com/dl/fa/document/manual/robot/bfp-a3447/bfp-a3447q.pdf"
file_path = "data/manual.pdf"
with open(file_path, mode="wb+") as f:
    doc_file = requests.get(doc_url)
    f.write(doc_file.content)

In [17]:
doc = pymupdf.open(file_path)
# doc = pymupdf.open("data/bfp-a3447q.pdf")
# toc = doc.get_toc(simple= False)  # list of objects [level, title, page_number, dest]

In [19]:
for index, drawing in enumerate(doc[28].cluster_drawings()):
    doc[28].get_pixmap(matrix=pymupdf.Matrix(2,2), clip= drawing).save(f'graphics_{index}.jpg')

In [16]:
doc[28].get_pixmap(matrix=pymupdf.Matrix(2,2), clip= doc[28].cluster_drawings()[0]).save('test.jpg')

In [6]:
doc[28].get_images()

[]

In [8]:
def build_hierarchy(flat_data):
    root = []
    stack = deque()  # stack to track current parent chain

    for item in flat_data:
        node = {
            "level": item[0],
            "name": item[1],
            "page": item[3]['page'],
            "dest": item[3],
            "text": item[1],           # placeholder for future content
            "children": []
        }

        # If stack is empty or current node is top-level
        if not stack:
            root.append(node)
            stack.append(node)
        else:
            # Pop until we find a parent node with lower level
            while stack and stack[-1]["level"] >= node["level"]:
                stack.pop()

            if stack:
                stack[-1]["children"].append(node)
            else:
                root.append(node)

            stack.append(node)

    return root


In [None]:
# Pretty print the result
nested = build_hierarchy(toc)
pprint(nested, sort_dicts=False, width=120)

In [9]:
def get_full_text(chapter):
    texts = [chapter["text"]]
    for child in chapter.get("children", []):
        texts.append(get_full_text(child))
    return "\n".join(texts)


In [10]:
get_full_text(nested[1]['children'][0])

'2.1 Standard specifications\n2.1.1 Basic specifications\n(1) RH-3CH-Sxx\n(2) RH-6CH-Sxx\n2.1.2 The counter-force applied to the installation surface'

In [16]:
doc.xref_length()

8428

In [23]:
doc.xref_get_keys(8294)

['Count', 'Dest', 'First', 'Last', 'Next', 'Parent', 'Title']

In [24]:
doc.xref_get_key(8294 , key)(8308)

('string', 'G7.3513738')

In [8]:
page = doc[20]
page.get_textpage().

'                               2Robot arm\n   Standard specifications   2-8\n2 Robot arm\n2.1 Standard specifications\n2.1.1 Basic specifications\n(1) RH-3CH-Sxx\nTable 2-1 ：Standard specifications of robot arm\nType\nRH-3CH4018-S11/S15\nRH-3CH4018-S23/S24\nRH-3CH4018-S51/S52\nEnvironment\nStandard specification\nInstallation posture\nOn floor\nDegree of freedom\n4\nStructure\nHorizontal, multiple-joint type\nDrive system \nAC servo motor\nPosition detection method\nAbsolute encoder\nMotor capacity\nJ1\nＷ\n200\nJ2\nＷ\n100\nJ3 (Z)\nＷ\n100\nJ4 (θ)\nＷ\n100\nBrake\nJ1, J2, J4 axes: no brake\nJ3 axis: with brake\nJ1, J2 axes: no brake\nJ3, J4 axes: with brake\nArm length\n№1 arm\nmm\n225\n№2 arm\nmm\n175\nReach radius (№ 1+ № 2)\nmm\n400\nOperating range\nJ1\ndeg\n264(±132)\nJ2\ndeg\n282(±141)\nJ3 (Z)\nmm\n180\n130\n180\nJ4 (θ)\ndeg\n720(±360)\nSpeed of motion Note 1) \nNote 1) The maximum speed is the value which applied MvTune2 (high-speed movement mode).\nIn addition, it is the value du

In [14]:
[ link.dest for link in page.links() if link['page']==13]

AttributeError: 'dict' object has no attribute 'dest'

In [15]:
link = next(page.links())

In [4]:
blocks = doc[15].get_text("dict")["blocks"]
for b in blocks:
    y0 = b["bbox"][1]
    if 50 < y0 < 775:  # Ignore header/footer
    # print(f'\n{y0}:')
        for bb in b['lines']:
            # print(f"{bb['spans'][0]['origin']}:\t {bb['spans'][0]['text']}")
            print(f"{bb['spans'][0]['bbox']}\t {bb['spans'][0]['text'][:30]}")
            # pprint(bb['spans'][0])

(53.459999084472656, 75.93843841552734, 297.8989562988281, 86.91844177246094)	 1.2.2 Combination of the robot
(67.62000274658203, 87.97547149658203, 285.6304931640625, 97.93547058105469)	 Table 1-1 ：Combination of robo
(53.459999084472656, 229.80137634277344, 200.15518188476562, 241.80137634277344)	 1.3 CE marking specifications
(67.62000274658203, 246.91546630859375, 151.07086181640625, 256.8754577636719)	 The robot shown in 
(67.6200180053711, 272.35528564453125, 305.8254089355469, 282.3152770996094)	 Table 1-2 ：Robot models with C
(53.459999084472656, 414.1213684082031, 144.9456024169922, 426.1213684082031)	 1.4 Indirect export
(67.62000274658203, 431.2955017089844, 362.8234558105469, 441.2554931640625)	 The display in English is avai
(53.459999084472656, 467.8213806152344, 169.16641235351562, 479.8213806152344)	 1.5 Instruction manuals
(67.62000274658203, 484.9353942871094, 533.0260009765625, 494.8953857421875)	 The instruction manuals suppli
(67.61898803710938, 496.9352111816406, 