%%capture
!pip install pymupdf4llm

In [31]:
import pymupdf, pymupdf4llm
from pprint import pprint
import pathlib
from collections import deque

In [None]:
# Conversion settings and the shared Document handle used by the cells below.
# Source PDF to convert.
filename = "data/bfp-a3447q.pdf"
# Folder handed to to_markdown() as the location for extracted images.
image_path = "data/images"
# First page to convert; pages before it are skipped by the conversion loop.
content_first_page = 13 # 0-based page number
# Margins handed to to_markdown().
# NOTE(review): a 2-tuple is presumably (top, bottom) in points - confirm against the pymupdf4llm docs.
margins=(50,75)
# Accumulator for the Markdown text of all converted pages.
md = ""

doc = pymupdf.open(filename)  # use a Document for subsequent processing
my_headers = pymupdf4llm.TocHeaders(doc)  # use the table of contents for determining headers

In [3]:
# Convert the content pages to Markdown one page at a time.
# The per-page results are collected in a list and joined once at the end, so
# `md` is rebuilt from scratch every time this cell runs (re-running the cell no
# longer appends a second copy of the document to an existing `md`).
md_parts = []
for page in doc[content_first_page : doc.page_count]:
    print(f"\rProcessing page no {page.number+1}...", end='')
    for bb in page.cluster_drawings():
        page.draw_rect(bb, width=0.2)  # put extra border around detected graphics
    md_parts.append(
        pymupdf4llm.to_markdown(
            doc,
            pages=[page.number],
            margins=margins,
            hdr_info=my_headers,
            write_images=True,
            image_path=image_path,
            force_text=False,
        )
    )
md = "".join(md_parts)

Processing page no 139...

In [4]:
# Size of the collected Markdown, in characters.
len(md)

232255

In [6]:
import regex as re

# Single source of truth for the characters that are kept in the cleaned text.
# Both character-class patterns are built from it so they cannot drift apart.
allowed_chars = r"""a-zA-Z0-9!@#$%^&*()_\-+=\[\]{}|;:'",.<>/?\\`~ \t\n□△◇：±℃φ×Ω（）"""
pattern = "[^" + allowed_chars + "]"      # matches every character that is NOT allowed
anti_pattern = "[" + allowed_chars + "]"  # matches every character that IS allowed
# "Note " directly followed by a digit and a closing parenthesis, e.g. "Note 1)".
notes_pattern = r'Note (?=\d\))'

# `cleaned`: the Markdown with all non-allowed characters removed and the notes marked.
cleaned = re.sub(pattern, "", md)
cleaned = re.sub(notes_pattern, " *Note ", cleaned)
# `chaos`: the complement - only the characters that were removed, kept for inspection.
chaos = re.sub(anti_pattern, "", md)
# Split in front of every line that starts with '#', i.e. one chunk per Markdown heading.
splitted = re.split(r'\n(?=#)',cleaned)
pathlib.Path("output.md").write_bytes(cleaned.encode())

230370

In [7]:
#Manual correction for the missing chapters
# Each tuple is (text to search for in the extracted chunks,
#                text that replaces it at the start of the newly split-off chunk).
chapter_tuples = [
    ("(1) Machine cable （Fixed type）", "(1) Machine cable（Fixed type）"),
    ("Appendix 1 ：Classification of functions using external input/output signals" , "Appendix 1 ： Classification of functions using external input/output signals")
]
# Chapter name, if found on the beginning of string, is not a correct one:
# only hits located after this offset inside a chunk are accepted.
MIN_HEADING_OFFSET = 10

for chapter_name, chapter_toc_name in chapter_tuples:
    # Locate the first chunk that contains the chapter name past the offset.
    chapter_index = -1
    for index, chunk in enumerate(splitted):
        position = chunk.find(chapter_name)
        if position > MIN_HEADING_OFFSET:
            print(f"Text: {chapter_name} found in document on index ={index} on position {position}")
            chapter_index = index
            break

    if chapter_index == -1:
        print(f"Text: {chapter_name} not found in document")
        continue

    # Cut the chunk at the first occurrence of the chapter name and replace it,
    # in place, by the two halves; the second half starts with the corrected name.
    before, after = splitted[chapter_index].split(chapter_name, 1)
    splitted[chapter_index:chapter_index + 1] = [before, chapter_toc_name + after]
        
    

Text: (1) Machine cable （Fixed type） found in document on index =50 on position 190
Text: Appendix 1 ：Classification of functions using external input/output signals found in document on index =131 on position 14


In [8]:
# Save the characters that the cleaning step removed from the Markdown, for inspection.
pathlib.Path("data/chaos.md").write_bytes(chaos.encode())

8718

In [9]:
# Number of heading-delimited chunks currently held in `splitted`.
len(splitted)

133

In [17]:
# Index every chunk by its first line (the heading line of the chunk).
# Two chunks with an identical first line would silently overwrite each other in
# a dict, so duplicates are reported; the last chunk still wins, as before.
chunk_dict = {}
for chunk in splitted:
    heading = chunk.split('\n', 1)[0]
    if heading in chunk_dict:
        print(f"Duplicate heading {heading!r}: the earlier chunk is replaced")
    chunk_dict[heading] = chunk

In [16]:
# Table of contents of the PDF. The matching cell below appends the chunk text to
# these entries in place, so re-run this cell before re-running the matching.
toc = doc.get_toc()

In [18]:
def _attach_unique_chunks(chapters, chunks_by_heading, retrying=False):
    """Attach chunk text to every chapter whose title matches exactly one heading.

    A heading matches when the chapter title (``chapter[1]``) is a substring of
    it. On a unique match the chunk is removed from ``chunks_by_heading`` and
    appended to the chapter entry (both are modified in place). Chapters with
    zero or several matches are reported via ``print``.

    Args:
        chapters: TOC entries (lists) whose second element is the title.
        chunks_by_heading: dict mapping heading line -> chunk text.
        retrying: when True, successful matches are reported as well.

    Returns:
        The chapters that matched more than one heading, in input order.
    """
    ambiguous = []
    for chapter in chapters:
        title = chapter[1]
        # Materialize the matches first: the dict is modified right below.
        headings = [heading for heading in chunks_by_heading if title in heading]
        if len(headings) == 1:
            if retrying:
                print(f"Found finally {len(headings)} matches for {title}")
            chapter.append(chunks_by_heading.pop(headings[0]))
            continue
        print(f"Found {len(headings)} matches for {title}")
        if headings:
            ambiguous.append(chapter)
    return ambiguous


# First pass: assign every chunk that is identified unambiguously.
retries = _attach_unique_chunks(toc, chunk_dict)
# Second pass: titles that were ambiguous may be unique now that the chunks
# claimed in the first pass are gone. The list is no longer modified while it is
# being iterated (the old code appended title strings to it, which broke the loop).
unresolved = _attach_unique_chunks(retries, chunk_dict, retrying=True)
assert len(chunk_dict) == 0, f"Chunks without a TOC entry: {list(chunk_dict)}"

Found 2 matches for (1) RH-3CH-Sxx
Found 2 matches for 6 Safety
Found finally 1 matches for (1) RH-3CH-Sxx
Found finally 1 matches for 6 Safety


In [19]:
# Chunks that were not assigned to a TOC entry; the matching step pops every chunk it assigns.
chunk_dict

{}

In [20]:
# Preview the first three TOC entries; matched entries carry their chunk text as the last element.
toc[:3]

[[1, '1 General configuration', 14, '# 1 General configuration\n'],
 [2,
  '1.1 Structural equipment',
  14,
  '## 1.1 Structural equipment\n\nStructural equipment consists of the following types.\n'],
 [3,
  '1.1.1 Standard structural equipment',
  14,
  '### 1.1.1 Standard structural equipment\n\nThe following items are enclosed as a standard.\n(1) Robot arm\n(2) Controller\n(3) Machine cable\n(4) Robot arm installation bolts\n(5) Safety manual, CD-ROM (Instruction manual)\n(6) Guarantee card\n']]

In [21]:
import json
# Serialize the enriched TOC to a JSON string (despite its name, `json_file` is a str, not a file).
json_file=json.dumps(toc,indent=2)

In [24]:
# Persist the JSON text; the file content is JSON although the extension is .txt.
pathlib.Path("data/structured_file.txt").write_text(json_file)


245730

In [33]:
# Load the structured TOC from disk right here so that this preview does not rely
# on `data` having been defined by a cell further down (NameError on a fresh
# Restart & Run All otherwise).
data = json.loads(pathlib.Path("data/structured_file.txt").read_text())
data[:3]

[[1, '1 General configuration', 14, '# 1 General configuration\n'],
 [2,
  '1.1 Structural equipment',
  14,
  '## 1.1 Structural equipment\n\nStructural equipment consists of the following types.\n'],
 [3,
  '1.1.1 Standard structural equipment',
  14,
  '### 1.1.1 Standard structural equipment\n\nThe following items are enclosed as a standard.\n(1) Robot arm\n(2) Controller\n(3) Machine cable\n(4) Robot arm installation bolts\n(5) Safety manual, CD-ROM (Instruction manual)\n(6) Guarantee card\n']]

In [25]:
# Read the previously written JSON text back from disk.
json_read = pathlib.Path("data/structured_file.txt").read_text()

In [26]:
# Parse the JSON text into Python objects (a flat list of TOC entries).
data = json.loads(json_read)

In [34]:
def build_hierarchy(flat_data):
    """Turn a flat, level-annotated chapter list into a nested tree.

    Args:
        flat_data: iterable of sequences ``[level, title, page, content]`` in
            document order. ``content`` may be missing (an entry that never
            received chunk text); it then defaults to an empty string instead
            of raising ``IndexError``.

    Returns:
        A list of root nodes. Every node is a dict with the keys ``level``,
        ``chapter``, ``page``, ``content`` and ``children`` (a list of nodes).
        A node becomes a child of the closest preceding node with a strictly
        lower level; nodes without such a predecessor are roots.
    """
    root = []
    stack = []  # chain of currently open ancestors, deepest last

    for item in flat_data:
        node = {
            "level": item[0],
            "chapter": item[1],
            "page": item[2],
            "content": item[3] if len(item) > 3 else "",
            "children": [],
        }

        # Close every open node that is not an ancestor of the new node.
        while stack and stack[-1]["level"] >= node["level"]:
            stack.pop()

        # Attach to the innermost open ancestor, or to the root list if none is left.
        siblings = stack[-1]["children"] if stack else root
        siblings.append(node)
        stack.append(node)

    return root


In [35]:
# Nest the flat TOC entries into a tree of chapter nodes.
structured_data = build_hierarchy(data)

In [40]:
def get_full_text(chapter):
    """Return the text of a chapter node followed by the text of all its descendants.

    The node's own ``content`` comes first, then the full text of every child
    (depth-first, in order); the pieces are separated by a newline. A node
    without a ``children`` key is treated as a leaf.
    """
    pieces = [chapter["content"]]
    pieces.extend(get_full_text(sub) for sub in chapter.get("children", []))
    return "\n".join(pieces)

In [46]:
# Show the second child of the third top-level chapter together with all of its nested sections.
print(get_full_text(structured_data[2]['children'][1]))

## 3.2 Names of each part

### 3.2.1 Controller

Controller (Front side)

|<4> <15> <9> <10> <7> nt side)|Col2|Col3|Col4|Col5|Col6|
|---|---|---|---|---|---|
|<4><br><15><br><9><br><10><br><7>|<4><br><15><br><9><br><10><br><7>|<4><br><15><br><9><br><10><br><7>|<4><br><15><br><9><br><10><br><7>|<4><br><15><br><9><br><10><br><7>|<4><br><15><br><9><br><10><br><7>|
|||||||
|||||||
|||||||
|<3><br><5><br><6> <14> <13> <12><11><br><8>|<3><br><5><br><6> <14> <13> <12><11><br><8>|<3><br><5><br><6> <14> <13> <12><11><br><8>|<3><br><5><br><6> <14> <13> <12><11><br><8>|<3><br><5><br><6> <14> <13> <12><11><br><8>|<3><br><5><br><6> <14> <13> <12><11><br><8>|


![](data/images/bfp-a3447q.pdf-59-2.png)


Controller (Rear side)

![](data/images/bfp-a3447q.pdf-59-3.png)


Fig.3-1 ：Names of controller parts (CR751)


<1> ACIN connector.......................................The connector for AC power source (single phase, 200 VAC) input. (a
socket housing and a terminal are attached).
Refer to the separa