%%capture
!pip install pymupdf4llm

In [1]:
import pymupdf, pymupdf4llm
from pprint import pprint
import pathlib
from collections import deque

In [7]:
# --- Configuration and document loading ---
input_filename = "data/bfp-a3447q.pdf"
# Replace only the final extension. Splitting on the first '.' would truncate
# any path that contains a dot elsewhere (e.g. "./data/x.pdf", "manual.v2.pdf").
output_filename = str(pathlib.Path(input_filename).with_suffix('.txt'))
image_path = "data/images"  # passed to to_markdown() as the image output location
margins=(50,75)             # passed unchanged to to_markdown()
md = ""

doc = pymupdf.open(input_filename)  # use a Document for subsequent processing
my_headers = pymupdf4llm.TocHeaders(doc)  # use the table of contents for determining headers

# The conversion starts at the page of the first TOC entry, so a document
# without a TOC cannot be processed by this notebook; fail with a clear message.
toc_entries = doc.get_toc()
if not toc_entries:
    raise ValueError(f"{input_filename} has no table of contents")
content_first_page = toc_entries[0][-1] -1 # 0-based page number


In [3]:
# Display the PDF's metadata dictionary (title, author, producer, dates, ...).
doc.metadata

{'format': 'PDF 1.7',
 'title': 'RH-3CH-Sxx/RH-6CH-Sxx Special Specifications Manual',
 'author': 'Mitsubishi Electric Corporation',
 'subject': 'BFP-A3447-Q',
 'keywords': 'RH-3CH; RH-6CH',
 'creator': 'FrameMaker 2015.0.5',
 'producer': 'Acrobat Distiller 23.0 (Windows)',
 'creationDate': "D:20230404144342+09'00'",
 'modDate': "D:20230404144546+09'00'",
 'trapped': '',
 'encryption': None}

In [3]:
# Convert the manual to Markdown one page at a time, starting at the page of
# the first TOC entry.
# The per-page results are collected in a list and joined once, so `md` is
# rebuilt from scratch every time this cell runs. Appending with `md +=`
# would add a second copy of the document on a re-run.
md_pages = []
for page in doc[content_first_page : doc.page_count]:
    print(f"\rProcessing page no {page.number+1}...", end='')
    for bb in page.cluster_drawings():
        page.draw_rect(bb, width=0.2)  # put extra border around detected graphics
    md_pages.append(
        pymupdf4llm.to_markdown(
            doc,
            pages=[page.number],
            margins=margins,
            hdr_info=my_headers,
            write_images=True,
            image_path = image_path,
            force_text=False
        )
    )
md = "".join(md_pages)

Processing page no 140...

In [4]:
# Number of characters in the extracted Markdown.
len(md)

232255

In [4]:
import regex as re

# Characters allowed to survive cleaning, written as the inside of a regex
# character class. Both patterns are derived from this single definition so
# they stay exact complements of each other.
allowed_chars = r"""a-zA-Z0-9!@#$%^&*()_\-+=\[\]{}|;:'",.<>/?\\`~ \t\n□△◇：±℃φ×Ω（）"""
pattern = f"[^{allowed_chars}]"      # matches every character to remove
anti_pattern = f"[{allowed_chars}]"  # matches every character to keep
notes_pattern = r'Note (?=\d\))'     # "Note " directly followed by "<digit>)"

cleaned = re.sub(pattern, "", md)
cleaned = re.sub(notes_pattern, " *Note ", cleaned)
# `chaos` is the complement of the cleaning step: only the removed characters.
chaos = re.sub(anti_pattern, "", md)

# Split in front of every line that starts with '#', then strip the leading
# '#' markers so each chunk begins with its plain heading text.
splitted = re.split(r'\n(?=#)',cleaned)
splitted = [re.sub(r'^#+\s','',text) for text in splitted]

# `cleaned` still contains non-ASCII symbols from the allowed set, so the
# encoding is given explicitly instead of relying on the platform default.
pathlib.Path("output.md").write_text(cleaned, encoding="utf-8")

229695

In [5]:
# Spot-check one chunk: it starts with the heading text ('#' markers removed).
splitted[3]

'1.1.2 Special specifications\n\nFor the special specifications, some standard configuration equipment and specifications have to be changed\nbefore factory shipping. Confirm the delivery date and specify the special specifications at the order.\n'

In [6]:
#Manual correction for the missing chapters
# Each tuple is (heading text as it appears inside a chunk,
#                heading text to put at the start of the new chunk).
chapter_tuples = [
    ("(1) Machine cable （Fixed type）", "(1) Machine cable（Fixed type）"),
    ("Appendix 1 ：Classification of functions using external input/output signals" , "Appendix 1 ： Classification of functions using external input/output signals")
]
for chapter_name, chapter_toc_name in chapter_tuples:
    for index, chunk in enumerate(splitted):
        position = chunk.find(chapter_name)
        if position > 10: # Chapter name, if found on the beginning of string, is not a correct one
            print(f"Text: {chapter_name} found in document on index= {index} on position {position}")
            # Cut the chunk at the first occurrence of the heading: the text
            # before it stays as its own chunk, the text after it becomes a
            # new chunk that starts with the replacement heading.
            before, after = chunk.split(chapter_name, 1)
            # Replace the single chunk with the two parts in place. Changing
            # the list here is safe because the loop stops immediately.
            splitted[index:index + 1] = [before, chapter_toc_name + after]
            break
    else:
        # The inner loop finished without a break, so no chunk matched.
        print(f"Text: {chapter_name} not found in document")
        
    

Text: (1) Machine cable （Fixed type） found in document on index= 50 on position 187
Text: Appendix 1 ：Classification of functions using external input/output signals found in document on index= 131 on position 12


In [8]:
# Save the characters that the cleaning step removed, so they can be inspected.
chaos_bytes = chaos.encode()
pathlib.Path("data/chaos.md").write_bytes(chaos_bytes)

8718

In [7]:
# Number of chunks after the manual chapter corrections.
len(splitted)

133

In [8]:
# Index every chunk by its first line (the heading text). A later chunk with
# the same first line replaces an earlier one.
chunk_dict = {}
for chunk in splitted:
    heading = chunk.partition('\n')[0]
    chunk_dict[heading] = chunk

In [9]:
# Fetch the document's table of contents. Later cells read index 1 of each
# entry as the chapter title and append the matched chapter text to the entry.
toc = doc.get_toc()

In [10]:
def _attach_chunks(chapters, chunks, final_pass=False):
    """Append the matching chunk text to each TOC entry in ``chapters``.

    An entry matches a chunk when the entry's title (``chapter[1]``) is a
    substring of the chunk's heading (a key of ``chunks``). When exactly one
    heading matches, the chunk text is appended to the entry and removed from
    ``chunks``. Entries with zero or several matches are reported and left
    unchanged.

    Args:
        chapters: TOC entries (lists) to process; matched entries are mutated.
        chunks: dict mapping heading -> chunk text; matched items are popped.
        final_pass: when True, successful matches are reported as well.

    Returns:
        The entries that matched more than one heading. They may become
        unambiguous once other chunks have been consumed.
    """
    ambiguous = []
    for chapter in chapters:
        title = chapter[1]
        matches = [heading for heading in chunks if title in heading]
        if len(matches) == 1:
            if final_pass:
                print(f"Found finally {len(matches)} matches for {title}")
            chapter.append(chunks.pop(matches[0]))
        else:
            print(f"Found {len(matches)} matches for {title}")
            if matches:
                ambiguous.append(chapter)
    return ambiguous


# First pass over the whole TOC. Titles that are substrings of several
# headings are deferred until the other chunks have been claimed.
retries = _attach_chunks(toc, chunk_dict)

# Second pass over the deferred entries. Entries that are still ambiguous are
# collected in a separate list: appending to `retries` while iterating it
# would make the loop process the appended items as if they were TOC entries.
unresolved = _attach_chunks(retries, chunk_dict, final_pass=True)

assert len(chunk_dict) == 0, (
    f"{len(chunk_dict)} chunk(s) not assigned to a TOC entry: {list(chunk_dict)}; "
    f"still ambiguous titles: {[chapter[1] for chapter in unresolved]}"
)

Found 2 matches for (1) RH-3CH-Sxx
Found 2 matches for 6 Safety
Found finally 1 matches for (1) RH-3CH-Sxx
Found finally 1 matches for 6 Safety


In [16]:
# Chunks not assigned to any TOC entry; empty when every chunk was matched.
chunk_dict

{}

In [12]:
# Preview the first TOC entries; matched entries now end with their chapter text.
toc[:4]

[[1, '1 General configuration', 14, '1 General configuration\n'],
 [2,
  '1.1 Structural equipment',
  14,
  '1.1 Structural equipment\n\nStructural equipment consists of the following types.\n'],
 [3,
  '1.1.1 Standard structural equipment',
  14,
  '1.1.1 Standard structural equipment\n\nThe following items are enclosed as a standard.\n(1) Robot arm\n(2) Controller\n(3) Machine cable\n(4) Robot arm installation bolts\n(5) Safety manual, CD-ROM (Instruction manual)\n(6) Guarantee card\n'],
 [3,
  '1.1.2 Special specifications',
  14,
  '1.1.2 Special specifications\n\nFor the special specifications, some standard configuration equipment and specifications have to be changed\nbefore factory shipping. Confirm the delivery date and specify the special specifications at the order.\n']]

In [17]:
import json
# Serialize the TOC entries (including any chapter text appended to them) as
# an indented JSON string.
json_file=json.dumps(toc,indent=2)

In [21]:
# Exporting file data to output file for storage
# Plain write mode is enough because nothing is read back from the file. The
# encoding is fixed so the result does not depend on the platform default.
with open(output_filename, mode='w', encoding='utf-8') as f_out:
    f_out.write(json_file)