# pdf split

In [1]:
import os
from pikepdf import Pdf

cameramodel = "x-e4"
pdf_path = f"../data/raw_data/{cameramodel}/{cameramodel}.pdf"

basename = os.path.basename(pdf_path)
splitname = os.path.splitext(basename)[0]

raw_dir = "../data/raw_data"
output_dir = os.path.join(raw_dir, splitname)
detail_output_dir = os.path.join(output_dir, f"{splitname}_split")

if not os.path.exists(detail_output_dir):
    os.makedirs(detail_output_dir)

pdf = Pdf.open(pdf_path)
for n, page in enumerate(pdf.pages):
    new_pdf = Pdf.new()
    new_pdf.pages.append(page)

    output_filename = os.path.join(detail_output_dir, splitname+f"_page{n+1}.pdf")

    new_pdf.save(output_filename)
    print(f"[+] File: {output_filename} saved.")

[+] File: ../data/raw_data/x-e4/x-e4_split/x-e4_page1.pdf saved.
[+] File: ../data/raw_data/x-e4/x-e4_split/x-e4_page2.pdf saved.
[+] File: ../data/raw_data/x-e4/x-e4_split/x-e4_page3.pdf saved.
[+] File: ../data/raw_data/x-e4/x-e4_split/x-e4_page4.pdf saved.
[+] File: ../data/raw_data/x-e4/x-e4_split/x-e4_page5.pdf saved.
[+] File: ../data/raw_data/x-e4/x-e4_split/x-e4_page6.pdf saved.
[+] File: ../data/raw_data/x-e4/x-e4_split/x-e4_page7.pdf saved.
[+] File: ../data/raw_data/x-e4/x-e4_split/x-e4_page8.pdf saved.
[+] File: ../data/raw_data/x-e4/x-e4_split/x-e4_page9.pdf saved.
[+] File: ../data/raw_data/x-e4/x-e4_split/x-e4_page10.pdf saved.
[+] File: ../data/raw_data/x-e4/x-e4_split/x-e4_page11.pdf saved.
[+] File: ../data/raw_data/x-e4/x-e4_split/x-e4_page12.pdf saved.
[+] File: ../data/raw_data/x-e4/x-e4_split/x-e4_page13.pdf saved.
[+] File: ../data/raw_data/x-e4/x-e4_split/x-e4_page14.pdf saved.
[+] File: ../data/raw_data/x-e4/x-e4_split/x-e4_page15.pdf saved.
[+] File: ../data/r

# Llama Parase Multimodal

In [2]:
import os
import json
from dotenv import load_dotenv
from llama_parse import LlamaParse
from glob import glob

In [3]:
load_dotenv(dotenv_path="../.env", override=True)

Python-dotenv could not parse statement starting at line 2


True

In [4]:
parsing_instruction='''You are a highly proficient language model with advanced multimodal parsing capabilities. Your task is to process and extract content from a PDF document that serves as a camera manual.

**Specific Requirements:**

1. **Text Extraction:**  
   Extract all textual content from the document in a structured Markdown format. Preserve the hierarchy and formatting, including headings, subheadings, bullet points, numbered lists, and paragraphs.

2. **Image Identification and Inclusion:**  
   - Identify any images present in the document.  
   - For each image, insert a Markdown reference in the appropriate location using the syntax:  
     `![Image Description](ImagePlaceholder)`

3. **General Guidelines:**  
   - Maintain logical and clean formatting for Markdown output.
   - Avoid redundant or non-informative text (e.g., "Page number" or "Header/Footer").
   - Ensure extracted text and image references align with their original positions in the document.

**Example Output Structure:**  

# SHOOTING SETTING (Still Photography)
Adjust shooting options for still photography.

To display shooting settings, press **MENU/OK** in the photo shooting display and select the **SHOOTING SETTING** tab.

![Shooting Settings Menu](ImagePlaceholder)

> 💡 **Note:**
> - The options available vary with the shooting mode selected.

## SELF-TIMER
Choose a shutter release delay.

| Option   | Description                                                                                                         |
|----------|---------------------------------------------------------------------------------------------------------------------|
| 2 SEC | The shutter is released two seconds after the shutter button is pressed. Use to reduce blur caused by the camera moving when the shutter button is pressed. The self-timer lamp blinks as the timer counts down.  |
| 10 SEC| The shutter is released ten seconds after the shutter button is pressed. Use for photographs in which you wish to appear yourself. The self-timer lamp blinks immediately before the picture is taken. |
| OFF      | Self-timer off.                                                                                                    |

If an option other than **OFF** is selected, the timer will start when the shutter button is pressed all the way down. The display shows the number of seconds remaining until the shutter is released. To stop the timer before the picture is taken, press **DISP/BACK**.

![Timer Display](ImagePlaceholder)

> 💡 **Note:**
> - Stand behind the camera when using the shutter button. Standing in front of the lens can interfere with focus and exposure.
> - The self-timer turns off automatically when the camera is turned off.'''

parser = LlamaParse(
    api_key = os.getenv("LLAMA_CLOUD_API_KEY"),
    result_type="markdown",
    use_vendor_multimodal_model=True,
    vendor_multimodal_model_name="openai-gpt4o",
    vendor_multimodal_api_key=os.getenv("OPENAI_API_KEY"),
    parsing_instruction = parsing_instruction,
    invalidate_cache=True
)

In [5]:
import nest_asyncio

nest_asyncio.apply()

In [6]:
camera_model = "x-e4"
splited_pdf_path = f"../data/raw_data/{camera_model}/{camera_model}_split/*.pdf"
pdf_list = glob(splited_pdf_path)

json_dir = "../data/json"
detail_json_dir = os.path.join(json_dir, camera_model, "LlamaParseMultimodal", "init_result")
if not os.path.exists(detail_json_dir):
    os.makedirs(detail_json_dir)

for pdf in pdf_list:
    json_objs = parser.get_json_result(pdf)

    file_path, file_name = os.path.split(pdf)
    name = os.path.splitext(file_name)[0]
    json_path = os.path.join(detail_json_dir, name+".json")
    
    with open(json_path, "w", encoding="utf-8") as file:
        json.dump(json_objs[0], file, ensure_ascii=False, indent=4)

Started parsing the file under job_id 6391c917-5431-49f6-a9cb-9aa6e7ed847a
Started parsing the file under job_id f1dd721e-3a06-4a5b-accc-f53affafd330
Started parsing the file under job_id 8d03821d-6c3b-4972-995a-98fb5d99e870
Started parsing the file under job_id 92ceae71-827a-4c62-a47e-728cac518b3e
Started parsing the file under job_id 88230244-4998-45c6-85ae-d49bcf16777f
Started parsing the file under job_id b63a6ce6-69da-4e12-96a4-fb77c2258be5
Started parsing the file under job_id 3acc3896-fdbb-4a75-88ac-901d80b492f9
Started parsing the file under job_id 4fffc6c4-346f-4072-8ace-cf3ccb9ffd70
Started parsing the file under job_id ffb59caf-7dc3-44fa-935f-545a2e15e1ea
Started parsing the file under job_id dd4eb025-a13e-4a61-8489-47fcba9f56f5
Started parsing the file under job_id 23dafe9d-56bc-441a-b922-7ac4d42ef9d4
Started parsing the file under job_id a1d28ae9-623d-4aaf-a97e-572a2979cb7a
Started parsing the file under job_id 8a0f6a40-b771-4067-bfb2-d20c96fe86ca
Started parsing the file 

IndexError: list index out of range

In [25]:
# error 
# ---------------------------------------------------------------------------
# IndexError                                Traceback (most recent call last)
# Cell In[6], line 18
#      15 json_path = os.path.join(detail_json_dir, name+".json")
#      17 with open(json_path, "w", encoding="utf-8") as file:
# ---> 18     json.dump(json_objs[0], file, ensure_ascii=False, indent=4)

# IndexError: list index out of range
from glob import glob

splited_pdf_files = glob("../data/raw_data/x-e4/x-e4_split/*.pdf")
json_files = glob("../data/json/x-e4/LlamaParseMultimodal/init_result/*.json")

splited_pdf_len = len(splited_pdf_files)
json_len = len(json_files)
print(splited_pdf_len, json_len)

json_name = []
for i in json_files:
    json_name.append(os.path.splitext(os.path.basename(i))[0])

pdf_name = []
for i in splited_pdf_files:
    pdf_name.append(os.path.splitext(os.path.basename(i))[0])

missing_page = [item for item in pdf_name if item not in json_name]
print(missing_page)
print(len(missing_page))


360 312
['x-e4_page261', 'x-e4_page119', 'x-e4_page282', 'x-e4_page313', 'x-e4_page325', 'x-e4_page266', 'x-e4_page106', 'x-e4_page49', 'x-e4_page105', 'x-e4_page75', 'x-e4_page143', 'x-e4_page352', 'x-e4_page249', 'x-e4_page256', 'x-e4_page124', 'x-e4_page304', 'x-e4_page329', 'x-e4_page232', 'x-e4_page72', 'x-e4_page109', 'x-e4_page198', 'x-e4_page246', 'x-e4_page162', 'x-e4_page26', 'x-e4_page317', 'x-e4_page142', 'x-e4_page251', 'x-e4_page214', 'x-e4_page206', 'x-e4_page185', 'x-e4_page324', 'x-e4_page330', 'x-e4_page71', 'x-e4_page320', 'x-e4_page45', 'x-e4_page181', 'x-e4_page167', 'x-e4_page70', 'x-e4_page318', 'x-e4_page74', 'x-e4_page220', 'x-e4_page104', 'x-e4_page306', 'x-e4_page243', 'x-e4_page153', 'x-e4_page111', 'x-e4_page148', 'x-e4_page27']
48


In [28]:
camera_model = "x-e4"
splited_pdf_path = f"../data/raw_data/{camera_model}/{camera_model}_split/*.pdf"
pdf_list = glob(splited_pdf_path)

json_dir = "../data/json"
detail_json_dir = os.path.join(json_dir, camera_model, "LlamaParseMultimodal", "init_result")
if not os.path.exists(detail_json_dir):
    os.makedirs(detail_json_dir)

missing_pdf_list = []
for file_name in missing_page:
    missing_pdf_list.append(os.path.join("../data/raw_data/x-e4/x-e4_split", f"{file_name}.pdf"))

for pdf in missing_pdf_list:
    json_objs = parser.get_json_result(pdf)

    file_path, file_name = os.path.split(pdf)
    name = os.path.splitext(file_name)[0]
    json_path = os.path.join(detail_json_dir, name+".json")
    
    with open(json_path, "w", encoding="utf-8") as file:
        json.dump(json_objs[0], file, ensure_ascii=False, indent=4)

Started parsing the file under job_id 30aa337e-7376-490d-a766-34659494e8cb
Started parsing the file under job_id a63fb4b5-b54f-442f-bc3e-618f18435494
Started parsing the file under job_id c32e9083-772f-471f-9896-e74300035aec
Started parsing the file under job_id ebc7f7a6-284f-414f-a400-de04f2a4676c
Started parsing the file under job_id 2761b174-8c91-407c-aae5-f92fb2aaed05
Started parsing the file under job_id def3119c-70cb-44bd-b94b-7d955ebb858b
Started parsing the file under job_id d7b4e4d0-f90c-4452-ae0f-89a3107aeb7e
Started parsing the file under job_id 995663a5-2f09-4c12-bb17-a16d2f0ebd0a
Started parsing the file under job_id 27c71c9e-7b29-49b7-a4bc-7ca5d38784f0
Started parsing the file under job_id 856f4cb2-af26-4ee6-a248-bb63586ed983
Started parsing the file under job_id bdadb47b-3392-4233-b236-542b8c54c32a
Started parsing the file under job_id 1ffa3f15-6409-45f2-b14c-2e4efb0f20b3
Started parsing the file under job_id ff4c2909-c380-4125-9a14-63ccc0179f0e
Started parsing the file 

In [29]:
from glob import glob

splited_pdf_files = glob("../data/raw_data/x-e4/x-e4_split/*.pdf")
json_files = glob("../data/json/x-e4/LlamaParseMultimodal/init_result/*.json")

splited_pdf_len = len(splited_pdf_files)
json_len = len(json_files)
print(splited_pdf_len, json_len)

json_name = []
for i in json_files:
    json_name.append(os.path.splitext(os.path.basename(i))[0])

pdf_name = []
for i in splited_pdf_files:
    pdf_name.append(os.path.splitext(os.path.basename(i))[0])

missing_page = [item for item in pdf_name if item not in json_name]
print(missing_page)
print(len(missing_page))

360 360
[]
0


# Modify Json
- parsing_result
- metadata: {
    "model":,
    "page":
    }

In [1]:
import os
import re
import json
from glob import glob

from CameraDocument import CameraDocument

In [2]:
def extract_number(file_path):
    match = re.search(r'(\d+)\.json$', file_path)
    return int(match.group(1)) if match else None

In [3]:
camera_model = "x-e4"
json_dir = "./data/json"
detail_json_dir = os.path.join(json_dir, camera_model, "LlamaParseMultimodal", "init_result")
init_json_path = os.path.join(detail_json_dir, "*.json")

json_list = glob(init_json_path)
json_list = sorted(json_list, key=extract_number)

In [5]:
output_dir = os.path.join(json_dir, camera_model, "LlamaParseMultimodal", "add_metadata")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

for json_path in json_list:
    doc = CameraDocument()

    page_num = extract_number(json_path)
    if page_num >= 25:
        real_page_num = page_num - 24
        doc.metadata['page'] = real_page_num
        basename = camera_model + f"_page{real_page_num}.json"
        output_path = os.path.join(output_dir, basename)
    else:
        doc.metadata['page'] = page_num
        basename = camera_model + f"_front_page{page_num}.json"
        front_output_dir = os.path.join(output_dir, "front_page")
        output_path = os.path.join(front_output_dir, basename)
        if not os.path.exists(front_output_dir):
            os.makedirs(front_output_dir)

    doc.metadata['model'] = camera_model

    with open(json_path, "r", encoding="utf-8") as file:
        json_data = json.load(file)
        doc.parsing_result = json_data["pages"][0]["md"]
        doc.save_json(output_path)


./data/json/x-e4/LlamaParseMultimodal/add_metadata/front_page/x-e4_front_page1.json complete
./data/json/x-e4/LlamaParseMultimodal/add_metadata/front_page/x-e4_front_page2.json complete
./data/json/x-e4/LlamaParseMultimodal/add_metadata/front_page/x-e4_front_page3.json complete
./data/json/x-e4/LlamaParseMultimodal/add_metadata/front_page/x-e4_front_page4.json complete
./data/json/x-e4/LlamaParseMultimodal/add_metadata/front_page/x-e4_front_page5.json complete
./data/json/x-e4/LlamaParseMultimodal/add_metadata/front_page/x-e4_front_page6.json complete
./data/json/x-e4/LlamaParseMultimodal/add_metadata/front_page/x-e4_front_page7.json complete
./data/json/x-e4/LlamaParseMultimodal/add_metadata/front_page/x-e4_front_page8.json complete
./data/json/x-e4/LlamaParseMultimodal/add_metadata/front_page/x-e4_front_page9.json complete
./data/json/x-e4/LlamaParseMultimodal/add_metadata/front_page/x-e4_front_page10.json complete
./data/json/x-e4/LlamaParseMultimodal/add_metadata/front_page/x-e4_fr

# Index page to .png

In [7]:
import os
import fitz

In [8]:
def extract_pages_as_images(pdf_path, output_folder, start_page, end_page, dpi=600):
    pdf_document = fitz.open(pdf_path)

    # Loop through the specified page range
    for page_number in range(start_page - 1, end_page):
        # Check if the page number is within bounds
        if page_number < 0 or page_number >= len(pdf_document):
            print(f"Page {page_number + 1} is out of range. Skipping.")
            continue

        # Load the page
        page = pdf_document[page_number]

        # Render the page to a pixmap with specified DPI
        pix = page.get_pixmap(dpi=dpi)

        # Define the output image path
        output_path = f"{output_folder}/{camera_model}_page{page_number + 1}.png"

        # Save the image
        pix.save(output_path)
        print(f"Saved page {page_number + 1} as {output_path}")

    # Close the PDF document
    pdf_document.close()

In [9]:
camera_model = 'x-e4'
image_dir = './data/image'
output_dir = os.path.join(image_dir, camera_model, "index_images")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
pdf_path = f"./data/raw_data/{camera_model}/{camera_model}.pdf"

In [10]:
extract_pages_as_images(pdf_path, output_dir, 11, 22, 600)

Saved page 11 as ./data/image/x-e4/index_images/x-e4_page11.png
Saved page 12 as ./data/image/x-e4/index_images/x-e4_page12.png
Saved page 13 as ./data/image/x-e4/index_images/x-e4_page13.png
Saved page 14 as ./data/image/x-e4/index_images/x-e4_page14.png
Saved page 15 as ./data/image/x-e4/index_images/x-e4_page15.png
Saved page 16 as ./data/image/x-e4/index_images/x-e4_page16.png
Saved page 17 as ./data/image/x-e4/index_images/x-e4_page17.png
Saved page 18 as ./data/image/x-e4/index_images/x-e4_page18.png
Saved page 19 as ./data/image/x-e4/index_images/x-e4_page19.png
Saved page 20 as ./data/image/x-e4/index_images/x-e4_page20.png
Saved page 21 as ./data/image/x-e4/index_images/x-e4_page21.png
Saved page 22 as ./data/image/x-e4/index_images/x-e4_page22.png


# Delete
- chapter cover page
- memo page

In [132]:
from CameraDocument import CameraDocument

camera_model = 'x-e4'
json_dir = "./data/json"
detail_json_dir = os.path.join(json_dir, camera_model, "LlamaParseMultimodal", "add_metadata", "front_page")
json_path = os.path.join(detail_json_dir, "*.json")

def extract_number(file_path):
    match = re.search(r'(\d+)\.json$', file_path)
    return int(match.group(1)) if match else None

json_list = glob(json_path)
json_list = sorted(json_list, key=extract_number)
json_list[:5]

documents = []
for i, path in enumerate(json_list):
    if "front" in path:
        document = CameraDocument()
        document.load_json(path)
        documents.append(document)

documents[:5]

[CameraDocument(metadata={'page': 1, 'model': 'x-e4'}, parsing_result="!Fujifilm X-E4 Owner's Manual Cover\n\n# FUJIFILM\n\n## DIGITAL CAMERA X-E4\n\nFF200002  \nOwner’s Manual\n\nBL00005102-202 EN", embedding_result=[]),
 CameraDocument(metadata={'page': 2, 'model': 'x-e4'}, parsing_result='# Introduction\n\nThank you for your purchase of this product. Be sure that you have read this manual and understood its contents before using the camera. Keep the manual where it will be read by all who use the product.\n\n## For the Latest Information\n\nThe latest versions of the manuals are available from:\n\nhttp://fujifilm-dsc.com/en/manual/\n\n!QR Code for Manual\n\nThe site can be accessed not only from your computer but also from smartphones and tablets. It also contains information on the software license.\n\nFor information on firmware updates, visit:  \nhttps://fujifilm-x.com/support/download/firmware/cameras/\n\n!Devices', embedding_result=[]),
 CameraDocument(metadata={'page': 3, 'mod

In [133]:
for document in documents:
    if document.metadata['page'] == 22:
        print(document.parsing_result)


# Technical Notes

## Accessories from Fujifilm
- Page 298

## Software for Use with Your Camera
- FUJIFILM Camera Remote .................................................. 301
- FUJIFILM PC AutoSave ....................................................... 301
- RAW FILE CONVERTER EX powered by SILKYPIX .......... 301
- Capture One Express Fujifilm ........................................... 301
- Capture One Pro Fujifilm .................................................. 302
- FUJIFILM X Acquire .......................................................... 302
- FUJIFILM X RAW STUDIO .................................................. 302
- FUJIFILM X Webcam ......................................................... 302

## For Your Safety
- Page 303

## Product Care
- Page 312

## Cleaning the Image Sensor
- Page 313

## Firmware Updates
- Checking the Firmware Version ...................................... 314

## Troubleshooting
- Page 315

- Page 324

## Memory Card Capacity
- Page 328

In [187]:
from CameraDocument import CameraDocument
import os
from glob import glob
import re

camera_model = 'x-e4'
json_dir = "./data/json"
detail_json_dir = os.path.join(json_dir, camera_model, "LlamaParseMultimodal", "add_metadata")
json_path = os.path.join(detail_json_dir, "*.json")

def extract_number(file_path):
    match = re.search(r'(\d+)\.json$', file_path)
    return int(match.group(1)) if match else None

json_list = glob(json_path)
json_list = sorted(json_list, key=extract_number)
json_list[:5]

documents = []
for i, path in enumerate(json_list):
    document = CameraDocument()
    document.load_json(path)
    documents.append(document)

documents[:5]

[CameraDocument(metadata={'page': 1, 'model': 'x-e4'}, parsing_result='!Before You Begin', embedding_result=[]),
 CameraDocument(metadata={'page': 2, 'model': 'x-e4'}, parsing_result='\n# Parts of the Camera\n\nThe parts of the camera are listed below.\n\n!Camera Diagram\n\n1. **Shutter button** .................................................. 44\n2. **Exposure compensation dial** .................. 4, 80\n3. **Q (quick menu) button** ............................. 247\n4. **Shutter speed dial** .................................... 4, 56, 59, 63, 65\n5. **Hot shoe** ..................................................... 267\n6. **Microphone** ............................................... 50, 175\n7. **Strap eyelet** ................................................. 28\n8. **Connector cover**\n9. **Lens signal contacts**\n10. **Lens release button** .................................. 29\n11. **AF-assist illuminator** ................................. 131\n    - Self-timer lamp .......

In [188]:
x_e4_contents = {
    "Before You Begin": {
        "page_range": [
            1,
            27
        ],
        "sections": {
            "Parts of the Camera": {
                "page_range": [
                    2,
                    10
                ],
                "subsections": {
                    "The Shutter Speed Dial": [4, 5],
                    "The Exposure Compensation Dial": [4, 5],
                    "The Focus Stick (Focus Lever)": [5, 6],
                    "The DRIVE/DELETE Button": [5, 6],
                    "The Command Dial": [6, 7],
                    "The Indicator Lamp": [7, 8],
                    "The LCD Monitor": [8, 9],
                    "Focusing the Viewfinder": [9, 10]
                }
            },
            "Camera Displays": {
                "page_range": [
                    10,
                    19
                ],
                "subsections": {
                    "The Electronic Viewfinder": [10, 12],
                    "The LCD Monitor": [12, 14],
                    "Adjusting Display Brightness": [14, 16],
                    "Display Rotation": [14, 16],
                    "Choosing a Display Mode": [14, 16],
                    "The DISP/BACK Button": [16, 17],
                    "Customizing the Standard Display": [17, 19]
                }
            },
            "Using the Menus": {
                "page_range": [
                    19,
                    21
                ],
                "subsections": {
                    "The Menus": [19, 20],
                    "Selecting a Menu Tab": [20, 21]
                }
            },
            "Touch Screen Mode": {
                "page_range": [
                    21,
                    27
                ],
                "subsections": {
                    "Shooting Touch Controls": [21, 25],
                    "Playback Touch Controls": [25, 27]
                }
            }
        }
    },
    "First Steps": {
        "page_range": [
            27,
            41
        ],
        "sections": {
            "Attaching the Strap": {
                "page_range": [
                    28,
                    29
                ],
            },
            "Attaching a Lens": {
                "page_range": [
                    29,
                    30
                ],
            },
            "Inserting the Battery and a Memory Card": {
                "page_range": [
                    30,
                    33
                ],
                "subsections": {
                    "Compatible Memory Cards": [32, 33]
                }
            },
            "Charging the Battery": {
                "page_range": [
                    33,
                    35
                ],
            },
            "Turning the Camera On and Off": {
                "page_range": [
                    35,
                    36
                ],
            },
            "Checking the Battery Level": {
                "page_range": [
                    36,
                    37
                ],
            },
            "Basic Setup": {
                "page_range": [
                    37,
                    41
                ],
                "subsections": {
                    "Choosing a Different Language": [39, 41],
                    "Changing the Time and Date": [39, 41]
                }
            }
        }
    },
    "Basic Photography and Playback": {
        "page_range": [
            41,
            47
        ],
        "sections": {
            "Taking Photographs (Mode P)": {
                "page_range": [42, 45],
            },
            "Viewing Pictures": {
                "page_range": [45, 46],
            },
            "Deleting Pictures": {
                "page_range": [46, 47],
            }
        }
    },
    "Movie Recording and Playback": {
        "page_range": [
            47,
            55
        ],
        "sections": {
            "Recording Movies": {
                "page_range": [48, 52],
                "subsections": {
                    "Recording Movies": [48, 51],
                    "Adjusting Movie Settings": [51, 52],
                },
            },
            "Viewing Movies": {
                "page_range": [52, 55],
            }
        }
    },
    "Taking Photographs": {
        "page_range": [
            55,
            103
        ],
        "sections": {
            "P, S, A, and M Modes": {
                "page_range": [56, 68],
                "subsections": {
                    "Mode P: Program AE": [56, 59],
                    "Mode S: Shutter-Priority AE": [59, 63],
                    "Mode A: Aperture-Priority AE": [63, 65],
                    "Mode M: Manual Exposure": [65, 67],
                    "Front Command Dial Roles by Mode": [67, 68],
                },
            },
            "Autofocus": {
                "page_range": [68, 77],
                "subsections": {
                    "Focus Mode": [69, 71],
                    "Autofocus Options (AF Mode)": [71, 73],
                    "Focus-Point Selection": [73, 77],
                },
            },
            "Manual Focus": {
                "page_range": [77, 80],
                "subsections": {
                    "Checking Focus": [78, 80],
                },
            },
            "Exposure Compensation": {
                "page_range": [80, 82],
                "subsections": {
                    "C (Custom)": [81, 82],
                },
            },
            "Sensitivity": {
                "page_range": [82, 84],
                "subsections": {
                    "Auto Sensitivity (A)": [83, 84],
                },
            },
            "Metering": {
                "page_range": [84, 85],
            },
            "Focus/Exposure Lock": {
                "page_range": [85, 87],
                "subsections": {
                    "Other Controls": [86, 87],
                },
            },
            "Bracketing": {
                "page_range": [87, 92],
                "subsections": {
                    "ISO BKT": [88, 89],
                    "White Balance BKT": [88, 89],
                    "BKT Bracketing": [89, 92],
                },
            },
            "Continuous Shooting (Burst Mode)": {
                "page_range": [92, 93],
            },
            "HDR": {
                "page_range": [93, 95],
            },
            "Panoramas": {
                "page_range": [95, 98],
            },
            "Multiple Exposures": {
                "page_range": [98, 100],
            },
            "Filters": {
                "page_range": [100, 101],
            },
            "Self-Portraits (Selfies)": {
                "page_range": [101, 103],
            }
        }
    },
    "The Shooting Menus": {
        "page_range": [
            103,
            181
        ],
        "sections": {
            "IMAGE QUALITY SETTING (Still Photography)": {
                "page_range": [104, 125],
                "subsections": {
                    "IMAGE SIZE": [104, 105],
                    "IMAGE QUALITY": [105, 106],
                    "RAW RECORDING": [106, 108],
                    "FILM SIMULATION": [106, 108],
                    "MONOCHROMATIC COLOR": [108, 109],
                    "GRAIN EFFECT": [108, 109],
                    "COLOR CHROME EFFECT": [108, 109],
                    "COLOR CHROME FX BLUE": [109, 113],
                    "WHITE BALANCE": [109, 113],
                    "DYNAMIC RANGE": [113, 114],
                    "D RANGE PRIORITY": [114, 115],
                    "TONE CURVE": [114, 115],
                    "COLOR": [115, 116],
                    "SHARPNESS": [115, 116],
                    "HIGH ISO NR": [115, 116],
                    "CLARITY": [116, 117],
                    "LONG EXPOSURE NR": [116, 117],
                    "LENS MODULATION OPTIMIZER": [116, 117],
                    "COLOR SPACE": [117, 118],
                    "PIXEL MAPPING": [117, 118],
                    "SELECT CUSTOM SETTING": [118, 122],
                    "EDIT/SAVE CUSTOM SETTING": [118, 122],
                    "AUTO UPDATE CUSTOM SETTING": [122, 123],
                    "MOUNT ADAPTER SETTING": [123, 125]
                }
            },
            "AF/MF SETTING (Still Photography)": {
                "page_range": [125, 140],
                "subsections": {
                    "FOCUS AREA": [125, 126],
                    "FOCUS MODE": [125, 126],
                    "AF MODE": [125, 126],
                    "AF-C CUSTOM SETTINGS": [126, 130],
                    "STORE AF MODE BY ORIENTATION": [130, 131],
                    "AF POINT DISPLAY": [130, 131],
                    "NUMBER OF FOCUS POINTS": [130, 131],
                    "PRE-AF": [131, 132],
                    "AF ILLUMINATOR": [131, 132],
                    "FACE/EYE DETECTION SETTING": [132, 134],
                    "AF+MF": [134, 135],
                    "MF ASSIST": [134, 135],
                    "FOCUS CHECK": [135, 136],
                    "INTERLOCK SPOT AE & FOCUS AREA": [136, 137],
                    "INSTANT AF SETTING": [136, 137],
                    "DEPTH-OF-FIELD SCALE": [136, 137],
                    "RELEASE/FOCUS PRIORITY": [137, 138],
                    "AF RANGE LIMITER": [137, 138],
                    "TOUCH SCREEN MODE": [138, 140]
                }
            },
            "SHOOTING SETTING (Still Photography)": {
                "page_range": [140, 149],
                "subsections": {
                    "SPORTS FINDER MODE": [140, 141],
                    "PRE-SHOT ES": [141, 142],
                    "SELF-TIMER": [141, 142],
                    "SAVE SELF-TIMER SETTING": [142, 143],
                    "SELF-TIMER LAMP": [142, 143],
                    "INTERVAL TIMER SHOOTING": [143, 144],
                    "INTERVAL TIMER SHOOTING EXPOSURE SMOOTHING": [144, 145],
                    "AE BKT SETTING": [144, 145],
                    "FILM SIMULATION BKT": [145, 146],
                    "FOCUS BKT SETTING": [145, 146],
                    "MULTIPLE EXPOSURE CTRL": [145, 146],
                    "PHOTOMETRY": [145, 146],
                    "SHUTTER TYPE": [146, 147],
                    "FLICKER REDUCTION": [147, 148],
                    "IS MODE": [147, 148],
                    "ISO": [148, 149],
                    "WIRELESS COMMUNICATION": [148, 149]
                }
            },
            "FLASH SETTING (Still Photography)": {
                "page_range": [149, 152],
                "subsections": {
                    "FLASH FUNCTION SETTING": [149, 150],
                    "RED EYE REMOVAL": [149, 150],
                    "TTL-LOCK MODE": [150, 151],
                    "LED LIGHT SETTING": [150, 151],
                    "COMMANDER SETTING": [151, 152],
                    "CH SETTING": [151, 152]
                }
            },
            "MOVIE SETTING (Still Photography)": {
                "page_range": [152, 155],
                "subsections": {
                    "MOVIE MODE": [152, 153],
                    "FILE FORMAT": [152, 153],
                    "FULL HD HIGH SPEED REC": [152, 153],
                    "IS MODE": [152, 153],
                    "AUDIO SETTING": [153, 154],
                    "MIC/REMOTE RELEASE": [154, 155]
                }
            },
            "MOVIE SETTING (Movie Recording)": {
                "page_range": [155, 164],
                "subsections": {
                    "MOVIE MODE": [155, 156],
                    "FILE FORMAT": [156, 157],
                    "FULL HD HIGH SPEED REC": [157, 158],
                    "FIX MOVIE CROP MAGNIFICATION": [158, 159],
                    "F-Log RECORDING": [158, 159],
                    "4K MOVIE OUTPUT": [159, 160],
                    "FULL HD MOVIE OUTPUT": [159, 160],
                    "HDMI OUTPUT INFO DISPLAY": [160, 161],
                    "4K HDMI STANDBY QUALITY": [160, 161],
                    "HDMI REC CONTROL": [160, 161],
                    "PHOTOMETRY": [160, 162],
                    "IS MODE": [161, 162],
                    "ISO": [161, 162],
                    "ZEBRA SETTING": [162, 163],
                    "ZEBRA LEVEL": [162, 163],
                    "MOVIE OPTIMIZED CONTROL": [162, 163],
                    "TALLY LIGHT": [163, 164],
                    "WIRELESS COMMUNICATION": [163, 164]
                }
            },
            "IMAGE QUALITY SETTING (Movie Recording)": {
                "page_range": [164, 168],
                "subsections": {
                    "FILM SIMULATION": [164, 165],
                    "MONOCHROMATIC COLOR": [164, 165],
                    "WHITE BALANCE": [164, 165],
                    "DYNAMIC RANGE": [165, 166],
                    "TONE CURVE": [165, 166],
                    "COLOR": [165, 166],
                    "SHARPNESS": [166, 167],
                    "HIGH ISO NR": [166, 167],
                    "INTERFRAME NR": [166, 167],
                    "PERIPHERAL LIGHT CORRECTION": [167, 168],
                    "MOUNT ADAPTER SETTING": [167, 168]
                }
            },
            "AF/MF SETTING (Movie Recording)": {
                "page_range": [168, 175],
                "subsections": {
                    "FOCUS AREA": [168, 169],
                    "FOCUS MODE": [168, 169],
                    "AF MODE": [168, 169],
                    "AF-C CUSTOM SETTING": [169, 170],
                    "AF ILLUMINATOR": [169, 170],
                    "FACE/EYE DETECTION SETTING": [170, 171],
                    "MF ASSIST": [170, 171],
                    "FOCUS CHECK": [171, 172],
                    "INSTANT AF SETTING": [171, 172],
                    "DEPTH-OF-FIELD SCALE": [171, 172],
                    "AF RANGE LIMITER": [171, 172],
                    "TOUCH SCREEN MODE": [172, 174],
                    "FOCUS CHECK LOCK": [174, 175]
                }
            },
            "AUDIO SETTING (Movie Recording)": {
                "page_range": [175, 178],
                "subsections": {
                    "INTERNAL MIC LEVEL ADJUSTMENT": [175, 176],
                    "EXTERNAL MIC LEVEL ADJUSTMENT": [175, 176],
                    "MIC JACK SETTING": [176, 177],
                    "MIC LEVEL LIMITER": [176, 177],
                    "WIND FILTER": [176, 177],
                    "LOW CUT FILTER": [176, 177],
                    "HEADPHONES VOLUME": [177, 178],
                    "MIC/REMOTE RELEASE": [177, 178]
                }
            },
            "TIME CODE SETTING (Movie Recording)": {
                "page_range": [178, 181],
                "subsections": {
                    "TIME CODE DISPLAY": [178, 179],
                    "START TIME SETTING": [178, 179],
                    "COUNT UP SETTING": [178, 179],
                    "DROP FRAME": [179, 181],
                    "HDMI TIME CODE OUTPUT": [179, 181]
                }
            },                        
        }
    },
    "Playback and the Playback Menu": {
        "page_range": [
            181,
            207
        ],
        "sections": {
            "The Playback Display": {
                "page_range": [182, 185],
                "subsections": {
                    "The DISP/BACK Button": [183, 185]
                }
            },
            "Viewing Pictures": {
                "page_range": [185, 187],
                "subsections": {
                    "Playback Zoom": [186, 187],
                    "Multi-Frame Playback": [186, 187],
                    "Viewing Other Pictures": [186, 187]
                }
            },
            "The Playback Menu": {
                "page_range": [187, 207],
                "subsections": {
                    "RAW CONVERSION": [187, 190],
                    "ERASE": [190, 192],
                    "CROP": [192, 193],
                    "RESIZE": [193, 194],
                    "PROTECT": [194, 195],
                    "IMAGE ROTATE": [195, 196],
                    "RED EYE REMOVAL": [196, 197],
                    "VOICE MEMO SETTING": [197, 198],
                    "RATING": [198, 199],
                    "IMAGE TRANSFER ORDER": [199, 200],
                    "WIRELESS COMMUNICATION": [200, 201],
                    "SLIDE SHOW": [200, 201],
                    "PHOTOBOOK ASSIST": [201, 202],
                    "PC AUTO SAVE": [202, 203],
                    "PRINT ORDER (DPOF)": [203, 204],
                    "instax PRINTER PRINT": [204, 205],
                    "DISP ASPECT": [205, 207]
                }
            }
        }
    },
    "The Setup Menus": {
        "page_range": [
            207,
            243
        ],
        "sections": {
            "USER SETTING": {
                "page_range": [208, 213],
                "subsections": {
                    "FORMAT": [208, 209],
                    "DATE/TIME": [209, 210],
                    "TIME DIFFERENCE": [209, 210],
                    "LANG.": [210, 211],
                    "MY MENU SETTING": [210, 211],
                    "SENSOR CLEANING": [211, 212],
                    "SOUND & FLASH": [211, 212],
                    "RESET": [212, 213],
                    "REGULATORY": [212, 213]
                }
            },
            "SOUND SETTING": {
                "page_range": [213, 215],
                "subsections": {
                    "AF BEEP VOL.": [213, 214],
                    "SELF-TIMER BEEP VOL.": [213, 214],
                    "OPERATION VOL.": [214, 215],
                    "SHUTTER VOLUME": [214, 215],
                    "SHUTTER SOUND": [214, 215],
                    "PLAYBACK VOLUME": [214, 215]
                }
            },
            "SCREEN SETTING": {
                "page_range": [215, 224],
                "subsections": {
                    "VIEW MODE SETTING": [215, 216],
                    "EVF BRIGHTNESS": [215, 216],
                    "EVF COLOR": [215, 216],
                    "EVF COLOR ADJUSTMENT": [216, 217],
                    "LCD BRIGHTNESS": [216, 217],
                    "LCD COLOR": [216, 217],
                    "LCD COLOR ADJUSTMENT": [216, 217],
                    "IMAGE DISP.": [217, 218],
                    "AUTOROTATE DISPLAYS": [217, 218],
                    "PREVIEW EXP./WB IN MANUAL MODE": [218, 219],
                    "NATURAL LIVE VIEW": [218, 219],
                    "F-Log VIEW ASSIST": [219, 220],
                    "FRAMING GUIDELINE": [219, 220],
                    "AUTOROTATE PB": [220, 221],
                    "FOCUS SCALE UNITS": [220, 221],
                    "APERTURE UNIT FOR CINEMA LENS": [220, 221],
                    "DISP. CUSTOM SETTING": [220, 221],
                    "LARGE INDICATORS MODE(EVF)": [221, 222],
                    "LARGE INDICATORS MODE(LCD)": [221, 222],
                    "LARGE INDICATORS DISP. SETTING": [222, 223],
                    "INFORMATION CONTRAST ADJ.": [222, 223],
                    "Q MENU BACKGROUND": [223, 224],
                }
            },
            "BUTTON/DIAL SETTING": {
                "page_range": [224, 232],
                "subsections": {
                    "FOCUS LEVER SETTING": [224, 225],
                    "EDIT/SAVE QUICK MENU": [225, 226],
                    "FUNCTION (Fn) SETTING": [225, 226],
                    "COMMAND DIAL SETTING": [226, 227],
                    "SHUTTER AF": [227, 228],
                    "SHUTTER AE": [227, 228],
                    "SHOOT WITHOUT LENS": [227, 228],
                    "SHOOT WITHOUT CARD": [228, 229],
                    "FOCUS RING": [228, 229],
                    "FOCUS RING OPERATION": [228, 229],
                    "AE/AF-LOCK MODE": [229, 230],
                    "AWB-LOCK MODE": [229, 230],
                    "APERTURE RING SETTING (A)": [229, 230],
                    "APERTURE SETTING": [230, 231],
                    "TOUCH SCREEN SETTING": [230, 231]
                }
            },
            "POWER MANAGEMENT": {
                "page_range": [232, 234],
                "subsections": {
                    "AUTO POWER OFF": [232, 233],
                    "PERFORMANCE": [233, 234],
                    "BOOST SETTING": [233, 234]
                }
            },
            "SAVE DATA SETTING": {
                "page_range": [234, 237],
                "subsections": {
                    "FRAME NO.": [234, 235],
                    "SAVE ORG IMAGE": [235, 236],
                    "EDIT FILE NAME": [235, 236],
                    "SELECT FOLDER": [236, 237],
                    "COPYRIGHT INFO": [236, 237]
                }
            },
            "CONNECTION SETTING": {
                "page_range": [237, 243],
                "subsections": {
                    "Bluetooth SETTINGS": [237, 238],
                    "PC AUTO SAVE": [238, 239],
                    "instax PRINTER CONNECTION SETTING": [239, 240],
                    "CONNECTION MODE": [240, 241],
                    "USB POWER SUPPLY SETTING": [241, 242],
                    "GENERAL SETTINGS": [242, 243],
                    "INFORMATION": [242, 243],
                    "RESET WIRELESS SETTING": [242, 243]
                }
            }                       
        }
    },
    "Shortcuts": {
        "page_range": [
            243,
            259
        ],
        "sections": {
            "Shortcut Options": {
                "page_range": [244, 245],
                "subsections": {}
            },
            "MY MENU": {
                "page_range": [245, 247],
                "subsections": {
                    "MY MENU SETTING": [245, 247]
                }
            },
            "The Quick Menu": {
                "page_range": [247, 253],
                "subsections": {
                    "The Quick Menu Display": [247, 249],
                    "Viewing and Changing Settings": [249, 250],
                    "Editing the Quick Menu": [250, 253]
                }
            },
            "Function Controls": {
                "page_range": [253, 259],
                "subsections": {
                    "The Function Buttons": [253, 256],
                    "Touch-Function Gestures": [256, 259]
                }
            }
        }
    },
    "Peripherals and Optional Accessories": {
        "page_range": [
            259,
            281
        ],
        "sections": {
            "Lenses": {
                "page_range": [260, 266],
                "subsections": {
                    "Lens Parts": [260, 261],
                    "Removing Lens Caps": [261, 262],
                    "Attaching Lens Hoods": [261, 262],
                    "Lenses with Aperture Rings": [262, 263],
                    "Lenses with No Aperture Rings": [263, 264],
                    "Lenses with O.I.S. Switches": [263, 264],
                    "Manual Focus Lenses": [264, 265],
                    "Lens Care": [265, 266]
                }
            },
            "External Flash Units": {
                "page_range": [266, 278],
                "subsections": {
                    "Using an External Flash": [267, 268],
                    "EF-X8": [268, 270],
                    "SYNC TERMINAL": [270, 271],
                    "SHOE MOUNT FLASH": [271, 274],
                    "COMMANDER(OPTICAL)": [274, 278]
                }
            },
            "Handgrips": {
                "page_range": [278, 279],
                "subsections": {}
            },
            "Thumb Rests": {
                "page_range": [279, 281],
                "subsections": {}
            }
        }
    },
    "Connections": {
        "page_range": [
            281,
            297
        ],
        "sections": {
            "HDMI Output": {
                "page_range": [282, 284],
                "subsections": {
                    "Connecting to HDMI Devices": [282, 283],
                    "Shooting": [283, 284],
                    "Playback": [283, 284]
                }
            },
            "Wireless Connections (Bluetooth®, Wireless LAN/Wi-Fi)": {
                "page_range": [284, 287],
                "subsections": {
                    "Smartphones and Tablets: FUJIFILM Camera Remote": [284, 286],
                    "Copying Pictures to a Computer: PC AutoSave": [286, 287]
                }
            },
            "Connecting via USB": {
                "page_range": [287, 295],
                "subsections": {
                    "Connecting to Computers": [287, 290],
                    "Copying Pictures to a Computer": [290, 291],
                    "Using Your Camera as a Webcam (FUJIFILM X Webcam)": [290, 291],
                    "Converting RAW Images: FUJIFILM X RAW STUDIO": [291, 292],
                    "Backing Up and Restoring Camera Settings (FUJIFILM X Acquire)": [291, 292],
                    "Connecting to Smartphones": [292, 295]
                }
            },
            "instax SHARE Printers": {
                "page_range": [295, 297],
                "subsections": {
                    "Establishing a Connection": [295, 296],
                    "Printing Pictures": [296, 297]
                }
            }
        }
    },
    "Technical Notes": {
        "page_range": [
            297,
            335
        ],
        "sections": {
            "Accessories from Fujifilm": {
                "page_range": [298, 301],
                "subsections": {}
            },
            "Software for Use with Your Camera": {
                "page_range": [301, 303],
                "subsections": {
                    "FUJIFILM Camera Remote": [301, 302],
                    "FUJIFILM PC AutoSave": [301, 302],
                    "RAW FILE CONVERTER EX powered by SILKYPIX": [301, 302],
                    "Capture One Express Fujifilm": [301, 302],
                    "Capture One Pro Fujifilm": [302, 303],
                    "FUJIFILM X Acquire": [302, 303],
                    "FUJIFILM X RAW STUDIO": [302, 303],
                    "FUJIFILM X Webcam": [302, 303]
                }
            },
            "For Your Safety": {
                "page_range": [303, 312],
                "subsections": {}
            },
            "Product Care": {
                "page_range": [312, 313],
                "subsections": {}
            },
            "Cleaning the Image Sensor": {
                "page_range": [313, 314],
                "subsections": {}
            },
            "Firmware Updates": {
                "page_range": [314, 315],
                "subsections": {
                    "Checking the Firmware Version": [314, 315]
                }
            },
            "Troubleshooting": {
                "page_range": [315, 324],
                "subsections": {}
            },
            "Warning Messages and Displays": {
                "page_range": [324, 328],
                "subsections": {}
            },
            "Memory Card Capacity": {
                "page_range": [328, 329],
                "subsections": {}
            },
            "Specifications": {
                "page_range": [329, 335],
                "subsections": {}
            }
        }
    }
}

In [189]:
chapter_dict = {"chapter":{}}
for chapter in x_e4_contents.keys():
    chapter_dict["chapter"][chapter] = x_e4_contents[chapter]["page_range"]

In [190]:
cover_page_list = []
for chapter in chapter_dict["chapter"]:
    cover_page_list.append(chapter_dict["chapter"][chapter][0])

cover_erase_documents = []
for document in documents:
    if document.metadata['page'] not in cover_page_list:
        cover_erase_documents.append(document)

In [191]:
print(cover_page_list)

[1, 27, 41, 47, 55, 103, 181, 207, 243, 259, 281, 297]


In [192]:
for doc in cover_erase_documents:
    if "MEMO" in doc.parsing_result:
        print(doc.metadata['page'])

19
26
40
54
102
180
197
206
258
280
324
325
335


In [193]:
memo_page_list = [26, 40, 54, 102, 180, 206, 258, 280, 335, 336]

memo_erase_documents = []
for document in cover_erase_documents:
    if document.metadata['page'] not in memo_page_list:
        memo_erase_documents.append(document)

# add metadata
- metadata: {
    "chapter":,
    "section":,
    "subsection":
}

In [194]:
memo_erase_documents

[CameraDocument(metadata={'page': 2, 'model': 'x-e4'}, parsing_result='\n# Parts of the Camera\n\nThe parts of the camera are listed below.\n\n!Camera Diagram\n\n1. **Shutter button** .................................................. 44\n2. **Exposure compensation dial** .................. 4, 80\n3. **Q (quick menu) button** ............................. 247\n4. **Shutter speed dial** .................................... 4, 56, 59, 63, 65\n5. **Hot shoe** ..................................................... 267\n6. **Microphone** ............................................... 50, 175\n7. **Strap eyelet** ................................................. 28\n8. **Connector cover**\n9. **Lens signal contacts**\n10. **Lens release button** .................................. 29\n11. **AF-assist illuminator** ................................. 131\n    - Self-timer lamp ........................................... 141\n    - Tally light .................................................... 

In [195]:
for document in memo_erase_documents:
    document.metadata['subsection'] = []

    for chapter in x_e4_contents.keys():
        chapter_page_range = x_e4_contents[chapter]["page_range"]
        if chapter_page_range[0] <= document.metadata['page'] < chapter_page_range[1]:
            document.metadata['chapter'] = chapter

        for section in x_e4_contents[chapter]["sections"].keys():
            section_page_range = x_e4_contents[chapter]["sections"][section]['page_range']
            if section_page_range[0] <= document.metadata['page'] < section_page_range[1]:
                document.metadata['section'] = section

            if "subsections" in x_e4_contents[chapter]["sections"][section]:
                for subsection in x_e4_contents[chapter]["sections"][section]["subsections"].keys():
                    subsection_page_range = x_e4_contents[chapter]["sections"][section]['subsections'][subsection]                  
                    if subsection_page_range[0] <= document.metadata['page'] < subsection_page_range[1]:
                        document.metadata['subsection'].append(subsection)
            else:
                pass

In [196]:
memo_erase_documents

[CameraDocument(metadata={'page': 2, 'model': 'x-e4', 'subsection': [], 'chapter': 'Before You Begin', 'section': 'Parts of the Camera'}, parsing_result='\n# Parts of the Camera\n\nThe parts of the camera are listed below.\n\n!Camera Diagram\n\n1. **Shutter button** .................................................. 44\n2. **Exposure compensation dial** .................. 4, 80\n3. **Q (quick menu) button** ............................. 247\n4. **Shutter speed dial** .................................... 4, 56, 59, 63, 65\n5. **Hot shoe** ..................................................... 267\n6. **Microphone** ............................................... 50, 175\n7. **Strap eyelet** ................................................. 28\n8. **Connector cover**\n9. **Lens signal contacts**\n10. **Lens release button** .................................. 29\n11. **AF-assist illuminator** ................................. 131\n    - Self-timer lamp .....................................

In [197]:
for document in memo_erase_documents:
    if not document.metadata['subsection']:
        print(document.metadata['page'])

2
3
28
29
30
31
33
34
35
36
37
38
42
43
44
45
46
52
53
68
77
80
82
84
85
87
92
93
94
95
96
97
98
99
100
101
182
185
231
244
266
278
279
298
299
300
303
304
305
306
307
308
309
310
311
312
313
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334


In [198]:
for document in memo_erase_documents:
    if not document.metadata['subsection']:
        document.metadata['subsection'].append(document.metadata['section'])

In [199]:
camera_model = "x-e4"
json_dir = "./data/json"

output_dir = os.path.join(json_dir, camera_model, "LlamaParseMultimodal", "processed_data")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

for doc in memo_erase_documents:
    output_path = os.path.join(output_dir, f"{camera_model}_page{doc.metadata['page']}.json")
    doc.save_json(output_path)

./data/json/x-e4/LlamaParseMultimodal/processed_data/x-e4_page2.json complete
./data/json/x-e4/LlamaParseMultimodal/processed_data/x-e4_page3.json complete
./data/json/x-e4/LlamaParseMultimodal/processed_data/x-e4_page4.json complete
./data/json/x-e4/LlamaParseMultimodal/processed_data/x-e4_page5.json complete
./data/json/x-e4/LlamaParseMultimodal/processed_data/x-e4_page6.json complete
./data/json/x-e4/LlamaParseMultimodal/processed_data/x-e4_page7.json complete
./data/json/x-e4/LlamaParseMultimodal/processed_data/x-e4_page8.json complete
./data/json/x-e4/LlamaParseMultimodal/processed_data/x-e4_page9.json complete
./data/json/x-e4/LlamaParseMultimodal/processed_data/x-e4_page10.json complete
./data/json/x-e4/LlamaParseMultimodal/processed_data/x-e4_page11.json complete
./data/json/x-e4/LlamaParseMultimodal/processed_data/x-e4_page12.json complete
./data/json/x-e4/LlamaParseMultimodal/processed_data/x-e4_page13.json complete
./data/json/x-e4/LlamaParseMultimodal/processed_data/x-e4_pa

# Image Extract

In [149]:
from glob import glob
import re
import os

import sys
sys.path.append('../') 
from CameraDocument import CameraDocument

camera_model = 'x-e4'
json_dir = "./data/json"
detail_json_dir = os.path.join(json_dir, camera_model, "LlamaParseMultimodal", "processed_data")
json_path = os.path.join(detail_json_dir, "*.json")

def extract_number(file_path):
    match = re.search(r'(\d+)\.json$', file_path)
    return int(match.group(1)) if match else None

json_list = glob(json_path)
json_list = sorted(json_list, key=extract_number)

documents = []
for i, path in enumerate(json_list):
    document = CameraDocument()
    document.load_json(path)
    documents.append(document)

documents[:5]

[CameraDocument(metadata={'page': 2, 'model': 'x-e4', 'chapter': 'Before You Begin', 'section': 'Parts of the Camera', 'subsection': 'Parts of the Camera'}, parsing_result='\n# Parts of the Camera\n\nThe parts of the camera are listed below.\n\n!Camera Diagram\n\n1. **Shutter button** .................................................. 44\n2. **Exposure compensation dial** .................. 4, 80\n3. **Q (quick menu) button** ............................. 247\n4. **Shutter speed dial** .................................... 4, 56, 59, 63, 65\n5. **Hot shoe** ..................................................... 267\n6. **Microphone** ............................................... 50, 175\n7. **Strap eyelet** ................................................. 28\n8. **Connector cover**\n9. **Lens signal contacts**\n10. **Lens release button** .................................. 29\n11. **AF-assist illuminator** ................................. 131\n    - Self-timer lamp ..................

In [150]:
import fitz

raw_data_dir = "./data/raw_data"
image_dir = "./data/image"
output_dir = os.path.join(image_dir, camera_model, "pdf_to_image")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

dpi = 600  # 원하는 DPI 값 (300, 600 등)
for document in documents:
    page_num = document.metadata['page']
    pdf_path = os.path.join(raw_data_dir, camera_model, f"{camera_model}_split", f"{camera_model}_page{page_num+24}.pdf")

    pdf_document = fitz.open(pdf_path)
    page = pdf_document.load_page(0)
    pix = page.get_pixmap(dpi=dpi)

    output_file = os.path.join(output_dir, f"{camera_model}_page{page_num}.png")
    pix.save(output_file)
    print(f"Saved: {output_file}")

    pdf_document.close()  

Saved: ./data/image/x-e4/pdf_to_image/x-e4_page2.png
Saved: ./data/image/x-e4/pdf_to_image/x-e4_page3.png
Saved: ./data/image/x-e4/pdf_to_image/x-e4_page4.png
Saved: ./data/image/x-e4/pdf_to_image/x-e4_page5.png
Saved: ./data/image/x-e4/pdf_to_image/x-e4_page6.png
Saved: ./data/image/x-e4/pdf_to_image/x-e4_page7.png
Saved: ./data/image/x-e4/pdf_to_image/x-e4_page8.png
Saved: ./data/image/x-e4/pdf_to_image/x-e4_page9.png
Saved: ./data/image/x-e4/pdf_to_image/x-e4_page10.png
Saved: ./data/image/x-e4/pdf_to_image/x-e4_page11.png
Saved: ./data/image/x-e4/pdf_to_image/x-e4_page12.png
Saved: ./data/image/x-e4/pdf_to_image/x-e4_page13.png
Saved: ./data/image/x-e4/pdf_to_image/x-e4_page14.png
Saved: ./data/image/x-e4/pdf_to_image/x-e4_page15.png
Saved: ./data/image/x-e4/pdf_to_image/x-e4_page16.png
Saved: ./data/image/x-e4/pdf_to_image/x-e4_page17.png
Saved: ./data/image/x-e4/pdf_to_image/x-e4_page18.png
Saved: ./data/image/x-e4/pdf_to_image/x-e4_page19.png
Saved: ./data/image/x-e4/pdf_to_imag

In [156]:
from dotenv import load_dotenv

load_dotenv(dotenv_path=".env", override=True)

True

In [154]:
from glob import glob
import re

def extract_number(file_path):
    match = re.search(r'(\d+)\.png$', file_path)
    return int(match.group(1)) if match else None

camera_model = "x-e4"
image_dir = "./data/image"
image_path = os.path.join(image_dir, camera_model, "pdf_to_image", "*.png")

images = glob(image_path)
images = sorted(images, key=extract_number)

In [157]:
import requests
import json

json_dir = f"./data/json/{camera_model}/upstage"
if not os.path.exists(json_dir):
    os.makedirs(json_dir)

api_key = os.getenv("UPSTAGE_API_KEY")
url = "https://api.upstage.ai/v1/document-ai/document-parse"
headers = {"Authorization": f"Bearer {api_key}"}

for image_path in images:
    filename = image_path
    files = {"document": open(filename, "rb")}
    data = {"output_formats":"['html','markdown']"}
    response_test = requests.post(url, headers=headers, files=files, data=data)

    json_path = os.path.join(json_dir, os.path.splitext(os.path.basename(image_path))[0] +".json")
    with open(json_path, "w", encoding="utf-8") as file:
        json.dump(response_test.json(), file, ensure_ascii=False, indent=4)

In [164]:
from glob import glob
import re
import os

def extract_number(file_path):
    match = re.search(r'(\d+)\.json$', file_path)
    return int(match.group(1)) if match else None

camera_model = "x-e4"
json_dir = "./data/json"
json_path = os.path.join(json_dir, camera_model, "upstage", "*.json")

json_list = glob(json_path)
json_list = sorted(json_list, key=extract_number)

In [165]:
import json
from PIL import Image

image_dir = "./data/image"
pdf_image_dir = os.path.join(image_dir, camera_model, "pdf_to_image")
output_dir = os.path.join(image_dir, camera_model, "extracted_images")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

for json_path in json_list:
    with open(json_path, "r", encoding="utf-8") as file:
        json_data = json.load(file)

    filtered_elements = []
    for element in json_data['elements']:
        if element.get('category') == 'figure':
            filtered_elements.append(element.get('coordinates'))

    image_path = os.path.join(pdf_image_dir, os.path.splitext(os.path.basename(json_path))[0]+'.png')
    for idx, coordinate in enumerate(filtered_elements):
        try:
            with Image.open(image_path) as img:
                width, height = img.size

                # 바운딩 박스 좌표 계산
                x_min = int(min(coord['x'] * width for coord in coordinate))
                y_min = int(min(coord['y'] * height for coord in coordinate))
                x_max = int(max(coord['x'] * width for coord in coordinate))
                y_max = int(max(coord['y'] * height for coord in coordinate))

                # 바운딩 박스 크롭
                cropped_image = img.crop((x_min, y_min, x_max, y_max))

                # 결과 이미지 저장
                output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(json_path))[0]+f"_image{idx+1}.png")
                cropped_image.save(output_path)
                print(f"Cropped image saved to: {output_path}")

        except FileNotFoundError:
            print(f"File not found: {image_path}")

Cropped image saved to: ./data/image/x-e4/extracted_images/x-e4_page2_image1.png
Cropped image saved to: ./data/image/x-e4/extracted_images/x-e4_page3_image1.png
Cropped image saved to: ./data/image/x-e4/extracted_images/x-e4_page4_image1.png
Cropped image saved to: ./data/image/x-e4/extracted_images/x-e4_page4_image2.png
Cropped image saved to: ./data/image/x-e4/extracted_images/x-e4_page5_image1.png
Cropped image saved to: ./data/image/x-e4/extracted_images/x-e4_page5_image2.png
Cropped image saved to: ./data/image/x-e4/extracted_images/x-e4_page6_image1.png
Cropped image saved to: ./data/image/x-e4/extracted_images/x-e4_page7_image1.png
Cropped image saved to: ./data/image/x-e4/extracted_images/x-e4_page8_image1.png
Cropped image saved to: ./data/image/x-e4/extracted_images/x-e4_page8_image2.png
Cropped image saved to: ./data/image/x-e4/extracted_images/x-e4_page9_image1.png
Cropped image saved to: ./data/image/x-e4/extracted_images/x-e4_page10_image1.png
Cropped image saved to: ./d