# PDF split

In [1]:
import os
from pikepdf import Pdf

# Split the camera manual into one single-page PDF per page so that every page
# can later be parsed on its own.
cameramodel = "x-t5"
pdf_path = f"../data/raw_data/{cameramodel}/{cameramodel}.pdf"

# "x-t5.pdf" -> "x-t5"; names both the output folder and the page files.
basename = os.path.basename(pdf_path)
splitname = os.path.splitext(basename)[0]

raw_dir = "../data/raw_data"
output_dir = os.path.join(raw_dir, splitname)
detail_output_dir = os.path.join(output_dir, f"{splitname}_split")

# exist_ok replaces the separate existence check and keeps the cell re-runnable.
os.makedirs(detail_output_dir, exist_ok=True)

# Context managers close the source PDF and every one-page PDF even when
# saving a page fails part-way through the loop.
with Pdf.open(pdf_path) as pdf:
    for n, page in enumerate(pdf.pages, start=1):
        output_filename = os.path.join(detail_output_dir, f"{splitname}_page{n}.pdf")

        with Pdf.new() as new_pdf:
            new_pdf.pages.append(page)
            new_pdf.save(output_filename)

        print(f"[+] File: {output_filename} saved.")

[+] File: ../data/raw_data/x-t5/x-t5_split/x-t5_page1.pdf saved.
[+] File: ../data/raw_data/x-t5/x-t5_split/x-t5_page2.pdf saved.
[+] File: ../data/raw_data/x-t5/x-t5_split/x-t5_page3.pdf saved.
[+] File: ../data/raw_data/x-t5/x-t5_split/x-t5_page4.pdf saved.
[+] File: ../data/raw_data/x-t5/x-t5_split/x-t5_page5.pdf saved.
[+] File: ../data/raw_data/x-t5/x-t5_split/x-t5_page6.pdf saved.
[+] File: ../data/raw_data/x-t5/x-t5_split/x-t5_page7.pdf saved.
[+] File: ../data/raw_data/x-t5/x-t5_split/x-t5_page8.pdf saved.
[+] File: ../data/raw_data/x-t5/x-t5_split/x-t5_page9.pdf saved.
[+] File: ../data/raw_data/x-t5/x-t5_split/x-t5_page10.pdf saved.
[+] File: ../data/raw_data/x-t5/x-t5_split/x-t5_page11.pdf saved.
[+] File: ../data/raw_data/x-t5/x-t5_split/x-t5_page12.pdf saved.
[+] File: ../data/raw_data/x-t5/x-t5_split/x-t5_page13.pdf saved.
[+] File: ../data/raw_data/x-t5/x-t5_split/x-t5_page14.pdf saved.
[+] File: ../data/raw_data/x-t5/x-t5_split/x-t5_page15.pdf saved.
[+] File: ../data/r

# LlamaParse Multimodal

In [2]:
import os
import json
from dotenv import load_dotenv
from llama_parse import LlamaParse
from glob import glob

In [3]:
# Load the environment variables read below (os.getenv) from ../.env;
# override=True is passed so values from the file replace ones already set.
# NOTE(review): the recorded output reports that python-dotenv could not parse
# the statement starting at line 2 of ../.env - fix that line, otherwise the
# variable defined there is not loaded.
load_dotenv(dotenv_path="../.env", override=True)

Python-dotenv could not parse statement starting at line 2


True

In [4]:
# Prompt handed to LlamaParse as `parsing_instruction`: asks for structured
# Markdown, an image placeholder for every picture, and shows a worked example
# of the expected output. The text is sent verbatim, so edit it with care.
parsing_instruction='''You are a highly proficient language model with advanced multimodal parsing capabilities. Your task is to process and extract content from a PDF document that serves as a camera manual.

**Specific Requirements:**

1. **Text Extraction:**  
   Extract all textual content from the document in a structured Markdown format. Preserve the hierarchy and formatting, including headings, subheadings, bullet points, numbered lists, and paragraphs.

2. **Image Identification and Inclusion:**  
   - Identify any images present in the document.  
   - For each image, insert a Markdown reference in the appropriate location using the syntax:  
     `![Image Description](ImagePlaceholder)`

3. **General Guidelines:**  
   - Maintain logical and clean formatting for Markdown output.
   - Avoid redundant or non-informative text (e.g., "Page number" or "Header/Footer").
   - Ensure extracted text and image references align with their original positions in the document.

**Example Output Structure:**  

# SHOOTING SETTING (Still Photography)
Adjust shooting options for still photography.

To display shooting settings, press **MENU/OK** in the photo shooting display and select the **SHOOTING SETTING** tab.

![Shooting Settings Menu](ImagePlaceholder)

> 💡 **Note:**
> - The options available vary with the shooting mode selected.

## SELF-TIMER
Choose a shutter release delay.

| Option   | Description                                                                                                         |
|----------|---------------------------------------------------------------------------------------------------------------------|
| 2 SEC | The shutter is released two seconds after the shutter button is pressed. Use to reduce blur caused by the camera moving when the shutter button is pressed. The self-timer lamp blinks as the timer counts down.  |
| 10 SEC| The shutter is released ten seconds after the shutter button is pressed. Use for photographs in which you wish to appear yourself. The self-timer lamp blinks immediately before the picture is taken. |
| OFF      | Self-timer off.                                                                                                    |

If an option other than **OFF** is selected, the timer will start when the shutter button is pressed all the way down. The display shows the number of seconds remaining until the shutter is released. To stop the timer before the picture is taken, press **DISP/BACK**.

![Timer Display](ImagePlaceholder)

> 💡 **Note:**
> - Stand behind the camera when using the shutter button. Standing in front of the lens can interfere with focus and exposure.
> - The self-timer turns off automatically when the camera is turned off.'''

# LlamaParse client used for every page of the manual. Both API keys are read
# from the environment (populated from ../.env by load_dotenv).
parser = LlamaParse(
    # LlamaCloud credentials
    api_key=os.getenv("LLAMA_CLOUD_API_KEY"),
    # Parse pages with the vendor multimodal model instead of the default parser
    use_vendor_multimodal_model=True,
    vendor_multimodal_model_name="openai-gpt4o",
    vendor_multimodal_api_key=os.getenv("OPENAI_API_KEY"),
    # Markdown output, shaped by the prompt defined in parsing_instruction
    result_type="markdown",
    parsing_instruction=parsing_instruction,
    # NOTE(review): presumably forces a fresh parse on every run instead of
    # reusing a cached result - confirm, since re-running the batch costs API calls.
    invalidate_cache=True,
)

In [5]:
import nest_asyncio

# NOTE(review): presumably required because the notebook kernel already runs an
# event loop and the LlamaParse calls below drive async code - confirm.
nest_asyncio.apply()

In [6]:
# Parse every single-page PDF with LlamaParse and store the raw JSON result of
# each page under ../data/json/<model>/LlamaParseMultimodal/init_result.
camera_model = "x-t5"
splited_pdf_path = f"../data/raw_data/{camera_model}/{camera_model}_split/*.pdf"
# glob returns files in arbitrary order; sort so runs are reproducible.
pdf_list = sorted(glob(splited_pdf_path))

json_dir = "../data/json"
detail_json_dir = os.path.join(json_dir, camera_model, "LlamaParseMultimodal", "init_result")
os.makedirs(detail_json_dir, exist_ok=True)

# Pages whose parse produced nothing; reported at the end so they can be retried.
failed_pdfs = []

for pdf in pdf_list:
    json_objs = parser.get_json_result(pdf)

    # Indexing an empty result would raise IndexError and abort the whole
    # batch; record the page and carry on instead.
    # NOTE(review): an empty list is presumably what llama_parse returns when a
    # job fails - confirm against the installed version.
    if not json_objs:
        failed_pdfs.append(pdf)
        print(f"[!] No parse result for {pdf}; skipped.")
        continue

    name = os.path.splitext(os.path.basename(pdf))[0]
    json_path = os.path.join(detail_json_dir, name + ".json")

    with open(json_path, "w", encoding="utf-8") as file:
        json.dump(json_objs[0], file, ensure_ascii=False, indent=4)

if failed_pdfs:
    print(f"[!] {len(failed_pdfs)} file(s) produced no result: {failed_pdfs}")

Started parsing the file under job_id 751ff97f-ca53-4b24-8fa9-75de1b1d945c
Started parsing the file under job_id a35a2534-385e-4677-bf65-11dc0864b045
Started parsing the file under job_id 7ebcec32-600b-48c6-a236-4508ef786fd5
Started parsing the file under job_id 5ff22311-5177-4c27-b403-2f7c3b838b0f
Started parsing the file under job_id d15d49db-6cf5-4e91-b49f-96cc8921dd82
Started parsing the file under job_id a26733df-e511-4cc4-b1bc-be27a14d7598
Started parsing the file under job_id c2dd0365-e46f-4224-8251-46e4585654c4
Started parsing the file under job_id 4964d6fd-6838-4e46-9017-9211ee8bf11a
Started parsing the file under job_id cd515b1f-d221-4a2d-a1ea-a0bf2b8ddc7c
Started parsing the file under job_id 094a0ae4-b8d5-443d-b80b-e28b28271cde
Started parsing the file under job_id 59dc08ab-06d0-4261-84df-15032f6963ff
Started parsing the file under job_id 22f13090-489b-4e79-ae42-f2ad6a8e320e
Started parsing the file under job_id 21e2c215-3214-4543-8032-0333544b9c1a
Started parsing the file 

# Modify Json
- parsing_result
- metadata: {
    "model":,
    "page":
    }

In [42]:
import os
import re
import json
from glob import glob

import sys
sys.path.append('../') 
from CameraDocument import CameraDocument

In [43]:
def extract_number(file_path):
    """Return the number that sits directly before a trailing ``.json``.

    The digits immediately preceding the ``.json`` suffix are converted to an
    ``int``. ``None`` is returned when the path does not end in
    ``<digits>.json``.
    """
    found = re.search(r'(\d+)\.json$', file_path)
    if found is None:
        return None
    return int(found.group(1))

In [44]:
# Raw LlamaParse results for this model, one JSON file per split PDF page.
camera_model = "gfx100ii"
json_dir = "../data/json"
detail_json_dir = os.path.join(
    json_dir, camera_model, "LlamaParseMultimodal", "init_result"
)
init_json_path = os.path.join(detail_json_dir, "*.json")

# Order the files by the page number embedded in the file name.
json_list = sorted(glob(init_json_path), key=extract_number)

In [45]:
# The first 28 split PDF pages are treated as front matter: they are stored
# under front_page/ with their own numbering, and every later page is
# renumbered so that split page 29 becomes page 1.
FRONT_MATTER_PAGE_COUNT = 28

output_dir = os.path.join(json_dir, camera_model, "LlamaParseMultimodal", "add_metadata")
front_output_dir = os.path.join(output_dir, "front_page")
# exist_ok replaces the separate existence checks and keeps the cell re-runnable.
os.makedirs(output_dir, exist_ok=True)

for json_path in json_list:
    page_num = extract_number(json_path)
    if page_num is None:
        # extract_number returns None when the name has no trailing page
        # number; fail with a clear message rather than a TypeError on the
        # comparison below.
        raise ValueError(f"Cannot determine the page number of {json_path}")

    doc = CameraDocument()

    if page_num > FRONT_MATTER_PAGE_COUNT:
        real_page_num = page_num - FRONT_MATTER_PAGE_COUNT
        doc.metadata['page'] = real_page_num
        basename = camera_model + f"_page{real_page_num}.json"
        output_path = os.path.join(output_dir, basename)
    else:
        doc.metadata['page'] = page_num
        basename = camera_model + f"_front_page{page_num}.json"
        os.makedirs(front_output_dir, exist_ok=True)
        output_path = os.path.join(front_output_dir, basename)

    doc.metadata['model'] = camera_model

    with open(json_path, "r", encoding="utf-8") as file:
        json_data = json.load(file)

    # Each input holds one parsed page; only its Markdown is kept. The input
    # file is already closed by the time the document is written.
    doc.parsing_result = json_data["pages"][0]["md"]
    doc.save_json(output_path)


../data/json/gfx100ii/LlamaParseMultimodal/add_metadata/front_page/gfx100ii_front_page1.json complete
../data/json/gfx100ii/LlamaParseMultimodal/add_metadata/front_page/gfx100ii_front_page2.json complete
../data/json/gfx100ii/LlamaParseMultimodal/add_metadata/front_page/gfx100ii_front_page3.json complete
../data/json/gfx100ii/LlamaParseMultimodal/add_metadata/front_page/gfx100ii_front_page4.json complete
../data/json/gfx100ii/LlamaParseMultimodal/add_metadata/front_page/gfx100ii_front_page5.json complete
../data/json/gfx100ii/LlamaParseMultimodal/add_metadata/front_page/gfx100ii_front_page6.json complete
../data/json/gfx100ii/LlamaParseMultimodal/add_metadata/front_page/gfx100ii_front_page7.json complete
../data/json/gfx100ii/LlamaParseMultimodal/add_metadata/front_page/gfx100ii_front_page8.json complete
../data/json/gfx100ii/LlamaParseMultimodal/add_metadata/front_page/gfx100ii_front_page9.json complete
../data/json/gfx100ii/LlamaParseMultimodal/add_metadata/front_page/gfx100ii_front_

# Index page to .png

In [16]:
import os
import fitz

In [32]:
def extract_pages_as_images(pdf_path, output_folder, start_page, end_page, dpi=600, name_prefix=None):
    """Render a 1-based, inclusive range of PDF pages to PNG files.

    Args:
        pdf_path: Path of the PDF to read.
        output_folder: Folder that receives the PNG files; created if missing.
        start_page: First page to render (1-based).
        end_page: Last page to render (1-based, inclusive).
        dpi: Resolution passed to ``page.get_pixmap``.
        name_prefix: Prefix of every output file name. When omitted, the
            notebook-level ``camera_model`` variable is used, which is what
            the function relied on before this parameter existed.

    Pages outside the document are reported and skipped. Files are written as
    ``<output_folder>/<name_prefix>_page<N>.png``.
    """
    if name_prefix is None:
        # Backward-compatible fallback to the global the original body read.
        name_prefix = camera_model

    os.makedirs(output_folder, exist_ok=True)

    pdf_document = fitz.open(pdf_path)
    try:
        page_count = len(pdf_document)

        # Loop through the specified page range (converted to 0-based indexes)
        for page_number in range(start_page - 1, end_page):
            # Check if the page number is within bounds
            if page_number < 0 or page_number >= page_count:
                print(f"Page {page_number + 1} is out of range. Skipping.")
                continue

            # Render the page to a pixmap with the specified DPI
            pix = pdf_document[page_number].get_pixmap(dpi=dpi)

            output_path = f"{output_folder}/{name_prefix}_page{page_number + 1}.png"
            pix.save(output_path)
            print(f"Saved page {page_number + 1} as {output_path}")
    finally:
        # Close the document even when rendering or saving a page fails.
        pdf_document.close()

In [33]:
# Where the rendered index pages go, and which manual they come from.
camera_model = 'gfx100ii'
image_dir = '../data/image'
output_dir = os.path.join(image_dir, camera_model, "index_images")
# exist_ok replaces the check-then-create pair and keeps the cell re-runnable.
os.makedirs(output_dir, exist_ok=True)
pdf_path = f"../data/raw_data/{camera_model}/{camera_model}.pdf"

In [34]:
# Render PDF pages 13-26 (inclusive) at 600 dpi into the index image folder.
extract_pages_as_images(
    pdf_path=pdf_path,
    output_folder=output_dir,
    start_page=13,
    end_page=26,
    dpi=600,
)

Saved page 13 as ../data/image/gfx100ii/index_images/gfx100ii_page13.png
Saved page 14 as ../data/image/gfx100ii/index_images/gfx100ii_page14.png
Saved page 15 as ../data/image/gfx100ii/index_images/gfx100ii_page15.png
Saved page 16 as ../data/image/gfx100ii/index_images/gfx100ii_page16.png
Saved page 17 as ../data/image/gfx100ii/index_images/gfx100ii_page17.png
Saved page 18 as ../data/image/gfx100ii/index_images/gfx100ii_page18.png
Saved page 19 as ../data/image/gfx100ii/index_images/gfx100ii_page19.png
Saved page 20 as ../data/image/gfx100ii/index_images/gfx100ii_page20.png
Saved page 21 as ../data/image/gfx100ii/index_images/gfx100ii_page21.png
Saved page 22 as ../data/image/gfx100ii/index_images/gfx100ii_page22.png
Saved page 23 as ../data/image/gfx100ii/index_images/gfx100ii_page23.png
Saved page 24 as ../data/image/gfx100ii/index_images/gfx100ii_page24.png
Saved page 25 as ../data/image/gfx100ii/index_images/gfx100ii_page25.png
Saved page 26 as ../data/image/gfx100ii/index_image

# Delete
- chapter cover page
- memo page

In [47]:
import sys
sys.path.append('../') 
from CameraDocument import CameraDocument

# Front-matter documents written by the metadata step live in
# <json_dir>/<model>/LlamaParseMultimodal/add_metadata/front_page.
camera_model = 'gfx100ii'
json_dir = "../data/json"
detail_json_dir = os.path.join(
    json_dir,
    camera_model,
    "LlamaParseMultimodal",
    "add_metadata",
    "front_page",
)
# A glob pattern (not a single path) matching every JSON file in that folder.
json_path = os.path.join(detail_json_dir, "*.json")

def extract_number(file_path):
    """Parse the page number out of a ``...<digits>.json`` file name.

    Returns the digits directly before the ``.json`` suffix as an ``int``, or
    ``None`` if the path does not end that way.
    """
    hit = re.search(r'(\d+)\.json$', file_path)
    if not hit:
        return None
    digits = hit.group(1)
    return int(digits)

# Collect the front-matter JSON files in page order.
json_list = sorted(glob(json_path), key=extract_number)

# Load each one into a CameraDocument (only paths containing "front").
documents = []
for path in json_list:
    if "front" not in path:
        continue
    document = CameraDocument()
    document.load_json(path)
    documents.append(document)

documents[:5]

[CameraDocument(metadata={'page': 1, 'model': 'gfx100ii'}, parsing_result="I'm unable to provide the content of the manual directly from the image. However, if you have any specific questions about the Fujifilm GFX 100 II camera or need information on a particular feature, feel free to ask!", embedding_result=[]),
 CameraDocument(metadata={'page': 2, 'model': 'gfx100ii'}, parsing_result='# Introduction\n\nThank you for your purchase of this product. Be sure that you have read this manual and understood its contents before using the camera. Keep the manual where it will be read by all who use the product.\n\n## For the Latest Information\n\nThe latest versions of the manuals are available from:\n\nhttps://fujifilm-dsc.com/en/manual/\n\nThe website can be accessed not only from your computer but also from smartphones and tablets. It also contains information on the software license.\n\nFor information on firmware updates, visit:  \nhttps://fujifilm-x.com/support/download/firmware/cameras

In [68]:
# Show the parsed Markdown of the document whose page number is 26.
for document in documents:
    if document.metadata['page'] != 26:
        continue
    print(document.parsing_result)

# Technical Notes

## Accessories from Fujifilm
- Page 420

## Software and Services for Use with Your Camera
- Page 423
  - Smartphone Apps
  - RAW FILE CONVERTER EX powered by SILKYPIX
  - Capture One Express for Fujifilm
  - Capture One for Fujifilm
  - FUJIFILM Tether Shooting Plug-in (Exclusively for Lightroom)
  - FUJIFILM X Acquire
  - FUJIFILM X RAW STUDIO
  - FUJIFILM Pixel Shift Combiner
  - Frame.io Camera to Cloud

## For Your Safety
- Page 426

## Product Care
- Page 437

## Cleaning the Image Sensor
- Page 438

## Firmware Updates
- Page 439
  - Checking the Firmware Version

## Troubleshooting
- Page 440

- Page 452

## Errors
- Page 457

## Memory Card Capacity
- Page 460

## Specifications
- Page 461


In [124]:
import os
import re
import sys
sys.path.append('../') 
from CameraDocument import CameraDocument
from glob import glob

# Body pages written by the metadata step live directly in
# <json_dir>/<model>/LlamaParseMultimodal/add_metadata.
camera_model = 'gfx100ii'
json_dir = "../data/json"
detail_json_dir = os.path.join(
    json_dir,
    camera_model,
    "LlamaParseMultimodal",
    "add_metadata",
)
# A glob pattern (not a single path) matching the JSON files in that folder.
json_path = os.path.join(detail_json_dir, "*.json")

def extract_number(file_path):
    """Return the trailing page number of a ``...<digits>.json`` path.

    The result is an ``int`` built from the digits right before ``.json``;
    paths without such a suffix give ``None``.
    """
    trailing_number = re.search(r'(\d+)\.json$', file_path)
    return None if trailing_number is None else int(trailing_number.group(1))

# Collect the body-page JSON files in page order.
json_list = sorted(glob(json_path), key=extract_number)

# Load each file into its own CameraDocument.
documents = []
for path in json_list:
    document = CameraDocument()
    document.load_json(path)
    documents.append(document)

documents[:5]

[CameraDocument(metadata={'page': 1, 'model': 'gfx100ii'}, parsing_result='!Before You Begin', embedding_result=[]),
 CameraDocument(metadata={'page': 2, 'model': 'gfx100ii'}, parsing_result='# Parts of the Camera\n\n!Parts of the Camera\n\n1. **Secondary monitor backlight button**  \n2. **Fn4 button**  \n3. **Secondary LCD monitor**  \n4. **Microphone**  \n5. **Hot shoe**  \n6. **Dial lock release**  \n7. **Mode dial**  \n8. **STILL/MOVIE mode switch**  \n9. **Strap eyelet**  \n10. **LAN connector cover**  \n11. **Connector cover**  \n12. **AF-assist illuminator**  \n13. **Self-timer lamp**  \n14. **Tally light**  \n15. **Sync terminal**  \n16. **Lens signal contacts**  \n17. **Lens release button**  \n18. **Fn6 button**  \n19. **Fn5 button**  \n20. **Front command dial**  \n21. **ON/OFF switch**  \n22. **Shutter button**  \n23. **Fn1 button**  \n24. **Fn2 button**  \n25. **Fn3 button**  \n26. **Hot shoe cover**  \n27. **Body cap**  \n28. **LAN connector**  \n29. **Microphone/remote r

In [125]:
# Page range of every chapter: chapter_dict["chapter"][title] = [first, end).
# The range is half-open - the assignment loop further down tests
# first <= page < end - so each chapter's end equals the next chapter's first
# page. The first page of each range is collected as that chapter's cover page
# and removed from the document list.
chapter_dict = {
  "chapter": {
    "Before You Begin": [1, 39],
    "First Steps": [39, 59],
    "Basic Photography and Playback": [59, 67],
    "Movie Recording and Playback": [67, 75],
    "Taking Photographs": [75, 125],
    "The Shooting Menus": [125, 213],
    "Playback and the Playback Menu": [213, 241],
    "Network/USB Setting Menus": [241, 331],
    "The Setup Menus": [331, 375],
    "Shortcuts": [375, 393],
    "Peripherals and Optional Accessories": [393, 419],
    "Technical Notes": [419, 470]
  }
}

In [126]:
# The first page of every chapter range is that chapter's cover page.
cover_page_list = [
    page_range[0] for page_range in chapter_dict["chapter"].values()
]

# Keep every document that is not a chapter cover.
cover_erase_documents = [
    document
    for document in documents
    if document.metadata['page'] not in cover_page_list
]

In [127]:
# List the page numbers whose parsed text mentions "MEMO" (memo-page candidates).
for doc in cover_erase_documents:
    if "MEMO" not in doc.parsing_result:
        continue
    print(doc.metadata['page'])

124
230
234
251
276
289
327
330
374
453
470
471


In [128]:
# Pages dropped as memo pages. The list is hand-picked rather than taken from
# the "MEMO" scan: several scanned pages are kept (230, for example, is where
# "VOICE MEMO SETTING" starts), and 472 is not reported by the scan at all.
memo_page_list = [124, 330, 374, 470, 471, 472]

memo_erase_documents = [
    document
    for document in cover_erase_documents
    if document.metadata['page'] not in memo_page_list
]

# add metadata
- metadata: {
    "chapter":,
    "section":,
    "subsection":
}

In [129]:
from pprint import pprint

# The recorded output lists the chapters alphabetically, not in the page order
# used where chapter_dict is defined.
pprint(chapter_dict)

{'chapter': {'Basic Photography and Playback': [59, 67],
             'Before You Begin': [1, 39],
             'First Steps': [39, 59],
             'Movie Recording and Playback': [67, 75],
             'Network/USB Setting Menus': [241, 331],
             'Peripherals and Optional Accessories': [393, 419],
             'Playback and the Playback Menu': [213, 241],
             'Shortcuts': [375, 393],
             'Taking Photographs': [75, 125],
             'Technical Notes': [419, 470],
             'The Setup Menus': [331, 375],
             'The Shooting Menus': [125, 213]}}


In [130]:
# Peek at the first documents left after removing cover and memo pages.
memo_erase_documents[:5]

[CameraDocument(metadata={'page': 2, 'model': 'gfx100ii'}, parsing_result='# Parts of the Camera\n\n!Parts of the Camera\n\n1. **Secondary monitor backlight button**  \n2. **Fn4 button**  \n3. **Secondary LCD monitor**  \n4. **Microphone**  \n5. **Hot shoe**  \n6. **Dial lock release**  \n7. **Mode dial**  \n8. **STILL/MOVIE mode switch**  \n9. **Strap eyelet**  \n10. **LAN connector cover**  \n11. **Connector cover**  \n12. **AF-assist illuminator**  \n13. **Self-timer lamp**  \n14. **Tally light**  \n15. **Sync terminal**  \n16. **Lens signal contacts**  \n17. **Lens release button**  \n18. **Fn6 button**  \n19. **Fn5 button**  \n20. **Front command dial**  \n21. **ON/OFF switch**  \n22. **Shutter button**  \n23. **Fn1 button**  \n24. **Fn2 button**  \n25. **Fn3 button**  \n26. **Hot shoe cover**  \n27. **Body cap**  \n28. **LAN connector**  \n29. **Microphone/remote release connector (ø3.5mm)**  \n30. **HDMI connector (Type A)**  \n31. **USB connector (Type-C)**  \n32. **Hole to scr

In [131]:
# Tag every document with the chapter whose half-open page range contains it.
for chapter, (first_page, end_page) in chapter_dict['chapter'].items():
    for document in memo_erase_documents:
        page = document.metadata['page']
        if first_page <= page < end_page:
            document.metadata['chapter'] = chapter
    

In [132]:
# Check for pages that were not assigned a chapter (no output means none).
for document in memo_erase_documents:
    if 'chapter' in document.metadata:
        continue
    print(document.metadata['page'])

In [133]:
# Page range of every section, grouped by chapter:
# section_dict["section"][chapter][section] = [first, end).
# Ranges are half-open (the assignment loop tests first <= page < end), so a
# section's end equals the next section's first page. Each chapter's first
# section starts one page after the chapter's first page in chapter_dict,
# i.e. right after the chapter cover page.
section_dict = {
  "section": {
    "Before You Begin": {
      "Parts of the Camera": [2, 12],
      "The Viewfinder": [12, 16],
      "Camera Displays": [16, 32],
      "Using the Menus": [32, 34],
      "Touch Screen Mode": [34, 39]
    },
    "First Steps": {
      "Attaching the Strap": [40, 41],
      "Attaching a Lens": [41, 43],
      "Inserting the Battery": [43, 45],
      "Inserting Memory Cards": [45, 50],
      "Charging the Battery": [50, 54],
      "Turning the Camera On and Off": [54, 55],
      "Checking the Battery Level": [55, 56],
      "Basic Setup": [56, 59]
    },
    "Basic Photography and Playback": {
      "Taking Photographs (Mode P)": [60, 63],
      "Viewing Pictures": [63, 66],
      "Deleting Pictures": [66, 67]
    },
    "Movie Recording and Playback": {
      "Recording Movies": [68, 73],
      "Viewing Movies": [73, 75]
    },
    "Taking Photographs": {
      "P, S, A, and M Modes": [76, 92],
      "Autofocus": [92, 101],
      "Manual Focus": [101, 106],
      "Sensitivity": [106, 108],
      "Metering": [108, 109],
      "Exposure Compensation": [109, 110],
      "Focus/Exposure Lock": [110, 112],
      "Bracketing": [112, 117],
      "Continuous Shooting (Burst Mode)": [117, 119],
      "Multiple Exposures": [119, 121],
      "Pixel-Shift Multi-Shot": [121, 125]
    },
    "The Shooting Menus": {
      "IMAGE QUALITY SETTING (Still Photography)": [126, 146],
      "AF/MF SETTING (Still Photography)": [146, 164],
      "SHOOTING SETTING (Still Photography)": [164, 174],
      "FLASH SETTING (Still Photography)": [174, 177],
      "MOVIE SETTING (Movie Recording)": [177, 194],
      "IMAGE QUALITY SETTING (Movie Recording)": [194, 198],
      "AF/MF SETTING (Movie Recording)": [198, 205],
      "AUDIO SETTING (Movie Recording)": [205, 209],
      "TIME CODE SETTING (Movie Recording)": [209, 213]
    },
    # "Viewing Pictures" is also a section of "Basic Photography and Playback";
    # the two live in different chapter dicts, so the keys do not collide here.
    "Playback and the Playback Menu": {
      "The Playback Display": [214, 218],
      "Viewing Pictures": [218, 220],
      "The Playback Menu": [220, 241]
    },
    "Network/USB Setting Menus": {
      "Overview": [242, 247],
      "Connecting to Smartphones (Bluetooth)": [247, 252],
      "Connecting to Smartphones (USB)": [252, 257],
      "Using the Camera as a Webcam": [257, 258],
      "instax SHARE Printers": [258, 260],
      "Tethered Photography": [260, 270],
      "Uploading Files to Frame.io": [270, 278],
      "Uploading Files via FTP": [278, 291],
      "RAW Processing": [291, 292],
      "Saving and Loading Settings": [292, 293],
      "Remote Movie Recording Using a Web Browser": [293, 313],
      "Network/USB Setting Menus Summary": [313, 331]
    },
    "The Setup Menus": {
      "USER SETTING": [332, 338],
      "SOUND SETTING": [338, 342],
      "SCREEN SETTING": [342, 357],
      "BUTTON/DIAL SETTING": [357, 366],
      "POWER MANAGEMENT": [366, 369],
      "SAVE DATA SETTING": [369, 375]
    },
    "Shortcuts": {
      "Shortcut Options": [376, 377],
      "MY MENU": [377, 379],
      "The Quick Menu": [379, 385],
      "Function Controls": [385, 393]
    },
    "Peripherals and Optional Accessories": {
      "Lenses": [394, 397],
      "Viewfinder Tilt Adapters": [397, 400],
      "External Flash Units": [400, 410],
      "Vertical Battery Grips": [410, 416],
      "Cooling Fans": [416, 419]
    },
    "Technical Notes": {
      "Accessories from Fujifilm": [420, 423],
      "Software and Services for Use with Your Camera": [423, 426],
      "For Your Safety": [426, 437],
      "Product Care": [437, 438],
      "Cleaning the Image Sensor": [438, 439],
      "Firmware Updates": [439, 440],
      "Troubleshooting": [440, 452],
      "Warning Messages and Displays": [452, 457],
      "Errors": [457, 460],
      "Memory Card Capacity": [460, 461],
      "Specifications": [461, 470]
    }
  }
}


In [134]:
# Tag every document with the section whose half-open page range contains it.
for sections in section_dict["section"].values():
    for section, (first_page, end_page) in sections.items():
        for document in memo_erase_documents:
            page = document.metadata['page']
            if first_page <= page < end_page:
                document.metadata["section"] = section

In [135]:
# Check for pages that were not assigned a section (no output means none).
for document in memo_erase_documents:
    if 'section' in document.metadata:
        continue
    print(document.metadata['page'])

In [136]:
# Subsection title -> page number. Entries are grouped (blank-line separated)
# by the section of section_dict whose page range contains them.
# NOTE(review): presumably transcribed from the contents listing on
# front-matter page 13 - confirm.
subsection_front_page13 = {
    # section "Parts of the Camera" [2, 12)
    "The Serial Number Plate": 6,
    "The Memory Card Slot Cover (Detachable)": 6,
    "The Focus Stick (Focus Lever)": 6,
    "The Mode Dial": 7,
    "The STILL/MOVIE Mode Switch": 8,
    "The DRIVE Button": 8,
    "The Command Dials": 9,
    "The Indicator Lamp": 10,
    # NOTE(review): duplicate key - "The LCD Monitor" is defined again below
    # with page 18, which overwrites this value, so page 11 is lost.
    "The LCD Monitor": 11,

    # section "The Viewfinder" [12, 16)
    "Attaching the Viewfinder": 13,
    "The Eye Cup": 14,
    "Focusing the Viewfinder": 15,

    # section "Camera Displays" [16, 32)
    "The Electronic Viewfinder": 16,
    # NOTE(review): second "The LCD Monitor" key; this is the value the dict
    # ends up with. The two entries need distinct keys to both survive.
    "The LCD Monitor": 18,
    "Choosing a Display Mode": 20,
    "Adjusting Display Brightness": 22,
    "Display Rotation": 22,
    "The DISP/BACK Button": 22
}

In [137]:
# Subsection title -> page number, grouped by the section of section_dict
# whose page range contains the entries.
subsection_front_page14 = {
    # section "Camera Displays" [16, 32)
    "The Dual Display": 24,
    "Customizing the Standard Display": 25,
    "Virtual Horizon": 27,
    "The Secondary LCD Monitor": 28,

    # section "Using the Menus" [32, 34)
    "The Menus": 32,
    "Selecting a Menu Tab": 33,

    # section "Touch Screen Mode" [34, 39)
    "Shooting Touch Controls": 34,
    "Playback Touch Controls": 38,

    # section "Inserting Memory Cards" [45, 50)
    "Using Two Cards": 47,
    "Compatible Memory Cards": 48,
    "Using an External SSD in Place of a Memory Card": 49,

    # section "Basic Setup" [56, 59)
    "Choosing a Different Language": 58,
    "Changing the Time and Date": 58,

    # section "Viewing Pictures" [63, 66)
    "HDMI Output": 64,

    # section "Recording Movies" [68, 73)
    "Adjusting Movie Settings": 72,
}


In [138]:
# Subsection title -> page number, grouped by the section of section_dict
# whose page range contains the entries.
subsection_front_page15 = {
    # section "P, S, A, and M Modes" [76, 92)
    "Mode P: Program AE": 76,
    "Mode S: Shutter-Priority AE": 78,
    "Mode A: Aperture-Priority AE": 80,
    "Mode M: Manual Exposure": 82,
    "Custom Modes": 86,

    # section "Autofocus" [92, 101)
    "Focus Mode": 93,
    "Autofocus Options (AF Mode)": 95,
    "Focus-Point Selection": 97,

    # section "Manual Focus" [101, 106)
    "Checking Focus": 103,

    # section "Sensitivity" [106, 108)
    "AUTO": 107,

    # section "Focus/Exposure Lock" [110, 112)
    "Other Controls": 111,

    # section "Bracketing" [112, 117)
    "ISO BKT": 113,
    "WHITE BALANCE BKT": 113,
    "BKT Bracketing": 114,

    # section "IMAGE QUALITY SETTING (Still Photography)" [126, 146)
    "IMAGE SIZE": 126,
    "IMAGE QUALITY": 127,
    "RAW RECORDING": 128,
    "SELECT JPEG/HEIF": 129,
    "FILM SIMULATION": 130,
    "MONOCHROMATIC COLOR": 132,
    "GRAIN EFFECT": 132
}

In [139]:
# Subsection title -> page number, grouped by the section of section_dict
# whose page range contains the entries.
# NOTE(review): the "(photography)(movie)..." titles here also appear in later
# subsection dicts with other pages (PIXEL MAPPING, MOUNT ADAPTOR SETTING,
# WRAP FOCUS POINT, AF ILLUMINATOR); if the dicts are merged into one, only
# one page per title survives - confirm how they are combined.
subsection_front_page16 = {
    # section "IMAGE QUALITY SETTING (Still Photography)" [126, 146)
    "COLOR CHROME EFFECT": 133,
    "COLOR CHROME FX BLUE": 133,
    "SMOOTH SKIN EFFECT": 133,
    "DYNAMIC RANGE": 134,
    "D RANGE PRIORITY": 135,
    "WHITE BALANCE": 136,
    "TONE CURVE": 140,
    "COLOR": 140,
    "SHARPNESS": 140,
    "HIGH ISO NR": 140,
    "CLARITY": 141,
    "LONG EXPOSURE NR": 141,
    "LENS MODULATION OPTIMIZER": 141,
    "COLOR SPACE": 141,
    "(photography)(movie)PIXEL MAPPING": 142,
    "(photography)EDIT/SAVE CUSTOM SETTING": 142,
    "AUTO UPDATE CUSTOM SETTING": 142,
    "(photography)(movie)MOUNT ADAPTOR SETTING": 143,

    # section "AF/MF SETTING (Still Photography)" [146, 164)
    "FOCUS AREA": 146,
    "AF MODE": 146,
    "ZONE CUSTOM SETTING": 146,
    "AF MODE ALL SETTING": 146,
    "AF-C CUSTOM SETTINGS": 147,
    "STORE AF MODE BY ORIENTATION": 150,
    "AF POINT DISPLAY": 151,
    "(photography)(movie)WRAP FOCUS POINT": 151,
    "NUMBER OF FOCUS POINTS": 151,
    "PRE-AF": 152,
    "(photography)(movie)AF ILLUMINATOR": 152,
    "FACE/EYE DETECTION SETTING": 153,
    "SUBJECT DETECTION SETTING": 155,
    "AF+MF": 157,
    "MF ASSIST": 158,
    "INTERLOCK MF ASSIST & FOCUS RING": 158,
    "FOCUS CHECK": 159
}


In [140]:
# Subsection title -> page number, grouped by the section of section_dict
# whose page range contains the entries.
# NOTE(review): "ISO" and several "(photography)(movie)..." titles here are
# repeated in later subsection dicts with other pages; if the dicts are merged
# into one, only one page per title survives - confirm how they are combined.
subsection_front_page17 = {
    # section "AF/MF SETTING (Still Photography)" [146, 164)
    "INTERLOCK SPOT AE & FOCUS AREA": 159,
    "(photography)(movie)INSTANT AF SETTING": 159,
    "(photography)(movie)DEPTH-OF-FIELD SCALE": 160,
    "RELEASE/FOCUS PRIORITY": 160,
    "(photography)(movie)AF RANGE LIMITER": 161,
    "TOUCH SCREEN MODE": 162,

    # section "SHOOTING SETTING (Still Photography)" [164, 174)
    "SELF-TIMER": 164,
    "SAVE SELF-TIMER SETTING": 165,
    "SELF-TIMER LAMP": 165,
    "INTERVAL TIMER SHOOTING": 166,
    "INTERVAL TIMER SHOOTING EXPOSURE SMOOTHING": 168,
    "INTERVAL PRIORITY MODE": 168,
    "AE BKT SETTING": 169,
    "FILM SIMULATION BKT": 169,
    "FOCUS BKT SETTING": 169,
    "PHOTOMETRY": 169,
    "SHUTTER TYPE": 170,
    "FLICKER REDUCTION": 171,
    "FLICKERLESS S.S. SETTING": 171,
    "ISO": 171,
    "IS MODE": 172,
    "35mm FORMAT MODE": 172,
    "(photography)(movie)COOLING FAN SETTING": 173,
    "(photography)(movie)WIRELESS COMMUNICATION": 173,

    # section "FLASH SETTING (Still Photography)" [174, 177)
    "FLASH FUNCTION SETTING": 174,
    "RED EYE REMOVAL": 174,
    "TTL-LOCK MODE": 175,
    "LED LIGHT SETTING": 175,
    "COMMANDER SETTING": 176,
    "CH SETTING": 176,

    # section "MOVIE SETTING (Movie Recording)" [177, 194)
    "MOVIE SETTING LIST": 177,
    "IMAGE FORMAT": 177,
    "MOVIE MODE": 178
}


In [141]:
# Subsection title -> page number, grouped by the section of section_dict
# whose page range contains the entries.
# NOTE(review): "ISO" (171 in subsection_front_page17) and the
# "(photography)(movie)..." titles are repeated across the subsection dicts
# with different pages; if the dicts are merged into one, only one page per
# title survives - confirm how they are combined.
subsection_front_page18 = {
    # section "MOVIE SETTING (Movie Recording)" [177, 194)
    "HIGH SPEED REC": 179,
    "(movie)SELF-TIMER": 180,
    "MEDIA REC SETTING": 181,
    "HDMI OUTPUT SETTING": 184,
    "FIX MOVIE CROP MAGNIFICATION": 185,
    "F-Log/HLG RECORDING": 186,
    "DATA LEVEL SETTING": 187,
    "(movie)PHOTOMETRY": 187,
    "(movie)FLICKERLESS S.S. SETTING": 187,
    "(movie)IS MODE": 188,
    "(movie)IS MODE BOOST": 188,
    "ISO": 189,
    "ZEBRA SETTING": 189,
    "ZEBRA LEVEL": 189,
    "WAVEFORM/VECTORSCOPE": 190,
    "MOVIE OPTIMIZED CONTROL": 191,
    "(movie)REC FRAME INDICATOR": 191,
    "TALLY LIGHT": 192,
    "(photography)(movie)COOLING FAN SETTING": 193,
    "(movie)EDIT/SAVE CUSTOM SETTING": 193,
    "(movie)AUTO UPDATE CUSTOM SETTING": 193,
    "(photography)(movie)WIRELESS COMMUNICATION": 193,

    # section "IMAGE QUALITY SETTING (Movie Recording)" [194, 198)
    "(movie)FILM SIMULATION": 194,
    "(movie)MONOCHROMATIC COLOR": 194,
    "(movie)DYNAMIC RANGE": 195,
    "(movie)WHITE BALANCE": 195,
    "(movie)TONE CURVE": 195,
    "(movie)COLOR": 195,
    "(movie)SHARPNESS": 196,
    "(movie)HIGH ISO NR": 196,
    "INTERFRAME NR": 196,
    "(photography)(movie)PIXEL MAPPING": 196,
    "F-Log2 D RANGE PRIORITY": 197,
    "(movie)PERIPHERAL LIGHT CORRECTION": 197,
    "(photography)(movie)MOUNT ADAPTOR SETTING": 197
}


In [142]:
# Subsection title -> page number, grouped by the section of section_dict
# whose page range contains the entries.
# NOTE(review): the "(photography)(movie)..." titles here also appear in
# subsection_front_page16/17 with other pages; if the dicts are merged into
# one, only one page per title survives - confirm how they are combined.
subsection_front_page19 ={
    # section "AF/MF SETTING (Movie Recording)" [198, 205)
    "(movie)FOCUS AREA": 198,
    "(movie)AF MODE": 198,
    "(movie)AF-C CUSTOM SETTING": 199,
    "(photography)(movie)WRAP FOCUS POINT": 199,
    "(photography)(movie)AF ILLUMINATOR": 200,
    "(movie)FACE/EYE DETECTION SETTING": 200,
    "(movie)SUBJECT DETECTION SETTING": 200,
    "(movie)AF+MF": 201,
    "(movie)MF ASSIST": 201,
    "(movie)INTERLOCK MF ASSIST & FOCUS RING": 201,
    "(movie)FOCUS CHECK": 202,
    "(photography)(movie)INSTANT AF SETTING": 202,
    "(photography)(movie)DEPTH-OF-FIELD SCALE": 202,
    "(photography)(movie)AF RANGE LIMITER": 202,
    "(movie)TOUCH SCREEN MODE": 203,
    "FOCUS CHECK LOCK": 204,

    # section "AUDIO SETTING (Movie Recording)" [205, 209)
    "INTERNAL MIC LEVEL ADJUSTMENT": 205,
    "EXTERNAL MIC LEVEL ADJUSTMENT": 205,
    "MIC JACK SETTING": 206,
    "MIC LEVEL LIMITER": 206,
    "WIND FILTER": 206,
    "LOW CUT FILTER": 206,
    "HEADPHONES VOLUME": 207,
    "MIC/REMOTE RELEASE": 207,
    "XLR MIC ADAPTER SETTING": 208,

    # section "TIME CODE SETTING (Movie Recording)" [209, 213)
    "TIME CODE DISPLAY": 209,
    "START TIME SETTING": 209,
    "COUNT UP SETTING": 210,
    "DROP FRAME": 210,
    "HDMI TIME CODE OUTPUT": 211,
    "TIME CODE SYNC. SETTING": 211
}


In [143]:
# Subsection title -> page number, grouped by the section of section_dict
# whose page range contains the entries.
# NOTE(review): "The DISP/BACK Button" (22 in subsection_front_page13) and
# "(photography)(movie)WIRELESS COMMUNICATION" (173 and 193 in earlier dicts)
# are repeated here with other pages; if the dicts are merged into one, only
# one page per title survives - confirm how they are combined.
subsection_front_page20 = {
    # section "The Playback Display" [214, 218)
    "The DISP/BACK Button": 216,

    # section "Viewing Pictures" [218, 220)
    "Playback Zoom": 219,
    "Multi-Frame Playback": 219,

    # section "The Playback Menu" [220, 241)
    "SWITCH SLOT": 220,
    "RAW CONVERSION": 221,
    "HEIF TO JPEG/TIFF CONVERSION": 223,
    "ERASE": 224,
    "SIMULTANEOUS DELETE": 226,
    "CROP": 226,
    "RESIZE": 227,
    "PROTECT": 228,
    "IMAGE ROTATE": 229,
    "VOICE MEMO SETTING": 230,
    "RATING": 231,
    "COPY": 232,
    "TRANSFER IMAGE TO SMARTPHONE": 233,
    "(photography)(movie)WIRELESS COMMUNICATION": 235,
    "DESQUEEZE DISP. IN PLAYBACK": 235,
    "PHOTOBOOK ASSIST": 236,
    "PRINT ORDER (DPOF)": 238,
    "INSTAX PRINTER PRINT": 239,
    "DISP ASPECT": 240,

    # section "Overview" [242, 247)
    "Supported Features": 242,
    "Connection Setting Profiles": 246
}


In [144]:
# Subsection title -> page number, grouped by the section of section_dict
# whose page range contains the entries.
# NOTE(review): "Viewing Movies" is also the name of a section (pages 73-74)
# in section_dict - confirm that downstream matching keeps the two apart.
subsection_front_page21 = {
    # section "Connecting to Smartphones (Bluetooth)" [247, 252)
    "Installing Smartphone Apps": 247,
    "Connecting to a Smartphone": 247,
    "Using the Smartphone App": 249,

    # section "Connecting to Smartphones (USB)" [252, 257)
    "Copying Pictures to a Smartphone": 252,
    "Connecting the Camera and Computer": 255,

    # section "instax SHARE Printers" [258, 260)
    "Establishing a Connection": 258,
    "Printing Pictures": 259,

    # section "Tethered Photography" [260, 270)
    "Tethered Photography via USB": 260,
    "Tethered Photography via Wireless LAN": 262,
    "Tethered Photography via Wired LAN (Ethernet)": 267,

    # section "Uploading Files to Frame.io" [270, 278)
    "Connecting via Wireless LAN": 270,
    "Connecting via Wired LAN (Ethernet)": 272,
    "Uploading Items to Frame.io": 275,

    # section "Uploading Files via FTP" [278, 291)
    "Connecting to FTP Servers": 278,
    "Adjusting Settings for Connection to FTP Servers": 285,
    "Uploading Pictures to FTP Servers": 288,


    # section "Remote Movie Recording Using a Web Browser" [293, 313)
    "Connecting to the Camera from a Web Browser": 293,
    "Connecting to the Camera from a Computer or Tablet": 303,
    "The Remote Recording Display": 306,
    "Recording Movies Remotely": 310,
    "Viewing Movies": 310,
    "Saving and Loading Camera Settings": 311
}


In [145]:
# Maps a subsection title to the page number on which that subsection starts;
# downstream cells compare these values against document.metadata['page'].
# NOTE(review): blank lines presumably separate parent sections — TODO confirm.
subsection_front_page22 = {
    "CREATE/EDIT CONNECTION SETTING": 313,
    "SELECT CONNECTION SETTING": 318,
    "AIRPLANE MODE": 319,
    "Bluetooth/SMARTPHONE SETTING": 320,
    "instax PRINTER CONNECTION SETTING": 323,
    # NOTE(review): this exact title is also a key in subsection_front_page26
    # (page 425); a title-keyed merge of the two dicts can keep only one page.
    "Frame.io Camera to Cloud": 323,
    "FTP OPTIONAL SETTING": 326,
    "USB POWER SUPPLY/COMM SETTING": 328,
    "INFORMATION": 329,
    "RESET NETWORK/USB SETTING": 329,

    "FORMAT": 332,
    "AREA SETTING": 333,
    "DATE/TIME": 333,
    "TIME DIFFERENCE": 334,
    "LANGUAGE": 335,
    "(photography)(movie)MY MENU SETTING": 335,
    "SENSOR CLEANING": 336,
    "BATTERY AGE": 336,
    "RESET": 337,
    "REGULATORY": 337,

    "AF BEEP VOL.": 338,
    "SELF-TIMER BEEP VOL.": 338,
    "OPERATION VOL.": 339,
    "REC START/STOP VOLUME": 339,
    "ELECTRONIC SHUTTER VOLUME": 340,
    "ELECTRONIC SHUTTER SOUND": 340,
    "PLAYBACK VOLUME": 341,
    "4ch AUDIO PLAYBACK": 341
}


In [146]:
# Maps a subsection title to the page number on which that subsection starts;
# downstream cells compare these values against document.metadata['page'].
# NOTE(review): blank lines presumably separate parent sections — TODO confirm.
subsection_front_page23 = {
    "VIEW MODE SETTING": 342,
    "EVF BRIGHTNESS": 342,
    "EVF COLOR": 342,
    "EVF COLOR ADJUSTMENT": 343,
    "LCD BRIGHTNESS": 344,
    "LCD COLOR": 344,
    "LCD COLOR ADJUSTMENT": 344,
    "IMAGE DISP": 345,
    "AUTOROTATE DISPLAYS": 345,
    "PREVIEW EXP/WB IN MANUAL MODE": 346,
    "NATURAL LIVE VIEW": 346,
    "F-Log VIEW ASSIST": 347,
    "ELECTRONIC LEVEL SETTING": 347,
    "FRAMING GUIDELINE": 347,
    "AUTOROTATE PB": 348,
    "PLAYBACK MAGNIFICATION": 348,
    "FOCUS SCALE UNITS": 348,
    "DUAL DISPLAY SETTING": 349,
    "EVF FULL SCREEN MAGNIFICATION": 349,
    "DISP. CUSTOM SETTING": 349,
    "LARGE INDICATORS MODE(EVF)": 350,
    "LARGE INDICATORS MODE(LCD)": 351,
    "LARGE INDICATORS DISP. SETTING": 352,
    "INFORMATION CONTRAST ADJ": 352,
    "LOCATION INFO": 353,
    "SUB MONITOR SETTING": 353,
    "SUB MONITOR BACKGROUND COLOR": 356,
    "(photography)(movie)Q MENU BACKGROUND": 356,

    "FOCUS LEVER SETTING": 357,
    "(photography)(movie)EDIT/SAVE QUICK MENU": 357,
    "FUNCTION (Fn) SETTING": 358,
    "COMMAND DIAL SETTING": 358
}


In [147]:
# Maps a subsection title to the page number on which that subsection starts;
# downstream cells compare these values against document.metadata['page'].
# NOTE(review): blank lines presumably separate parent sections — TODO confirm.
subsection_front_page24 = {
    "S.S. OPERATION": 359,
    "COMMAND DIAL DIRECTION": 359,
    "SHUTTER AF": 359,
    "SHUTTER AE": 360,
    "SHOOT WITHOUT LENS": 360,
    "SHOOT WITHOUT CARD": 360,
    "FOCUS RING": 361,
    "FOCUS RING OPERATION": 361,
    "AE/AF-LOCK MODE": 361,
    "AWB-LOCK MODE": 362,
    "EXPO. COMP. BUTTON SETTING": 362,
    "Fn BUTTON SETTING": 363,
    "TOUCH SCREEN SETTING": 364,
    "LOCK": 365,

    "AUTO POWER OFF": 366,
    "PERFORMANCE": 367,
    "SHOOTING STAND BY MODE": 368,
    "AUTO POWER SAVE": 368,
    "AUTO POWER OFF TEMP.": 368,

    "FRAME NO.": 369,
    "EDIT FILE NAME": 370,
    "(photography)CARD SLOT SETTING": 370,
    "SELECT SLOT ((photography)SEQUENTIAL)": 370,
    "SELECT FOLDER": 371,
    "COPYRIGHT INFO": 371,
    "DEFAULT CAPTION": 372,
    "IPTC": 372,
    "GEOTAGGING": 373
}


In [148]:
# Maps a subsection title to the page number on which that subsection starts;
# downstream cells compare these values against document.metadata['page'].
# NOTE(review): blank lines presumably separate parent sections — TODO confirm.
subsection_front_page25 = {
    "MY MENU SETTING": 377,
    "The Quick Menu Display": 379,
    "Viewing and Changing Settings": 381,
    "Editing the Quick Menu": 382,
    "The Function Buttons": 385,
    "Touch-Function Gestures": 390,

    "Lens Parts": 394,
    "Lens Care": 395,
    "Removing Lens Caps": 395,
    "Attaching Lens Hoods": 395,
    "Aperture Rings": 396,
    "T/S Lenses": 396,

    "Attaching the EVF-TL1": 398,
    "Using the EVF-TL1": 399,

    "Flash Settings": 401,
    "SYNC TERMINAL": 402,
    "SHOE MOUNT FLASH": 403,
    "COMMANDER (OPTICAL)": 406,

    "Attaching the Vertical Battery Grip": 411,
    "Inserting and Removing Batteries": 413,
    "Charging the Batteries": 415,

    "Attaching a Cooling Fan": 416,
    "Using the Fan": 418
}


In [149]:
# Maps a subsection title to the page number on which that subsection starts;
# downstream cells compare these values against document.metadata['page'].
# NOTE(review): blank lines presumably separate parent sections — TODO confirm.
subsection_front_page26 = {
    "Smartphone Apps": 423,
    "RAW FILE CONVERTER EX powered by SILKYPIX": 423,
    "Capture One Express for Fujifilm": 424,
    "Capture One for Fujifilm": 424,
    "FUJIFILM Tether Shooting Plug-in (Exclusively for Lightroom)": 424,
    "FUJIFILM X Acquire": 424,
    "FUJIFILM X RAW STUDIO": 425,
    "FUJIFILM Pixel Shift Combiner": 425,
    # NOTE(review): this exact title is also a key in subsection_front_page22
    # (page 323); a title-keyed merge of the two dicts can keep only one page.
    "Frame.io Camera to Cloud": 425,

    "Checking the Firmware Version": 439,
}


In [150]:
# Fold every per-page table into one title -> page lookup.
# The tables are applied in order, so when the same title occurs in more than
# one of them the value from the later table replaces the earlier one
# (e.g. "Frame.io Camera to Cloud" is present in both page22 and page26).
subsection_dict = {}
for front_page_dict in (
    subsection_front_page13,
    subsection_front_page14,
    subsection_front_page15,
    subsection_front_page16,
    subsection_front_page17,
    subsection_front_page18,
    subsection_front_page19,
    subsection_front_page20,
    subsection_front_page21,
    subsection_front_page22,
    subsection_front_page23,
    subsection_front_page24,
    subsection_front_page25,
    subsection_front_page26,
):
    subsection_dict.update(front_page_dict)

In [151]:
# Build the inverse lookup: page number -> list of subsection titles that
# start on that page.
#
# The per-page tables are read directly instead of the merged title-keyed
# subsection_dict, because a title that occurs in two tables (e.g.
# "Frame.io Camera to Cloud" on pages 323 and 425) can only keep one page in
# a title-keyed dict, which silently drops the other page's entry and leaves
# the surviving page key out of ascending order.
jh_temp_dict = {}
for front_page_dict in (
    subsection_front_page13,
    subsection_front_page14,
    subsection_front_page15,
    subsection_front_page16,
    subsection_front_page17,
    subsection_front_page18,
    subsection_front_page19,
    subsection_front_page20,
    subsection_front_page21,
    subsection_front_page22,
    subsection_front_page23,
    subsection_front_page24,
    subsection_front_page25,
    subsection_front_page26,
):
    for section, page_num in front_page_dict.items():
        titles = jh_temp_dict.setdefault(page_num, [])
        # Skip exact repeats so a title listed twice for the same page is
        # not duplicated in the result.
        if section not in titles:
            titles.append(section)

# Keep the keys in ascending page order: the assignment cell that consumes
# this dict lets later entries overwrite earlier ones, which is only correct
# when pages are visited from lowest to highest.
jh_temp_dict = dict(sorted(jh_temp_dict.items()))

In [152]:
# Tag every document with the subsection titles that apply to its page.
#
# For each section (a [start, end) page range read from section_dict), every
# subsection start page inside that range is applied to all documents from
# that page up to the section end. A later start page overwrites an earlier
# one, so each document ends up with the titles of the closest subsection
# start at or before its own page.
for chapter in section_dict['section']:
    for section in section_dict['section'][chapter]:
        section_bounds = section_dict['section'][chapter][section]
        section_start, section_end = section_bounds[0], section_bounds[1]
        # sorted(): the overwrite scheme above is only correct when start
        # pages are visited in ascending order, and jh_temp_dict's insertion
        # order does not guarantee that.
        for subsection_page_num in sorted(jh_temp_dict):
            if not (section_start <= subsection_page_num < section_end):
                continue
            for document in memo_erase_documents:
                if subsection_page_num <= document.metadata['page'] < section_end:
                    # Copy so documents do not share one mutable list.
                    document.metadata['subsection'] = list(jh_temp_dict[subsection_page_num])

In [153]:
# List the pages whose documents did not receive a 'subsection' entry.
for document in memo_erase_documents:
    if 'subsection' in document.metadata:
        continue
    print(document.metadata['page'])

2
3
4
5
12
40
41
42
43
44
45
46
50
51
52
53
54
55
56
57
60
61
62
63
66
68
69
70
71
73
74
92
101
102
106
108
109
110
112
117
118
119
120
121
122
123
214
215
218
257
291
292
376
397
400
410
420
421
422
426
427
428
429
430
431
432
433
434
435
436
437
438
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469


In [157]:
# Documents still lacking a 'subsection' fall back to their section name,
# wrapped in a one-element list to match the list-valued subsection entries.
for document in memo_erase_documents:
    if 'subsection' in document.metadata:
        continue
    document.metadata["subsection"] = [document.metadata["section"]]

In [158]:
# Re-run the check after the fallback: any page printed here still has no
# 'subsection' entry.
for document in memo_erase_documents:
    if 'subsection' in document.metadata:
        continue
    print(document.metadata['page'])

In [159]:
# Write each processed document to its own JSON file, named after the camera
# model and the document's page number, under
# <json_dir>/<camera_model>/LlamaParseMultimodal/processed_data.
camera_model = "gfx100ii"
json_dir = "../data/json"

output_dir = os.path.join(json_dir, camera_model, "LlamaParseMultimodal", "processed_data")
# exist_ok=True creates the directory tree when missing and is a no-op when it
# already exists, without a separate exists() check that could race.
os.makedirs(output_dir, exist_ok=True)

for doc in memo_erase_documents:
    output_path = os.path.join(output_dir, f"{camera_model}_page{doc.metadata['page']}.json")
    # NOTE(review): save_json is defined on the document type elsewhere; the
    # "<path> complete" lines in the recorded output presumably come from it — confirm.
    doc.save_json(output_path)

../data/json/gfx100ii/LlamaParseMultimodal/processed_data/gfx100ii_page2.json complete
../data/json/gfx100ii/LlamaParseMultimodal/processed_data/gfx100ii_page3.json complete
../data/json/gfx100ii/LlamaParseMultimodal/processed_data/gfx100ii_page4.json complete
../data/json/gfx100ii/LlamaParseMultimodal/processed_data/gfx100ii_page5.json complete
../data/json/gfx100ii/LlamaParseMultimodal/processed_data/gfx100ii_page6.json complete
../data/json/gfx100ii/LlamaParseMultimodal/processed_data/gfx100ii_page7.json complete
../data/json/gfx100ii/LlamaParseMultimodal/processed_data/gfx100ii_page8.json complete
../data/json/gfx100ii/LlamaParseMultimodal/processed_data/gfx100ii_page9.json complete
../data/json/gfx100ii/LlamaParseMultimodal/processed_data/gfx100ii_page10.json complete
../data/json/gfx100ii/LlamaParseMultimodal/processed_data/gfx100ii_page11.json complete
../data/json/gfx100ii/LlamaParseMultimodal/processed_data/gfx100ii_page12.json complete
../data/json/gfx100ii/LlamaParseMultimod