# PDF split

In [1]:
import os
from pikepdf import Pdf

# Split the full camera manual into one single-page PDF per page so that each
# page can be parsed (and re-parsed) independently later on.
camera_model = "x-s20"
pdf_path = f"../data/raw_data/{camera_model}/{camera_model}.pdf"

basename = os.path.basename(pdf_path)
splitname = os.path.splitext(basename)[0]

raw_dir = "../data/raw_data"
output_dir = os.path.join(raw_dir, splitname)
detail_output_dir = os.path.join(output_dir, f"{splitname}_split")

# exist_ok avoids the check-then-create race and keeps the cell re-runnable.
os.makedirs(detail_output_dir, exist_ok=True)

# Open the source inside a context manager so the file handle is released
# even if saving one of the pages fails.
with Pdf.open(pdf_path) as pdf:
    # Output files are numbered from 1 to match the page numbers a reader sees.
    for page_number, page in enumerate(pdf.pages, start=1):
        new_pdf = Pdf.new()
        new_pdf.pages.append(page)

        output_filename = os.path.join(detail_output_dir, f"{splitname}_page{page_number}.pdf")

        new_pdf.save(output_filename)
        print(f"[+] File: {output_filename} saved.")

[+] File: ../data/raw_data/x-s20/x-s20_split/x-s20_page1.pdf saved.
[+] File: ../data/raw_data/x-s20/x-s20_split/x-s20_page2.pdf saved.
[+] File: ../data/raw_data/x-s20/x-s20_split/x-s20_page3.pdf saved.
[+] File: ../data/raw_data/x-s20/x-s20_split/x-s20_page4.pdf saved.
[+] File: ../data/raw_data/x-s20/x-s20_split/x-s20_page5.pdf saved.
[+] File: ../data/raw_data/x-s20/x-s20_split/x-s20_page6.pdf saved.
[+] File: ../data/raw_data/x-s20/x-s20_split/x-s20_page7.pdf saved.
[+] File: ../data/raw_data/x-s20/x-s20_split/x-s20_page8.pdf saved.
[+] File: ../data/raw_data/x-s20/x-s20_split/x-s20_page9.pdf saved.
[+] File: ../data/raw_data/x-s20/x-s20_split/x-s20_page10.pdf saved.
[+] File: ../data/raw_data/x-s20/x-s20_split/x-s20_page11.pdf saved.
[+] File: ../data/raw_data/x-s20/x-s20_split/x-s20_page12.pdf saved.
[+] File: ../data/raw_data/x-s20/x-s20_split/x-s20_page13.pdf saved.
[+] File: ../data/raw_data/x-s20/x-s20_split/x-s20_page14.pdf saved.
[+] File: ../data/raw_data/x-s20/x-s20_spli

# LlamaParse Multimodal

In [2]:
import os
import json
from dotenv import load_dotenv
from llama_parse import LlamaParse
from glob import glob

In [3]:
# Read the API keys from the project-level .env file. override=True makes the
# values in the file win over variables already present in the environment.
load_dotenv(override=True, dotenv_path="../.env")

Python-dotenv could not parse statement starting at line 2


True

In [4]:
# Prompt passed to LlamaParse as `parsing_instruction`: asks the multimodal
# model to emit structured Markdown for a camera-manual page, to mark every
# image with a `![...](ImagePlaceholder)` reference, and shows one worked
# example of the expected output. The text is sent verbatim, so whitespace
# inside the literal is part of the prompt.
parsing_instruction='''You are a highly proficient language model with advanced multimodal parsing capabilities. Your task is to process and extract content from a PDF document that serves as a camera manual.

**Specific Requirements:**

1. **Text Extraction:**  
   Extract all textual content from the document in a structured Markdown format. Preserve the hierarchy and formatting, including headings, subheadings, bullet points, numbered lists, and paragraphs.

2. **Image Identification and Inclusion:**  
   - Identify any images present in the document.  
   - For each image, insert a Markdown reference in the appropriate location using the syntax:  
     `![Image Description](ImagePlaceholder)`

3. **General Guidelines:**  
   - Maintain logical and clean formatting for Markdown output.
   - Avoid redundant or non-informative text (e.g., "Page number" or "Header/Footer").
   - Ensure extracted text and image references align with their original positions in the document.

**Example Output Structure:**  

# SHOOTING SETTING (Still Photography)
Adjust shooting options for still photography.

To display shooting settings, press **MENU/OK** in the photo shooting display and select the **SHOOTING SETTING** tab.

![Shooting Settings Menu](ImagePlaceholder)

> 💡 **Note:**
> - The options available vary with the shooting mode selected.

## SELF-TIMER
Choose a shutter release delay.

| Option   | Description                                                                                                         |
|----------|---------------------------------------------------------------------------------------------------------------------|
| 2 SEC | The shutter is released two seconds after the shutter button is pressed. Use to reduce blur caused by the camera moving when the shutter button is pressed. The self-timer lamp blinks as the timer counts down.  |
| 10 SEC| The shutter is released ten seconds after the shutter button is pressed. Use for photographs in which you wish to appear yourself. The self-timer lamp blinks immediately before the picture is taken. |
| OFF      | Self-timer off.                                                                                                    |

If an option other than **OFF** is selected, the timer will start when the shutter button is pressed all the way down. The display shows the number of seconds remaining until the shutter is released. To stop the timer before the picture is taken, press **DISP/BACK**.

![Timer Display](ImagePlaceholder)

> 💡 **Note:**
> - Stand behind the camera when using the shutter button. Standing in front of the lens can interfere with focus and exposure.
> - The self-timer turns off automatically when the camera is turned off.'''

# LlamaParse client used for every page of the manual. Credentials come from
# the environment (populated by load_dotenv) so no key is stored in the notebook.
parser = LlamaParse(
    # Credentials.
    api_key=os.getenv("LLAMA_CLOUD_API_KEY"),
    vendor_multimodal_api_key=os.getenv("OPENAI_API_KEY"),
    # Let a vendor multimodal model (GPT-4o) read each page.
    use_vendor_multimodal_model=True,
    vendor_multimodal_model_name="openai-gpt4o",
    # Output format and the prompt that shapes it.
    result_type="markdown",
    parsing_instruction=parsing_instruction,
    # Do not reuse a cached result for a previously uploaded file.
    invalidate_cache=True,
)

In [5]:
import nest_asyncio

# Patch asyncio so an event loop can be re-entered. Jupyter already runs its
# own loop; presumably the synchronous LlamaParse calls used in this notebook
# start one internally and would fail without this — TODO confirm.
nest_asyncio.apply()

In [6]:
# Send every single-page PDF through LlamaParse and store the raw JSON result,
# one file per page, under <json_dir>/<model>/LlamaParseMultimodal/init_result.
camera_model = "x-s20"
splited_pdf_path = f"../data/raw_data/{camera_model}/{camera_model}_split/*.pdf"
# glob() makes no ordering promise; sort so repeated runs process the pages
# in the same order.
pdf_list = sorted(glob(splited_pdf_path))

json_dir = "../data/json"
detail_json_dir = os.path.join(json_dir, camera_model, "LlamaParseMultimodal", "init_result")
os.makedirs(detail_json_dir, exist_ok=True)

# Pages for which the parser returned nothing, reported once at the end so
# they can be retried without re-parsing the whole manual.
failed_pdfs = []

for page_pdf_path in pdf_list:
    json_objs = parser.get_json_result(page_pdf_path)

    # An empty result would make json_objs[0] raise IndexError and abort the
    # remaining pages; skip the page instead and keep going.
    # NOTE(review): LlamaParse appears to return [] on a failed job when
    # ignore_errors is left at its default — confirm against the client docs.
    if not json_objs:
        failed_pdfs.append(page_pdf_path)
        print(f"[!] No parsing result for {page_pdf_path}; skipped.")
        continue

    name = os.path.splitext(os.path.basename(page_pdf_path))[0]
    json_path = os.path.join(detail_json_dir, name + ".json")

    # Only the first element is stored, as before; a single-page input is
    # expected to yield exactly one result object.
    with open(json_path, "w", encoding="utf-8") as file:
        json.dump(json_objs[0], file, ensure_ascii=False, indent=4)

if failed_pdfs:
    print(f"[!] {len(failed_pdfs)} of {len(pdf_list)} page(s) produced no result.")

Started parsing the file under job_id 77514a1d-d4cd-4ffa-a531-31fa1ea459cb
Started parsing the file under job_id f103d3db-1f01-4dfa-ac1b-86f28eabf305
Started parsing the file under job_id 5db961cf-8fbd-41a2-9d0e-cd860d498023
Started parsing the file under job_id f28bb93d-9398-419a-86f3-246915358940
Started parsing the file under job_id d951b466-ce4c-452d-b411-5f0e55a7d850
Started parsing the file under job_id f8e9634d-db60-49a1-9774-500f78f9f334
Started parsing the file under job_id 30c54c50-87f8-46c5-9c0d-708d82674b8a
Started parsing the file under job_id 2acc642a-6996-4e4e-b269-f87a131445fd
Started parsing the file under job_id eab276f1-9e08-4aad-8eb6-9a52fbd49b57
Started parsing the file under job_id 5312ea5a-af09-4174-8d5b-f01d5295a3cf
Started parsing the file under job_id 38bec291-2f7a-4dc9-9fd3-51467ed8eb55
Started parsing the file under job_id 6ac1cfda-ead9-46a3-a00b-3b5e4dc04447
Started parsing the file under job_id 78cbdd38-7879-4f9d-928a-1d86715960e0
Started parsing the file 

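# Modify JSON
Convert each raw LlamaParse result into a document with two fields:
- `parsing_result`: the Markdown text of the page
- `metadata`: `{"model": <camera model>, "page": <page number>}`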

In [5]:
import os
import re
import json
from glob import glob

from CameraDocument import CameraDocument

In [6]:
def extract_number(file_path):
    """Return the integer formed by the digits right before ``.json``.

    Args:
        file_path: Path or file name such as ``"x-s20_page12.json"``.

    Returns:
        The trailing number as an ``int`` (``12`` for the example above), or
        ``None`` when the path does not end in ``<digits>.json``.
    """
    found = re.search(r'(\d+)\.json$', file_path)
    if not found:
        return None
    return int(found.group(1))

In [7]:
# Locate the raw LlamaParse results written by the parsing step.
camera_model = "x-s20"
json_dir = "../data/json"
detail_json_dir = os.path.join(json_dir, camera_model, "LlamaParseMultimodal", "init_result")
init_json_path = os.path.join(detail_json_dir, "*.json")

# Order the files by the page number in their name; a plain string sort
# would place "page10" before "page2".
json_list = glob(init_json_path)
json_list.sort(key=extract_number)

In [8]:
# Number of PDF pages treated as front matter. PDF pages 1..26 are stored
# separately under "front_page"; PDF page 27 becomes page 1 of the main
# document. Presumably this offset is specific to the X-S20 manual — adjust it
# when processing another model.
FRONT_MATTER_PAGE_COUNT = 26

output_dir = os.path.join(json_dir, camera_model, "LlamaParseMultimodal", "add_metadata")
front_output_dir = os.path.join(output_dir, "front_page")
# Creates output_dir as well, since it is the parent of front_output_dir.
os.makedirs(front_output_dir, exist_ok=True)

for json_path in json_list:
    page_num = extract_number(json_path)
    if page_num is None:
        # Without a page number the comparison below would raise TypeError.
        print(f"[!] Cannot determine the page number of {json_path}; skipped.")
        continue

    doc = CameraDocument()

    if page_num > FRONT_MATTER_PAGE_COUNT:
        # Main content: renumber so the first content page is page 1.
        real_page_num = page_num - FRONT_MATTER_PAGE_COUNT
        doc.metadata['page'] = real_page_num
        basename = camera_model + f"_page{real_page_num}.json"
        output_path = os.path.join(output_dir, basename)
    else:
        # Front matter keeps its PDF page number and goes to its own folder.
        doc.metadata['page'] = page_num
        basename = camera_model + f"_front_page{page_num}.json"
        output_path = os.path.join(front_output_dir, basename)

    doc.metadata['model'] = camera_model

    with open(json_path, "r", encoding="utf-8") as file:
        json_data = json.load(file)

    # Each input holds a single parsed page; keep only its Markdown text.
    # The input handle is already closed by the time the document is written.
    doc.parsing_result = json_data["pages"][0]["md"]
    doc.save_json(output_path)


../data/json/x-s20/LlamaParseMultimodal/add_metadata/front_page/x-s20_front_page1.json complete
../data/json/x-s20/LlamaParseMultimodal/add_metadata/front_page/x-s20_front_page2.json complete
../data/json/x-s20/LlamaParseMultimodal/add_metadata/front_page/x-s20_front_page3.json complete
../data/json/x-s20/LlamaParseMultimodal/add_metadata/front_page/x-s20_front_page4.json complete
../data/json/x-s20/LlamaParseMultimodal/add_metadata/front_page/x-s20_front_page5.json complete
../data/json/x-s20/LlamaParseMultimodal/add_metadata/front_page/x-s20_front_page6.json complete
../data/json/x-s20/LlamaParseMultimodal/add_metadata/front_page/x-s20_front_page7.json complete
../data/json/x-s20/LlamaParseMultimodal/add_metadata/front_page/x-s20_front_page8.json complete
../data/json/x-s20/LlamaParseMultimodal/add_metadata/front_page/x-s20_front_page9.json complete
../data/json/x-s20/LlamaParseMultimodal/add_metadata/front_page/x-s20_front_page10.json complete
../data/json/x-s20/LlamaParseMultimodal

# Index pages to .png

In [9]:
import os
import fitz

In [10]:
def extract_pages_as_images(pdf_path, output_folder, start_page, end_page, dpi=600, name_prefix=None):
    """Render a range of PDF pages to PNG files.

    Each page is saved as ``<output_folder>/<name_prefix>_page<N>.png`` where
    ``N`` is the 1-based page number.

    Args:
        pdf_path: Path of the PDF to render.
        output_folder: Directory that receives the PNG files; created if it
            does not exist.
        start_page: First page to render (1-based, inclusive).
        end_page: Last page to render (1-based, inclusive).
        dpi: Rendering resolution passed to ``Page.get_pixmap``.
        name_prefix: Prefix of the output file names. When omitted, the
            notebook-level ``camera_model`` variable is used, which is what
            this function did before the parameter existed.

    Pages outside the document are reported and skipped rather than raising.
    """
    if name_prefix is None:
        # Backward compatibility: fall back to the notebook global.
        name_prefix = camera_model

    os.makedirs(output_folder, exist_ok=True)

    # The context manager closes the document even if rendering or saving a
    # page raises.
    with fitz.open(pdf_path) as pdf_document:
        page_count = len(pdf_document)

        # range() works on 0-based indices; messages and file names are 1-based.
        for page_number in range(start_page - 1, end_page):
            if page_number < 0 or page_number >= page_count:
                print(f"Page {page_number + 1} is out of range. Skipping.")
                continue

            page = pdf_document[page_number]
            pix = page.get_pixmap(dpi=dpi)

            output_path = f"{output_folder}/{name_prefix}_page{page_number + 1}.png"
            pix.save(output_path)
            print(f"Saved page {page_number + 1} as {output_path}")

In [11]:
# Input manual and destination folder for the rendered index pages.
camera_model = 'x-s20'
pdf_path = f"../data/raw_data/{camera_model}/{camera_model}.pdf"

image_dir = '../data/image'
output_dir = os.path.join(image_dir, camera_model, "index_images")

# Create the destination on first run only.
if os.path.exists(output_dir) is False:
    os.makedirs(output_dir)

In [12]:
# Render PDF pages 12-24 (presumably the manual's index/table-of-contents
# pages, going by this section's title — verify against the PDF).
extract_pages_as_images(
    pdf_path,
    output_dir,
    start_page=12,
    end_page=24,
    dpi=600,
)

Saved page 12 as ../data/image/x-s20/index_images/x-s20_page12.png
Saved page 13 as ../data/image/x-s20/index_images/x-s20_page13.png
Saved page 14 as ../data/image/x-s20/index_images/x-s20_page14.png
Saved page 15 as ../data/image/x-s20/index_images/x-s20_page15.png
Saved page 16 as ../data/image/x-s20/index_images/x-s20_page16.png
Saved page 17 as ../data/image/x-s20/index_images/x-s20_page17.png
Saved page 18 as ../data/image/x-s20/index_images/x-s20_page18.png
Saved page 19 as ../data/image/x-s20/index_images/x-s20_page19.png
Saved page 20 as ../data/image/x-s20/index_images/x-s20_page20.png
Saved page 21 as ../data/image/x-s20/index_images/x-s20_page21.png
Saved page 22 as ../data/image/x-s20/index_images/x-s20_page22.png
Saved page 23 as ../data/image/x-s20/index_images/x-s20_page23.png
Saved page 24 as ../data/image/x-s20/index_images/x-s20_page24.png


# Delete
- chapter cover page
- memo page

In [68]:
import os
import re
from CameraDocument import CameraDocument
from glob import glob

# Load the front-matter documents written by the metadata step so they can be
# inspected page by page.
camera_model = 'x-s20'
# Use the same data root as the cells that write these files. The previous
# value "./data/json" resolves to a different directory from the notebook's
# working directory, so the glob below silently matched nothing there.
json_dir = "../data/json"
detail_json_dir = os.path.join(json_dir, camera_model, "LlamaParseMultimodal", "add_metadata", "front_page")
json_path = os.path.join(detail_json_dir, "*.json")

def extract_number(file_path):
    """Return the integer right before ``.json`` in *file_path*, or ``None``."""
    match = re.search(r'(\d+)\.json$', file_path)
    return int(match.group(1)) if match else None

# Numeric sort: a plain string sort would place page10 before page2.
json_list = sorted(glob(json_path), key=extract_number)
# Fail loudly on a wrong path instead of continuing with an empty list.
assert json_list, f"No JSON files matched {json_path}"

# Every matched path lies inside the "front_page" folder, so no additional
# filtering on the path is needed.
documents = []
for path in json_list:
    document = CameraDocument()
    document.load_json(path)
    documents.append(document)

documents[:5]

[CameraDocument(metadata={'page': 1, 'model': 'x-s20'}, parsing_result='# FUJIFILM\n\n## DIGITAL CAMERA X-S20\n\n**FF220002**  \nOwner’s Manual\n\n!Fujifilm X-S20 Cover', embedding_result=[]),
 CameraDocument(metadata={'page': 2, 'model': 'x-s20'}, parsing_result='# Introduction\n\nThank you for your purchase of this product. Be sure that you have read this manual and understood its contents before using the camera. Keep the manual where it will be read by all who use the product.\n\n## For the Latest Information\n\nThe latest versions of the manuals are available from:\n\nhttps://fujifilm-dsc.com/en/manual/\n\n!QR Code for Manual\n\nThe website can be accessed not only from your computer but also from smartphones and tablets. It also contains information on the software license.\n\nFor information on firmware updates, visit:  \nhttps://fujifilm-x.com/support/download/firmware/cameras/\n\n!Devices', embedding_result=[]),
 CameraDocument(metadata={'page': 3, 'model': 'x-s20'}, parsing_r

In [69]:
# Print the parsed Markdown of every loaded document whose page number is 15.
page_15_results = [doc.parsing_result for doc in documents if doc.metadata['page'] == 15]
for parsing_result in page_15_results:
    print(parsing_result)

# The Shooting Menus

## IMAGE QUALITY SETTING (Still Photography)

- **IMAGE SIZE** .......................................................... 128
- **IMAGE QUALITY** .................................................. 129
- **RAW RECORDING** ............................................... 130
- **SELECT JPEG/HEIF** ............................................. 131
- **FILM SIMULATION** .............................................. 132
- **MONOCHROMATIC COLOR** ............................ 134
- **GRAIN EFFECT** .................................................... 134
- **COLOR CHROME EFFECT** ............................... 135
- **COLOR CHROME FX BLUE** ............................... 135
- **WHITE BALANCE** ................................................ 136
- **DYNAMIC RANGE** ............................................... 140
- **D RANGE PRIORITY** ........................................... 140
- **TONE CURVE** ....................................................... 141
- **CO

In [70]:
import os
import re
from CameraDocument import CameraDocument
from glob import glob

# Load the main-content documents written by the metadata step. The glob is
# not recursive, so the files in the "front_page" sub-folder are not included.
camera_model = 'x-s20'
# Use the same data root as the cells that write these files. The previous
# value "./data/json" resolves to a different directory from the notebook's
# working directory, so the glob below silently matched nothing there.
json_dir = "../data/json"
detail_json_dir = os.path.join(json_dir, camera_model, "LlamaParseMultimodal", "add_metadata")
json_path = os.path.join(detail_json_dir, "*.json")

def extract_number(file_path):
    """Return the integer right before ``.json`` in *file_path*, or ``None``."""
    match = re.search(r'(\d+)\.json$', file_path)
    return int(match.group(1)) if match else None

# Numeric sort: a plain string sort would place page10 before page2.
json_list = sorted(glob(json_path), key=extract_number)
# Fail loudly on a wrong path instead of continuing with an empty list.
assert json_list, f"No JSON files matched {json_path}"

documents = []
for path in json_list:
    document = CameraDocument()
    document.load_json(path)
    documents.append(document)

documents[:5]

[CameraDocument(metadata={'page': 1, 'model': 'x-s20'}, parsing_result='!Before You Begin', embedding_result=[]),
 CameraDocument(metadata={'page': 2, 'model': 'x-s20'}, parsing_result='# Parts of the Camera\n\n!Camera Diagram\n\n1. **○ (movie recording) button** - Page 56\n2. **A (quick menu) button** - Page 311\n3. **ISO button** - Page 317\n4. **Rear command dial** - Pages 6, 223\n5. **Mode dial** - Pages 4, 48, 56\n6. **Hot shoe** - Page 338\n7. **Microphone** - Pages 61, 214\n8. **Fn (function) dial** - Pages 7, 320\n9. **N (flash pop-up) lever** - Page 123\n10. **Microphone/remote release connector cover**\n11. **Strap clip** - Page 32\n12. **Connector cover**\n13. **Speaker** - Pages 63, 277\n14. **Lens signal contacts**\n15. **Lens release button** - Page 33\n16. **AF-assist illuminator** - Page 155\n    - Self-timer lamp - Page 169\n    - Tally light - Page 200\n17. **Front command dial** - Page 6\n18. **ON/OFF switch** - Page 41\n19. **Shutter button** - Page 49\n20. **Flash*

In [71]:
x_s20_contents = {
    "Before You Begin": {
        "page_range": [1, 31],
        "sections": {
            "Parts of the Camera": {
                "page_range": [2, 10],
                "subsections": {
                    "The Mode Dial": [4, 5],
                    "The Focus Stick (Focus Lever)": [5, 6],
                    "The (Drive) Button": [5, 6],
                    "The Command Dials": [6, 7],
                    "The Fn (Function) Dial": [7, 8],
                    "The Serial Number Plate": [7, 8],
                    "The Indicator Lamp": [8, 9],
                    "The LCD Monitor": [9, 10],
                    "Focusing the Viewfinder": [9, 10],
                }
            },
            "Camera Displays": {
                "page_range": [10, 55],
                "subsections": {
                    "The Electronic Viewfinder": [10, 12],
                    "The LCD Monitor": [12, 14],
                    "Choosing a Display Mode": [14, 16],
                    "Adjusting Display Brightness": [16, 17],
                    "Display Rotation": [16, 17],
                    "The DISP/BACK Button": [17, 19],
                    "Customizing the Standard Display": [19, 21],
                    "Virtual Horizon": [21, 22],
                }
            },
            "Using the Menus": {
                "page_range": [22, 25],
                "subsections": {
                    "The Menus": [22, 24],
                    "Selecting a Menu Tab": [24, 25],
                }
            },
            "Touch Screen Mode": {
                "page_range": [25, 31],
                "subsections": {
                    "Shooting Touch Controls": [25, 30],
                    "Playback Touch Controls": [30, 31],
                }
            }
        }
    },
    "First Steps": {
        "page_range": [31, 47],
        "sections": {
            "Attaching the Strap": {"page_range": [32, 33]},
            "Attaching a Lens": {"page_range": [33, 34]},
            "Inserting the Battery and a Memory Card": {
                "page_range": [34, 37],
                "subsections": {
                    "Compatible Memory Cards": [36, 37]
                },
            },
            "Charging the Battery": {"page_range": [37, 41]},
            "Turning the Camera On and Off": {"page_range": [41, 42]},
            "Checking the Battery Level": {"page_range": [42, 43]},
            "Basic Setup": {
                "page_range": [43, 47],
                "subsections": {
                    "Choosing a Different Language": [45, 47],
                    "Changing the Time and Date": [45, 47],
                },
            },
        }
    },
    "Basic Photography and Playback": {
        "page_range": [47, 55],
        "sections": {
            "Taking Photographs": {"page_range": [48, 50]},
            "Viewing Pictures": {
                "page_range": [50, 53],
                "subsections": {
                    "HDMI Output": [51, 53]
                },
            },
            "Deleting Pictures": {"page_range": [53, 55]},
        }
    },
    "Movie Recording and Playback": {
        "page_range": [55, 67],
        "sections": {
            "Recording Movies": {
                "page_range": [56, 63],
                "subsections": {
                    "Recording Movies": [56, 62],
                    "Adjusting Movie Settings": [62, 63]
                }
            },
            "Viewing Movies": {
                "page_range": [63, 67],
                "subsections": {
                    "Viewing Movies During Playback": [63, 64],
                    "Viewing Movies During Vlog Recording": [64, 67]
                }
            }
        }
    },
    "Taking Photographs": {
        "page_range": [67, 127],
        "sections": {
            "Choosing a Shooting Mode": {
                "page_range": [68, 91],
                "subsections": {
                    "AUTO (AUTO)": [68, 71],
                    "Program AE (P)": [71, 73],
                    "Shutter-Priority AE (S)": [73, 75],
                    "Aperture-Priority AE (A)": [75, 77],
                    "Manual Exposure (M)": [77, 81],
                    "FILTER": [81, 82],
                    "Custom Modes": [82, 90],
                    "The Command and Function Dials": [90, 91]
                }
            },
            "Autofocus": {
                "page_range": [91, 100],
                "subsections": {
                    "Focus Mode": [92, 94],
                    "Autofocus Options (AF Mode)": [94, 96],
                    "Focus-Point Selection": [96, 100]
                }
            },
            "Manual Focus": {
                "page_range": [100, 104],
                "subsections": {
                    "Checking Focus": [101, 104]
                }
            },
            "Sensitivity": {
                "page_range": [104, 106],
                "subsections": {
                    "AUTO": [105, 106]
                }
            },
            "Metering": {
                "page_range": [106, 107]
            },
            "Exposure Compensation": {
                "page_range": [107, 108]
            },
            "Focus/Exposure Lock": {
                "page_range": [108, 110],
                "subsections": {
                    "Other Controls": [109, 110]
                }
            },
            "Bracketing": {
                "page_range": [110, 115],
                "subsections": {
                    "ISO BKT": [111, 112],
                    "WHITE BALANCE BKT": [111, 112],
                    "BKT Bracketing": [112, 115]
                }
            },
            "Continuous Shooting (Burst Mode)": {
                "page_range": [115, 116]
            },
            "HDR": {
                "page_range": [116, 118]
            },
            "Panoramas": {
                "page_range": [118, 121]
            },
            "Multiple Exposures": {
                "page_range": [121, 123]
            },
            "Flash Photography": {
                "page_range": [123, 127],
                "subsections": {
                    "Flash Settings": [125, 127]
                }
            }
        }
    },
    "The Shooting Menus": {
        "page_range": [127, 219],
        "sections": {
            "IMAGE QUALITY SETTING (Still Photography)": {
                "page_range": [128, 148],
                "subsections": {
                    "IMAGE SIZE": [128, 129],
                    "IMAGE QUALITY": [129, 130],
                    "RAW RECORDING": [130, 131],
                    "SELECT JPEG/HEIF": [131, 132],
                    "FILM SIMULATION": [132, 134],
                    "MONOCHROMATIC COLOR": [134, 135],
                    "GRAIN EFFECT": [134, 135],
                    "COLOR CHROME EFFECT": [135, 136],
                    "COLOR CHROME FX BLUE": [135, 136],
                    "WHITE BALANCE": [136, 140],
                    "DYNAMIC RANGE": [140, 141],
                    "D RANGE PRIORITY": [140, 141],
                    "TONE CURVE": [141, 142],
                    "COLOR": [141, 142],
                    "SHARPNESS": [141, 142],
                    "HIGH ISO NR": [142, 143],
                    "CLARITY": [142, 143],
                    "LONG EXPOSURE NR": [142, 143],
                    "LENS MODULATION OPTIMIZER": [143, 144],
                    "COLOR SPACE": [143, 144],
                    "PIXEL MAPPING": [143, 144],
                    "EDIT/SAVE CUSTOM SETTING": [144, 145],
                    "AUTO UPDATE CUSTOM SETTING": [144, 145],
                    "CUSTOM MODE SETTING": [144, 145],
                    "MOUNT ADAPTOR SETTING": [145, 148]
                }
            },
            "AF/MF SETTING (Still Photography)": {
                "page_range": [148, 167],
                "subsections": {
                    "FOCUS AREA": [148, 149],
                    "FOCUS MODE": [148, 149],
                    "AF MODE": [148, 149],
                    "AF-C CUSTOM SETTINGS": [149, 153],
                    "STORE AF MODE BY ORIENTATION": [153, 154],
                    "AF POINT DISPLAY": [153, 154],
                    "WRAP FOCUS POINT": [153, 154],
                    "NUMBER OF FOCUS POINTS": [154, 155],
                    "PRE-AF": [154, 155],
                    "AF ILLUMINATOR": [155, 156],
                    "FACE/EYE DETECTION SETTING": [156, 158],
                    "SUBJECT DETECTION SETTING": [158, 160],
                    "AF+MF": [160, 161],
                    "MF ASSIST": [161, 162],
                    "FOCUS CHECK": [161, 162],
                    "INTERLOCK SPOT AE & FOCUS AREA": [162, 163],
                    "INSTANT AF SETTING": [162, 163],
                    "DEPTH-OF-FIELD SCALE": [162, 163],
                    "RELEASE/FOCUS PRIORITY": [163, 164],
                    "AF RANGE LIMITER": [164, 165],
                    "TOUCH SCREEN MODE": [165, 167]
                }
            }, 
            "SHOOTING SETTING (Still Photography)": {
                "page_range": [167, 178],
                "subsections": {
                    "AUTO MODE SETTING": [167, 168],
                    "FILTER SETTING": [167, 168],
                    "SPORTS FINDER MODE": [168, 169],
                    "PRE-SHOT ES": [169, 170],
                    "SELF-TIMER": [169, 170],
                    "SAVE SELF-TIMER SETTING": [170, 171],
                    "SELF-TIMER LAMP": [170, 171],
                    "AE BKT SETTING": [170, 171],
                    "FILM SIMULATION BKT": [171, 172],
                    "FOCUS BKT SETTING": [171, 172],
                    "PHOTOMETRY": [171, 172],
                    "SHUTTER TYPE": [172, 173],
                    "INTERVAL TIMER SHOOTING": [173, 174],
                    "INTERVAL TIMER SHOOTING EXPOSURE SMOOTHING": [174, 175],
                    "INTERVAL PRIORITY MODE": [175, 176],
                    "FLICKER REDUCTION": [175, 176],
                    "FLICKERLESS S.S. SETTING": [176, 177],
                    "IS MODE": [176, 177],
                    "ISO": [177, 178],
                    "COOLING FAN SETTING": [177, 178],
                    "WIRELESS COMMUNICATION": [177, 178]
                }
            },
            "FLASH SETTING (Still Photography)": {
                "page_range": [178, 181],
                "subsections": {
                    "FLASH FUNCTION SETTING": [178, 179],
                    "RED EYE REMOVAL": [178, 179],
                    "TTL-LOCK MODE": [179, 180],
                    "LED LIGHT SETTING": [179, 180],
                    "COMMANDER SETTING": [180, 181],
                    "CH SETTING": [180, 181]
                }
            },
            "MOVIE SETTING (Still Photography)": {
                "page_range": [181, 186],
                "subsections": {
                    "MOVIE MODE": [181, 182],
                    "HIGH SPEED REC": [181, 182],
                    "MEDIA REC SETTING": [181, 182],
                    "IS MODE": [182, 185],
                    "IS MODE BOOST": [182, 185],
                    "AUDIO SETTING": [182, 185],
                    "MIC/REMOTE RELEASE": [185, 186],
                    "REC FRAME INDICATOR": [185, 186]
                }
            },
            "MOVIE SETTING (Movie Recording)": {
                "page_range": [186, 202],
                "subsections": {
                    "MOVIE SETTING LIST": [186, 187],
                    "SHOOTING MODE": [186, 187],
                    "MOVIE MODE": [187, 188],
                    "BACKGROUND DEFOCUS MODE": [188, 189],
                    "HIGH SPEED REC": [189, 190],
                    "SELF-TIMER": [190, 193],
                    "MEDIA REC SETTING": [190, 193],
                    "HDMI OUTPUT SETTING": [193, 194],

                    "FIX MOVIE CROP MAGNIFICATION": [194, 195],
                    "F-Log/HLG RECORDING": [195, 196],
                    "DATA LEVEL SETTING": [196, 197],
                    "PHOTOMETRY": [196, 197],
                    "FLICKERLESS S.S. SETTING": [196, 197],

                    "IS MODE": [197, 198],
                    "IS MODE BOOST": [197, 198],
                    "ISO": [197, 198],

                    "ZEBRA SETTING": [198, 199],
                    "ZEBRA LEVEL": [198, 199],

                    "MOVIE OPTIMIZED CONTROL": [199, 200],
                    "REC FRAME INDICATOR": [199, 200],

                    "TALLY LIGHT": [200, 201],

                    "COOLING FAN SETTING": [201, 202],
                    "EDIT/SAVE CUSTOM SETTING": [201, 202],
                    "AUTO UPDATE CUSTOM SETTING": [201, 202],
                    "CUSTOM MODE SETTING": [201, 202],
                    "WIRELESS COMMUNICATION": [201, 202]
                }
            },
            "IMAGE QUALITY SETTING (Movie Recording)": {
                "page_range": [202, 206],
                "subsections": {
                    "FILM SIMULATION": [202, 203],
                    "MONOCHROMATIC COLOR": [202, 203],
                    "WHITE BALANCE": [203, 204],
                    "DYNAMIC RANGE": [203, 204],
                    "TONE CURVE": [203, 204],
                    "COLOR": [203, 204],
                    "SHARPNESS": [204, 205],
                    "HIGH ISO NR": [204, 205],
                    "INTERFRAME NR": [204, 205],
                    "PERIPHERAL LIGHT CORRECTION": [205, 206],
                    "MOUNT ADAPTER SETTING": [205, 206]
                }
            },
            "AF/MF SETTING (Movie Recording)": {
                "page_range": [206, 214],
                "subsections": {
                    "FOCUS AREA": [206, 207],
                    "FOCUS MODE": [206, 207],
                    "AF MODE": [206, 207],

                    "AF-C CUSTOM SETTINGS": [207, 208],
                    "WRAP FOCUS POINT": [207, 208],

                    "AF ILLUMINATOR": [208, 209],
                    "PRODUCT PRIORITY MODE": [208, 209],
                    "FACE/EYE DETECTION SETTING": [208, 209],

                    "SUBJECT DETECTION SETTING": [209, 210],
                    "AF + MF": [209, 210],
                    "MF ASSIST": [209, 210],

                    "FOCUS CHECK": [210, 211],
                    "INSTANT AF SETTING": [210, 211],
                    "DEPTH-OF-FIELD SCALE": [210, 211],
                    "AF RANGE LIMITER": [210, 211],

                    "TOUCH SCREEN MODE": [211, 213],

                    "FOCUS CHECK LOCK": [213, 214]
                }
            },
            "AUDIO SETTING (Movie Recording)": {
                "page_range": [214, 217],
                "subsections": {
                    "INTERNAL MIC LEVEL ADJUSTMENT": [214, 215],
                    "EXTERNAL MIC LEVEL ADJUSTMENT": [214, 215],
                    "MIC JACK SETTING": [214, 215],

                    "MIC LEVEL LIMITER": [215, 216],
                    "WIND FILTER": [215, 216],
                    "LOW CUT FILTER": [215, 216],
                    "HEADPHONES VOLUME": [215, 216],
                    "MIC/REMOTE RELEASE": [215, 216],

                    "XLR MIC ADAPTER SETTING": [216, 217]
                }
            },
            "TIME CODE SETTING (Movie Recording)": {
                "page_range": [217, 219],
                "subsections": {
                    "TIME CODE DISPLAY": [217, 218],
                    "START TIME SETTING": [217, 218],
                    "COUNT UP SETTING": [217, 218],

                    "DROP FRAME": [218, 219],
                    "HDMI TIME CODE OUTPUT": [218, 219]
                }
            },
        }
    },
  "Playback and the Playback Menu": {
    "page_range": [219, 245],
    "sections": {
      "The Playback Display": {
        "page_range": [220, 223],
        "subsections": {
          "The DISP/BACK Button": [221, 223],
        }
      },
      "Viewing Pictures": {
        "page_range": [223, 225],
        "subsections": {
          "Playback Zoom": [224, 225],
          "Multi-Frame Playback": [224, 225],
        }
      },
      "The Playback Menu": {
        "page_range": [225, 245],
        "subsections": {
            "RAW CONVERSION": [225, 228],
            "HEIF TO JPEG/TIFF CONVERSION": [228, 229],
            "ERASE": [229, 231],
            "CROP": [231, 232],
            "RESIZE": [232, 233],
            "PROTECT": [233, 234],
            "IMAGE ROTATE": [234, 235],
            "VOICE MEMO SETTING": [235, 236],
            "RATING": [236, 237],
            "TRANSFER IMAGE TO SMARTPHONE": [237, 239],
            "WIRELESS COMMUNICATION": [239, 240],
            "SLIDE SHOW": [239, 240],
            "PHOTOBOOK ASSIST": [240, 242],
            "PRINT ORDER (DPOF)": [242, 243],
            "instax PRINTER PRINT": [243, 244],
            "DISP ASPECT": [244, 245]
        }
      }
    }
  },
  "Network/USB Features and Settings": {
    "page_range": [245, 269],
    "sections": {
      "Overview": {
        "page_range": [246, 247],
        "subsections": {
            "Supported Features": [246, 247]
        }
      },
      "Connecting to Smartphones (Bluetooth)": {
        "page_range": [247, 250],
        "subsections": {
            "Installing Smartphone Apps": [247, 249],
            "Connecting to a Smartphone": [247, 249],
            "Using the Smartphone App": [249, 250]
        }
      },
      "Connecting to Smartphones (USB)": {
        "page_range": [250, 255],
        "subsections": {
            "Copying Pictures to a Smartphone": [250, 253],
            "Connecting the Camera and Computer": [253, 255]
        }
      },
      "Using the Camera as a Webcam": {
        "page_range": [255, 256],
        "subsections": {}
      },
      "instax SHARE Printers": {
        "page_range": [256, 258],
        "subsections": {
            "Establishing a Connection": [256, 257],
            "Printing Pictures": [257, 258]
        }
      },
      "Tethered Photography": {
        "page_range": [258, 260],
        "subsections": {
            "Tethered Photography via USB": [258, 259],
            "Tethered Photography via Wireless LAN": [259, 260]
        }
      },
      "RAW Processing": {
        "page_range": [260, 261],
        "subsections": {}
      },
      "Saving and Loading Settings": {
        "page_range": [261, 262],
        "subsections": {
            "Saving and Loading Settings Using a Computer": [261, 262]
        }
      },
      "Network/USB Setting Menus": {
        "page_range": [262, 269],
        "subsections": {
          "Bluetooth/SMARTPHONE SETTING": [262, 265],
          "AIRPLANE MODE": [265, 266],
          "NETWORK SETTING": [265, 266],
          "instax PRINTER CONNECTION SETTING": [265, 266],
          "CONNECTION MODE": [266, 268],
          "USB POWER SUPPLY/COMM SETTING": [268, 269],
          "INFORMATION": [268, 269],
          "RESET WIRELESS SETTING": [268, 269]
        }
      },
    }
  },
  "The Setup Menus": {
    "page_range": [269, 307],
    "sections": {
      "USER SETTING": {
        "page_range": [270, 275],
        "subsections": {
          "FORMAT": [270, 271],
          "AREA SETTING": [271, 272],
          "DATE/TIME": [271, 272],
          "TIME DIFFERENCE": [272, 273],
          "LANG.": [272, 273],
          "MY MENU SETTING": [273, 274],
          "SENSOR CLEANING": [273, 274],
          "BATTERY AGE": [273, 274],
          "SOUND & FLASH": [274, 275],
          "RESET": [274, 275],
          "REGULATORY": [274, 275]
        }
      },
      "SOUND SETTING": {
        "page_range": [275, 278],
        "subsections": {
          "AF BEEP VOL.": [275, 276],
          "SELF-TIMER BEEP VOL.": [275, 276],
          "OPERATION VOL.": [275, 276],

          "REC START/STOP VOLUME": [276, 277],
          "MS ELECTRONIC SHUTTER VOLUME": [276, 277],
          "MS ELECTRONIC SHUTTER SOUND": [276, 277],

          "ES ELECTRONIC SHUTTER VOLUME": [277, 278],
          "ES ELECTRONIC SHUTTER SOUND": [277, 278],
          "PLAYBACK VOLUME": [277, 278],
          "4ch AUDIO PLAYBACK": [277, 278]
        }
      },
      "SCREEN SETTING": {
        "page_range": [278, 289],
        "subsections": {
          "VIEW MODE SETTING": [278, 279],
          "EVF BRIGHTNESS": [278, 279],
          "EVF COLOR": [278, 279],

          "EVF COLOR ADJUSTMENT": [279, 280],
          "LCD BRIGHTNESS": [279, 280],
          "LCD COLOR": [279, 280],
          "LCD COLOR ADJUSTMENT": [279, 280],

          "IMAGE DISP.": [280, 281],
          "AUTOROTATE DISPLAYS": [280, 281],
          
          "PREVIEW EXP./WB IN MANUAL MODE": [281, 282],
          "NATURAL LIVE VIEW": [281, 282],

          "F-Log VIEW ASSIST": [281, 283],
          "ELECTRONIC LEVEL SETTING": [282, 283],
          "FRAMING GUIDELINE": [282, 283],

          "AUTOROTATE PB": [283, 284],
          "FOCUS SCALE UNITS": [283, 284],
          "APERTURE UNIT FOR CINEMA LENS": [283, 284],
          "DISP. CUSTOM SETTING": [283, 284],

          "LARGE INDICATORS MODE(EVF)": [284, 285],

          "LARGE INDICATORS MODE(LCD)": [285, 286],

          "LARGE INDICATORS DISP. SETTING": [286, 287],

          "INFORMATION CONTRAST ADJ.": [287, 288],
          "LOCATION INFO.": [287, 288],

          "Q MENU BACKGROUND": [288, 289]
        }
      },
      "BUTTON/DIAL SETTING": {
        "page_range": [289, 301],
        "subsections": {
          "FOCUS LEVER SETTING": [289, 290],

          "EDIT/SAVE QUICK MENU": [290, 291],
          "FUNCTION (Fn) SETTING": [290, 291],
          "POWER ZOOM LENS FUNCTION (Fn) SETTING": [290, 291],

          "COMMAND DIAL SETTING": [291, 292],
          "S.S OPERATION": [291, 292],
          "COMMAND DIAL DIRECTION": [291, 292],

          "SHUTTER AF": [292, 293],
          "SHUTTER AE": [292, 293],

          "SHOOT WITHOUT LENS": [293, 294],
          "SHOOT WITHOUT CARD": [293, 294],

          "LENS ZOOM/FOCUS SETTING": [294, 296],

          "AE/AF-LOCK MODE": [296, 297],
          "AWB-LOCK MODE": [296, 297],

          "ISO BUTTON SETTING": [297, 298],
          "TOUCH SCREEN SETTING": [298, 300],
          "LOCK": [300, 301]
        }
      },
      "POWER MANAGEMENT": {
        "page_range": [301, 303],
        "subsections": {
          "AUTO POWER OFF": [301, 302],
          "PERFORMANCE": [301, 302],
          "EVF/LCD BOOST SETTING": [302, 303],
          "AUTO POWER OFF TEMP.": [302, 303]
        }
      },
      "SAVE DATA SETTING": {
        "page_range": [303, 307],
        "subsections": {
          "FRAME NO.": [303, 304],

          "EDIT FILE NAME": [304, 305],
          "SELECT FOLDER": [304, 305],

          "COPYRIGHT INFO.": [305, 307],
          "GEOTAGGING": [305, 307]
        }
      }
    }
  },
  "Shortcuts": {
    "page_range": [307, 329],
    "sections": {
      "Shortcut Options": {
        "page_range": [308, 309],
        "subsections": {}
      },
      "MY MENU": {
        "page_range": [309, 311],
        "subsections": {
          "MY MENU SETTING": [309, 311]
        }
      },
      "The Quick Menu": {
        "page_range": [311, 317],
        "subsections": {
          "The Quick Menu Display": [311, 313],
          "Viewing and Changing Settings": [313, 314],
          "Editing the Quick Menu": [314, 317]
        }
      },
      "Function Controls": {
        "page_range": [317, 329],
        "subsections": {
          "The Function Buttons": [317, 320],
          "The Function Dial": [320, 322],
          "Touch-Function Gestures": [322, 325],
          "Lens Function Buttons": [325, 329]
        }
      }
    }
  },
  "Peripherals and Optional Accessories": {
    "page_range": [329, 349],
    "sections": {
      "Lenses": {
        "page_range": [330, 337],
        "subsections": {
          "Lens Parts": [330, 331],

          "Lens Care": [331, 331],
          "Removing Lens Caps": [331, 332],

          "Attaching Lens Hoods": [332, 333],

          "Lenses with Aperture Rings": [333, 334],

          "Lenses with No Aperture Rings": [334, 334],
          "Lenses with O.I.S. Switches": [334, 335],

          "Manual Focus Lenses": [335, 336],
          "Power Zoom Lenses": [336, 337]
        }
      },
      "External Flash Units": {
        "page_range": [337, 347],
        "subsections": {
          "Using an External Flash": [338, 339],
          "SYNC TERMINAL": [339, 340],
          "SHOE MOUNT FLASH": [340, 343],
          "COMMANDER(OPTICAL)": [343, 347]
        }
      },
      "Cooling Fans": {
        "page_range": [347, 349],
        "subsections": {
          "Attaching a Cooling Fan": [347, 348],
          "Using the Fan": [348, 349]
        }
      }
    }
  },
  "Technical Notes": {
    "page_range": [349, 395],
    "sections": {
      "Accessories from Fujifilm": {
        "page_range": [350, 352],
        "subsections": {}
      },
      "Software for Use with Your Camera": {
        "page_range": [352, 355],
        "subsections": {
          "Smartphone Apps": [352, 353],
          "RAW FILE CONVERTER EX powered by SILKYPIX": [352, 353],

          "Capture One Express for Fujifilm": [353, 354],
          "Capture One for Fujifilm": [353, 354],
          "FUJIFILM Tether Shooting Plug-in (Exclusively for Lightroom)": [353, 354],

          "FUJIFILM X Acquire": [354, 355],
          "FUJIFILM X RAW STUDIO": [354, 355]
        }
      },
      "For Your Safety": {
        "page_range": [355, 366],
        "subsections": {}
      },
      "Product Care": {
        "page_range": [366, 367],
        "subsections": {}
      },
      "Cleaning the Image Sensor": {
        "page_range": [367, 368],
        "subsections": {}
      },
      "Firmware Updates": {
        "page_range": [368, 369],
        "subsections": {
          "Checking the Firmware Version": [368, 369]
        }
      },
      "Troubleshooting": {
        "page_range": [369, 382],
        "subsections": {}
      },
      "Warning Messages and Displays": {
        "page_range": [382, 387],
        "subsections": {}
      },
      "Memory Card Capacity": {
        "page_range": [387, 388],
        "subsections": {}
      },
      "Specifications": {
        "page_range": [388, 395],
        "subsections": {}
      }
    }
  }
}

In [72]:
# Map every chapter title to its page_range entry, nested under a single "chapter" key.
chapter_dict = {
    "chapter": {
        title: contents["page_range"]
        for title, contents in x_s20_contents.items()
    }
}

In [73]:
# First page of every chapter's page range; these pages are treated as chapter covers.
cover_page_list = [page_range[0] for page_range in chapter_dict["chapter"].values()]

# Keep only the documents whose page is not one of those chapter start pages.
cover_erase_documents = [
    document
    for document in documents
    if document.metadata['page'] not in cover_page_list
]

In [74]:
# Show the collected chapter start pages as a quick sanity check.
print(cover_page_list)

[1, 31, 47, 55, 67, 127, 219, 245, 269, 307, 329, 349]


In [75]:
# Print the page number of every remaining document whose parsed text contains "MEMO".
for doc in cover_erase_documents:
    if "MEMO" not in doc.parsing_result:
        continue
    print(doc.metadata['page'])

19
46
54
66
235
306
328
383
395
396
397


In [76]:
# Hand-picked pages to drop as memo pages.
# NOTE(review): the "MEMO" scan also printed pages 19, 235 and 383, which are not
# listed here and are therefore kept; 398 is listed although the scan did not print it.
memo_page_list = [46, 54, 66, 306, 328, 395, 396, 397, 398]

# Keep every document whose page is not in the memo page list.
memo_erase_documents = [
    document
    for document in cover_erase_documents
    if document.metadata['page'] not in memo_page_list
]

# add metadata
- metadata: {
    "chapter":,
    "section":,
    "subsection":
}

In [77]:
# Display the filtered documents before chapter/section/subsection metadata is attached.
memo_erase_documents

[CameraDocument(metadata={'page': 2, 'model': 'x-s20'}, parsing_result='# Parts of the Camera\n\n!Camera Diagram\n\n1. **○ (movie recording) button** - Page 56\n2. **A (quick menu) button** - Page 311\n3. **ISO button** - Page 317\n4. **Rear command dial** - Pages 6, 223\n5. **Mode dial** - Pages 4, 48, 56\n6. **Hot shoe** - Page 338\n7. **Microphone** - Pages 61, 214\n8. **Fn (function) dial** - Pages 7, 320\n9. **N (flash pop-up) lever** - Page 123\n10. **Microphone/remote release connector cover**\n11. **Strap clip** - Page 32\n12. **Connector cover**\n13. **Speaker** - Pages 63, 277\n14. **Lens signal contacts**\n15. **Lens release button** - Page 33\n16. **AF-assist illuminator** - Page 155\n    - Self-timer lamp - Page 169\n    - Tally light - Page 200\n17. **Front command dial** - Page 6\n18. **ON/OFF switch** - Page 41\n19. **Shutter button** - Page 49\n20. **Flash** - Page 123\n21. **Body cap** - Page 33\n22. **Microphone/remote release connector (ø3.5 mm)** - Pages 61, 215\n2

In [78]:
# Tag each document with the chapter, section and subsection title(s) whose page
# range contains the document's page. Ranges are half-open: start <= page < end.
for document in memo_erase_documents:
    page = document.metadata['page']
    document.metadata['subsection'] = []

    for chapter, chapter_info in x_s20_contents.items():
        chapter_page_range = chapter_info["page_range"]
        if chapter_page_range[0] <= page < chapter_page_range[1]:
            document.metadata['chapter'] = chapter

        for section, section_info in chapter_info["sections"].items():
            section_page_range = section_info['page_range']
            if section_page_range[0] <= page < section_page_range[1]:
                document.metadata['section'] = section

            # Subsections of every section are tested (not only those of the matching
            # section); a section without a "subsections" key contributes nothing.
            for subsection, subsection_page_range in section_info.get("subsections", {}).items():
                if subsection_page_range[0] <= page < subsection_page_range[1]:
                    document.metadata['subsection'].append(subsection)

In [79]:
# Display the documents again to inspect the chapter/section/subsection metadata.
memo_erase_documents

[CameraDocument(metadata={'page': 2, 'model': 'x-s20', 'subsection': [], 'chapter': 'Before You Begin', 'section': 'Parts of the Camera'}, parsing_result='# Parts of the Camera\n\n!Camera Diagram\n\n1. **○ (movie recording) button** - Page 56\n2. **A (quick menu) button** - Page 311\n3. **ISO button** - Page 317\n4. **Rear command dial** - Pages 6, 223\n5. **Mode dial** - Pages 4, 48, 56\n6. **Hot shoe** - Page 338\n7. **Microphone** - Pages 61, 214\n8. **Fn (function) dial** - Pages 7, 320\n9. **N (flash pop-up) lever** - Page 123\n10. **Microphone/remote release connector cover**\n11. **Strap clip** - Page 32\n12. **Connector cover**\n13. **Speaker** - Pages 63, 277\n14. **Lens signal contacts**\n15. **Lens release button** - Page 33\n16. **AF-assist illuminator** - Page 155\n    - Self-timer lamp - Page 169\n    - Tally light - Page 200\n17. **Front command dial** - Page 6\n18. **ON/OFF switch** - Page 41\n19. **Shutter button** - Page 49\n20. **Flash** - Page 123\n21. **Body cap** 

In [80]:
# Print the page of every document whose subsection list is still empty.
for document in memo_erase_documents:
    if document.metadata['subsection']:
        continue
    print(document.metadata['page'])

2
3
32
33
34
35
37
38
39
40
41
42
43
44
48
49
50
53
91
100
104
106
107
108
110
115
116
117
118
119
120
121
122
123
124
220
223
255
260
308
337
350
351
355
356
357
358
359
360
361
362
363
364
365
366
367
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394


In [81]:
# Documents that matched no subsection fall back to their section title.
for document in memo_erase_documents:
    meta = document.metadata
    if not meta['subsection']:
        meta['subsection'].append(meta['section'])

In [83]:
# Write every tagged document to its own JSON file, named after its page number.
camera_model = "x-s20"
json_dir = "./data/json"

output_dir = os.path.join(json_dir, camera_model, "LlamaParseMultimodal", "processed_data")
# exist_ok=True replaces the exists()/makedirs() pair: the cell stays re-runnable
# and there is no window between the check and the creation.
os.makedirs(output_dir, exist_ok=True)

for doc in memo_erase_documents:
    output_path = os.path.join(output_dir, f"{camera_model}_page{doc.metadata['page']}.json")
    doc.save_json(output_path)

./data/json/x-s20/LlamaParseMultimodal/processed_data/x-s20_page2.json complete
./data/json/x-s20/LlamaParseMultimodal/processed_data/x-s20_page3.json complete
./data/json/x-s20/LlamaParseMultimodal/processed_data/x-s20_page4.json complete
./data/json/x-s20/LlamaParseMultimodal/processed_data/x-s20_page5.json complete
./data/json/x-s20/LlamaParseMultimodal/processed_data/x-s20_page6.json complete
./data/json/x-s20/LlamaParseMultimodal/processed_data/x-s20_page7.json complete
./data/json/x-s20/LlamaParseMultimodal/processed_data/x-s20_page8.json complete
./data/json/x-s20/LlamaParseMultimodal/processed_data/x-s20_page9.json complete
./data/json/x-s20/LlamaParseMultimodal/processed_data/x-s20_page10.json complete
./data/json/x-s20/LlamaParseMultimodal/processed_data/x-s20_page11.json complete
./data/json/x-s20/LlamaParseMultimodal/processed_data/x-s20_page12.json complete
./data/json/x-s20/LlamaParseMultimodal/processed_data/x-s20_page13.json complete
./data/json/x-s20/LlamaParseMultimod

# Image Extract

In [43]:
from glob import glob
import re
import os

import sys
sys.path.append('../') 
from CameraDocument import CameraDocument

# Location of the processed per-page JSON files for this camera model.
camera_model = 'x-s20'
json_dir = "./data/json"
processed_parts = (camera_model, "LlamaParseMultimodal", "processed_data")
detail_json_dir = os.path.join(json_dir, *processed_parts)
# Glob pattern matching every JSON file in that directory.
json_path = os.path.join(detail_json_dir, "*.json")

def extract_number(file_path):
    """Return the digits directly before a trailing ``.json`` as an int.

    Returns None when file_path does not end in ``<digits>.json``.
    """
    found = re.search(r'(\d+)\.json$', file_path)
    if found is None:
        return None
    return int(found.group(1))

# Collect the JSON files in numeric page order (ordered by extract_number).
json_list = sorted(glob(json_path), key=extract_number)

# Rebuild one CameraDocument per JSON file.
documents = []
for path in json_list:
    document = CameraDocument()
    document.load_json(path)
    documents.append(document)

documents[:5]

[CameraDocument(metadata={'page': 2, 'model': 'x-s20', 'chapter': 'Before You Begin', 'section': 'Parts of the Camera', 'subsection': 'Parts of the Camera'}, parsing_result='# Parts of the Camera\n\n!Camera Diagram\n\n1. **○ (movie recording) button** - Page 56\n2. **A (quick menu) button** - Page 311\n3. **ISO button** - Page 317\n4. **Rear command dial** - Pages 6, 223\n5. **Mode dial** - Pages 4, 48, 56\n6. **Hot shoe** - Page 338\n7. **Microphone** - Pages 61, 214\n8. **Fn (function) dial** - Pages 7, 320\n9. **N (flash pop-up) lever** - Page 123\n10. **Microphone/remote release connector cover**\n11. **Strap clip** - Page 32\n12. **Connector cover**\n13. **Speaker** - Pages 63, 277\n14. **Lens signal contacts**\n15. **Lens release button** - Page 33\n16. **AF-assist illuminator** - Page 155\n    - Self-timer lamp - Page 169\n    - Tally light - Page 200\n17. **Front command dial** - Page 6\n18. **ON/OFF switch** - Page 41\n19. **Shutter button** - Page 49\n20. **Flash** - Page 123

In [52]:
import fitz

# Render the single-page PDF behind every document to a PNG named after the
# document's page number.
raw_data_dir = "./data/raw_data"
image_dir = "./data/image"
output_dir = os.path.join(image_dir, camera_model, "pdf_to_image")
# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs(output_dir, exist_ok=True)

dpi = 600  # rendering resolution in DPI (e.g. 300 or 600)
# Difference between a document's 'page' metadata and the number in the split
# PDF's file name.
# NOTE(review): presumably the PDF has 26 pages before manual page 1 - confirm.
page_offset = 26
for document in documents:
    page_num = document.metadata['page']
    pdf_path = os.path.join(raw_data_dir, camera_model, f"{camera_model}_split", f"{camera_model}_page{page_num + page_offset}.pdf")

    # The context manager closes the PDF even when rendering or saving raises,
    # so a failure part-way through the loop no longer leaks the open document.
    with fitz.open(pdf_path) as pdf_document:
        page = pdf_document.load_page(0)  # each split file holds exactly one page
        pix = page.get_pixmap(dpi=dpi)

        output_file = os.path.join(output_dir, f"{camera_model}_page{page_num}.png")
        pix.save(output_file)
    print(f"Saved: {output_file}")

Saved: ./data/image/x-s20/pdf_to_image/x-s20_page2.png
Saved: ./data/image/x-s20/pdf_to_image/x-s20_page3.png
Saved: ./data/image/x-s20/pdf_to_image/x-s20_page4.png
Saved: ./data/image/x-s20/pdf_to_image/x-s20_page5.png
Saved: ./data/image/x-s20/pdf_to_image/x-s20_page6.png
Saved: ./data/image/x-s20/pdf_to_image/x-s20_page7.png
Saved: ./data/image/x-s20/pdf_to_image/x-s20_page8.png
Saved: ./data/image/x-s20/pdf_to_image/x-s20_page9.png
Saved: ./data/image/x-s20/pdf_to_image/x-s20_page10.png
Saved: ./data/image/x-s20/pdf_to_image/x-s20_page11.png
Saved: ./data/image/x-s20/pdf_to_image/x-s20_page12.png
Saved: ./data/image/x-s20/pdf_to_image/x-s20_page13.png
Saved: ./data/image/x-s20/pdf_to_image/x-s20_page14.png
Saved: ./data/image/x-s20/pdf_to_image/x-s20_page15.png
Saved: ./data/image/x-s20/pdf_to_image/x-s20_page16.png
Saved: ./data/image/x-s20/pdf_to_image/x-s20_page17.png
Saved: ./data/image/x-s20/pdf_to_image/x-s20_page18.png
Saved: ./data/image/x-s20/pdf_to_image/x-s20_page19.png


In [64]:
from dotenv import load_dotenv

# Load variables from the local .env file; the upload cells below read
# UPSTAGE_API_KEY from the environment. override=True is requested so values
# from the file replace variables that are already set.
load_dotenv(dotenv_path=".env", override=True)

True

In [54]:
from glob import glob
import re

def extract_number(file_path):
    """Return the digits directly before a trailing ``.png`` as an int.

    Returns None when file_path does not end in ``<digits>.png``.
    """
    hit = re.search(r'(\d+)\.png$', file_path)
    if hit:
        return int(hit.group(1))
    return None

# Glob pattern for the rendered page images of this camera model.
camera_model = "x-s20"
image_dir = "./data/image"
image_path = os.path.join(image_dir, camera_model, "pdf_to_image", "*.png")

# All matching PNG files in numeric page order (ordered by extract_number).
images = sorted(glob(image_path), key=extract_number)

In [56]:
# Size of the first slice of page images (the slice uploaded by the first document-parse loop).
len(images[:308])

308

In [57]:
# Size of the remaining slice of page images (uploaded by the second document-parse loop).
len(images[308:])

69

In [58]:
# Total number of page images; the two slices above should add up to this.
len(images)

377

In [59]:
import requests
import json

# Send the first batch of page images to the Upstage document-parse endpoint
# and store each JSON response next to the others, one file per page image.
json_dir = f"./data/json/{camera_model}/upstage"
# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs(json_dir, exist_ok=True)

api_key = os.getenv("UPSTAGE_API_KEY")
url = "https://api.upstage.ai/v1/document-ai/document-parse"
headers = {"Authorization": f"Bearer {api_key}"}

for image_path in images[:308]:
    # Open the upload inside a context manager so the handle is closed after
    # every request instead of staying open for the rest of the session.
    with open(image_path, "rb") as image_file:
        files = {"document": image_file}
        data = {"output_formats":"['html','markdown']"}
        response_test = requests.post(url, headers=headers, files=files, data=data)
    # Stop on an HTTP error (bad key, quota, ...) rather than saving the error
    # payload as if it were a parse result.
    response_test.raise_for_status()

    json_path = os.path.join(json_dir, os.path.splitext(os.path.basename(image_path))[0] +".json")
    with open(json_path, "w", encoding="utf-8") as file:
        json.dump(response_test.json(), file, ensure_ascii=False, indent=4)
    print(f"저장 완료: {json_path}")

저장 완료: ./data/json/x-s20/upstage/x-s20_page2.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page3.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page4.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page5.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page6.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page7.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page8.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page9.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page10.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page11.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page12.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page13.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page14.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page15.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page16.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page17.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page18.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page19.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page20.json
저장 완료: ./data/json/x-s20/upstage/x-s20_

In [65]:
import requests
import json

# Send the remaining page images to the Upstage document-parse endpoint and
# store each JSON response, one file per page image.
json_dir = f"./data/json/{camera_model}/upstage"
# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs(json_dir, exist_ok=True)

api_key = os.getenv("UPSTAGE_API_KEY")
url = "https://api.upstage.ai/v1/document-ai/document-parse"
headers = {"Authorization": f"Bearer {api_key}"}

for image_path in images[308:]:
    # Open the upload inside a context manager so the handle is closed after
    # every request instead of staying open for the rest of the session.
    with open(image_path, "rb") as image_file:
        files = {"document": image_file}
        data = {"output_formats":"['html','markdown']"}
        response_test = requests.post(url, headers=headers, files=files, data=data)
    # Stop on an HTTP error (bad key, quota, ...) rather than saving the error
    # payload as if it were a parse result.
    response_test.raise_for_status()

    json_path = os.path.join(json_dir, os.path.splitext(os.path.basename(image_path))[0] +".json")
    with open(json_path, "w", encoding="utf-8") as file:
        json.dump(response_test.json(), file, ensure_ascii=False, indent=4)
    print(f"저장 완료: {json_path}")

저장 완료: ./data/json/x-s20/upstage/x-s20_page323.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page324.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page325.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page326.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page327.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page330.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page331.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page332.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page333.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page334.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page335.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page336.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page337.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page338.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page339.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page340.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page341.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page342.json
저장 완료: ./data/json/x-s20/upstage/x-s20_page343.json
저장 완료: ./dat

In [66]:
from glob import glob
import re
import os

def extract_number(file_path):
    """Sort key: the number that immediately precedes the ``.json`` suffix.

    Returns the trailing digits of the file name as an int, or None when
    file_path does not end in ``<digits>.json``.
    """
    found = re.search(r'(\d+)\.json$', file_path)
    return None if found is None else int(found.group(1))

# Glob pattern for the document-parse responses saved for this camera model.
camera_model = "x-s20"
json_dir = "./data/json"
json_path = os.path.join(json_dir, camera_model, "upstage", "*.json")

# All matching JSON files in numeric page order (ordered by extract_number).
json_list = sorted(glob(json_path), key=extract_number)

In [67]:
import json
from PIL import Image

# Crop every element categorised as 'figure' in the parse results out of the
# matching page image and save each crop as its own PNG.
image_dir = "./data/image"
pdf_image_dir = os.path.join(image_dir, camera_model, "pdf_to_image")
output_dir = os.path.join(image_dir, camera_model, "extracted_images")
# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs(output_dir, exist_ok=True)

for json_path in json_list:
    with open(json_path, "r", encoding="utf-8") as file:
        json_data = json.load(file)

    # Corner-point lists of all elements whose category is 'figure'.
    filtered_elements = [
        element.get('coordinates')
        for element in json_data['elements']
        if element.get('category') == 'figure'
    ]
    if not filtered_elements:
        continue  # nothing to crop, so the page image is never opened

    # The page image shares its base name with the JSON file.
    page_stem = os.path.splitext(os.path.basename(json_path))[0]
    image_path = os.path.join(pdf_image_dir, page_stem + '.png')
    try:
        # Open the page image once per page instead of once per figure;
        # a missing image is therefore reported once per page.
        with Image.open(image_path) as img:
            width, height = img.size

            for idx, coordinate in enumerate(filtered_elements):
                # Bounding box: the coordinates are multiplied by the image size,
                # then the extremes over all corner points are taken.
                x_min = int(min(coord['x'] * width for coord in coordinate))
                y_min = int(min(coord['y'] * height for coord in coordinate))
                x_max = int(max(coord['x'] * width for coord in coordinate))
                y_max = int(max(coord['y'] * height for coord in coordinate))

                # Crop the bounding box out of the page image.
                cropped_image = img.crop((x_min, y_min, x_max, y_max))

                # Save the crop; figures are numbered from 1 within each page.
                output_path = os.path.join(output_dir, page_stem + f"_image{idx+1}.png")
                cropped_image.save(output_path)
                print(f"Cropped image saved to: {output_path}")

    except FileNotFoundError:
        print(f"File not found: {image_path}")

Cropped image saved to: ./data/image/x-s20/extracted_images/x-s20_page2_image1.png
Cropped image saved to: ./data/image/x-s20/extracted_images/x-s20_page3_image1.png
Cropped image saved to: ./data/image/x-s20/extracted_images/x-s20_page4_image1.png
Cropped image saved to: ./data/image/x-s20/extracted_images/x-s20_page5_image1.png
Cropped image saved to: ./data/image/x-s20/extracted_images/x-s20_page5_image2.png
Cropped image saved to: ./data/image/x-s20/extracted_images/x-s20_page7_image1.png
Cropped image saved to: ./data/image/x-s20/extracted_images/x-s20_page7_image2.png
Cropped image saved to: ./data/image/x-s20/extracted_images/x-s20_page8_image1.png
Cropped image saved to: ./data/image/x-s20/extracted_images/x-s20_page9_image1.png
Cropped image saved to: ./data/image/x-s20/extracted_images/x-s20_page9_image2.png
Cropped image saved to: ./data/image/x-s20/extracted_images/x-s20_page10_image1.png
Cropped image saved to: ./data/image/x-s20/extracted_images/x-s20_page12_image1.png
Cr