In [1]:
import os
import re
from CameraDocument import CameraDocument
from glob import glob

def extract_number(file_path):
    """Return the integer that directly precedes the ``.json`` suffix.

    Args:
        file_path: Path (or bare file name) to inspect.

    Returns:
        The trailing run of digits before ``.json`` as an ``int``, or
        ``None`` when the path does not end in ``<digits>.json``.
    """
    found = re.search(r'(\d+)\.json$', file_path)
    if found is None:
        return None
    return int(found.group(1))

# Camera models whose parsed manuals are merged into a single corpus.
camera_model_list = ["gfx100ii", "x-e4", "x-s20", "x-t5", "x100v"]
json_dir = "../../indexing/data/json"


def _numeric_sort_key(path):
    """Sort key ordering files by their trailing number, unnumbered files last.

    extract_number() returns None for a file name without a trailing number;
    using that value directly as a sort key makes sorted() raise TypeError,
    so the number is wrapped in a tuple that is always comparable.
    """
    number = extract_number(path)
    return (number is None, 0 if number is None else number)


documents = []
for camera_model in camera_model_list:
    detail_json_dir = os.path.join(json_dir, camera_model, "LlamaParseMultimodal", "processed_data")
    json_path = os.path.join(detail_json_dir, "*.json")
    # Numeric (not lexicographic) order, so "10.json" comes after "9.json".
    json_list = sorted(glob(json_path), key=_numeric_sort_key)

    for path in json_list:
        document = CameraDocument()
        document.load_json(path)
        documents.append(document)

In [10]:
import pandas as pd

# Flatten the loaded documents into three parallel columns, one entry per document.
doc_id = [
    f"{doc.metadata['model']}_page{doc.metadata['page']}" for doc in documents
]
contents = [doc.parsing_result for doc in documents]
metadata = [doc.metadata for doc in documents]

# Column order follows the insertion order of the dict below.
corpus_df = pd.DataFrame(
    {"doc_id": doc_id, "contents": contents, "metadata": metadata}
)

In [11]:
# Show the assembled corpus frame as the cell output for a quick visual check.
corpus_df

Unnamed: 0,doc_id,contents,metadata
0,gfx100ii_page2,# Parts of the Camera\n\n!Parts of the Camera\...,"{'page': 2, 'model': 'gfx100ii', 'chapter': 'B..."
1,gfx100ii_page3,# Parts of the Camera\n\n!Camera Diagram\n\n##...,"{'page': 3, 'model': 'gfx100ii', 'chapter': 'B..."
2,gfx100ii_page4,# Before You Begin\n\n!Camera Diagram\n\n### D...,"{'page': 4, 'model': 'gfx100ii', 'chapter': 'B..."
3,gfx100ii_page5,# The Cable Protector\n\nAttach the protector ...,"{'page': 5, 'model': 'gfx100ii', 'chapter': 'B..."
4,gfx100ii_page6,# Before You Begin\n\n## The Serial Number Pla...,"{'page': 6, 'model': 'gfx100ii', 'chapter': 'B..."
...,...,...,...
1838,x100v_page319,# Specifications\n\n## System\n\n### Metering\...,"{'page': 319, 'model': 'x100v', 'subsection': ..."
1839,x100v_page320,\n# System\n\n## Continuous\n\n| CONTINUOUS MO...,"{'page': 320, 'model': 'x100v', 'subsection': ..."
1840,x100v_page321,\n# Specifications\n\n## System\n\n### Flash M...,"{'page': 321, 'model': 'x100v', 'subsection': ..."
1841,x100v_page322,\n# Power Supply/Other\n\n## Power Supply\n- *...,"{'page': 322, 'model': 'x100v', 'subsection': ..."


In [13]:
# Persist the corpus as a parquet file. The target directory is created first
# so that the write does not fail when ./project_dir/data is missing.
corpus_path = "./project_dir/data/corpus.parquet"
os.makedirs(os.path.dirname(corpus_path), exist_ok=True)
corpus_df.to_parquet(corpus_path)