In [10]:
import os
import re
from CameraDocument import CameraDocument
from glob import glob

def extract_number(file_path):
    """Return the integer formed by the digits directly before a trailing ".json".

    Args:
        file_path: Path or file name to inspect.

    Returns:
        The trailing number as an int, or None when the name does not end in
        "<digits>.json".
    """
    found = re.search(r'(\d+)\.json$', file_path)
    if not found:
        return None
    return int(found.group(1))

# Camera models to load; each one is a sub-directory of json_dir.
camera_model_list = ["gfx100ii", "x-e4", "x-s20", "x-t5", "x100v"]
json_dir = "../../indexing/data/json"


def _page_sort_key(path):
    """Sort key: numbered files by their trailing number, unnumbered files last."""
    number = extract_number(path)
    # extract_number returns None for names without trailing digits; using None
    # directly as a sort key makes sorted() raise TypeError, so wrap it in a
    # tuple that is always comparable.
    return (number is None, 0 if number is None else number)


# Load every processed JSON file of every model, in page-number order per model.
documents = []
for camera_model in camera_model_list:
    detail_json_dir = os.path.join(json_dir, camera_model, "LlamaParseMultimodal", "processed_data")
    json_path = os.path.join(detail_json_dir, "*.json")
    json_list = sorted(glob(json_path), key=_page_sort_key)

    for path in json_list:
        document = CameraDocument()
        document.load_json(path)
        documents.append(document)

In [82]:
import pandas as pd

# Build the three corpus columns from the loaded documents:
#   doc_id   -> "<model>_page<page>"
#   contents -> the document's parsing_result
#   metadata -> a plain dict copy of the fields below (subsection forced to a list)
doc_id = [
    f"{document.metadata['model']}_page{document.metadata['page']}"
    for document in documents
]
contents = [document.parsing_result for document in documents]
metadata = [
    {
        "page": document.metadata['page'],
        "model": document.metadata['model'],
        "chapter": document.metadata['chapter'],
        "section": document.metadata['section'],
        "subsection": list(document.metadata['subsection'])
    }
    for document in documents
]

corpus_df = pd.DataFrame({"doc_id": doc_id, "contents": contents, "metadata": metadata})

In [83]:
# Persist the corpus. to_parquet will not create a missing output directory,
# so make sure it exists first (no-op when it is already there).
corpus_path = "./project_dir/data/corpus.parquet"
os.makedirs(os.path.dirname(corpus_path), exist_ok=True)
corpus_df.to_parquet(corpus_path)

In [90]:
import pandas as pd
# Reload the corpus from disk to inspect what the parquet round trip returns.
# Note: list-valued metadata fields come back as numpy object arrays (see the
# 'subsection' output further down).
a = pd.read_parquet(
    path="./project_dir/data/corpus.parquet",
    engine="pyarrow",
)

In [91]:
# Show the reloaded corpus frame (columns: doc_id, contents, metadata).
a

Unnamed: 0,doc_id,contents,metadata
0,gfx100ii_page2,# Parts of the Camera\n\n!Parts of the Camera\...,"{'chapter': 'Before You Begin', 'model': 'gfx1..."
1,gfx100ii_page3,# Parts of the Camera\n\n!Camera Diagram\n\n##...,"{'chapter': 'Before You Begin', 'model': 'gfx1..."
2,gfx100ii_page4,# Before You Begin\n\n!Camera Diagram\n\n### D...,"{'chapter': 'Before You Begin', 'model': 'gfx1..."
3,gfx100ii_page5,# The Cable Protector\n\nAttach the protector ...,"{'chapter': 'Before You Begin', 'model': 'gfx1..."
4,gfx100ii_page6,# Before You Begin\n\n## The Serial Number Pla...,"{'chapter': 'Before You Begin', 'model': 'gfx1..."
...,...,...,...
1838,x100v_page319,# Specifications\n\n## System\n\n### Metering\...,"{'chapter': 'Technical Notes', 'model': 'x100v..."
1839,x100v_page320,\n# System\n\n## Continuous\n\n| CONTINUOUS MO...,"{'chapter': 'Technical Notes', 'model': 'x100v..."
1840,x100v_page321,\n# Specifications\n\n## System\n\n### Flash M...,"{'chapter': 'Technical Notes', 'model': 'x100v..."
1841,x100v_page322,\n# Power Supply/Other\n\n## Power Supply\n- *...,"{'chapter': 'Technical Notes', 'model': 'x100v..."


In [59]:
# Metadata dict of the first corpus row, as built in memory (before the parquet round trip).
corpus_df.loc[0, 'metadata']

{'page': 2,
 'model': 'gfx100ii',
 'chapter': 'Before You Begin',
 'section': 'Parts of the Camera',
 'subsection': ['Parts of the Camera']}

In [65]:
# Metadata dict of the first in-memory corpus row (label 0 of the default index).
corpus_df.loc[0, 'metadata']

{'page': 2,
 'model': 'gfx100ii',
 'chapter': 'Before You Begin',
 'section': 'Parts of the Camera',
 'subsection': ['Parts of the Camera']}

In [104]:
# Number of rows in the reloaded corpus.
a.shape[0]

1843

In [109]:
# After reading the parquet file, 'subsection' is a numpy object array rather
# than a list. Convert it back to a plain list by mutating each stored
# metadata dict in place.
for row_metadata in a["metadata"]:
    row_metadata["subsection"] = list(row_metadata["subsection"])

In [110]:
# Look up one page's metadata by doc_id. The mask is built from `a` itself so
# the cell does not depend on `corpus_df` still being in memory or on the two
# frames having aligned indexes.
a.loc[a["doc_id"] == "gfx100ii_page2", "metadata"].values[0]

{'chapter': 'Before You Begin',
 'model': 'gfx100ii',
 'page': 2,
 'section': 'Parts of the Camera',
 'subsection': ['Parts of the Camera']}

In [93]:
# Metadata of row 1422 as returned by read_parquet; 'subsection' is a numpy object array here.
a.loc[1422, 'metadata']

{'chapter': 'The Setup Menus',
 'model': 'x-t5',
 'page': 287,
 'section': 'SCREEN SETTING',
 'subsection': array(['F-Log VIEW ASSIST', 'ELECTRONIC LEVEL SETTING'], dtype=object)}

In [75]:
# Inspect the reloaded metadata of row 1422 (label-based lookup on the default index).
a.loc[1422, 'metadata']

{'chapter': 'The Setup Menus',
 'model': 'x-t5',
 'page': 287,
 'section': 'SCREEN SETTING',
 'subsection': array(['F-Log VIEW ASSIST', 'ELECTRONIC LEVEL SETTING'], dtype=object)}

In [52]:
# "embedding_model" is not one of the metadata keys written to the corpus
# (page, model, chapter, section, subsection), so a plain ["embedding_model"]
# lookup raises KeyError. Use .get() so the cell prints None when the key is absent.
print(a['metadata'][0].get("embedding_model"))

None
