In [2]:
import json
from pathlib import Path

from tqdm import tqdm

def get_base_dir() -> Path:
    """Return the directory this code is running from.

    When ``__file__`` is defined, the result is the resolved parent directory
    of that file. When it is not defined (the lookup raises ``NameError``),
    the current working directory is returned instead.

    Returns:
        Path: directory of the running file, or ``Path.cwd()`` as a fallback.
    """
    try:
        current_file = __file__
    except NameError:
        # No __file__ in this execution context; fall back to the CWD.
        return Path.cwd()
    return Path(current_file).resolve().parent

# Project root is one directory above the directory returned by get_base_dir().
BASE_DIR = get_base_dir().parent
OUTPUTS_DIR = BASE_DIR / "outputs"
# The chunking cell writes its results to OUTPUTS_DIR / "chunks". Those files
# are derived data and must not be loaded back in as source pechas on a re-run.
CHUNKS_DIR = OUTPUTS_DIR / "chunks"

# Sorted so the load order (and therefore `pechas`) is deterministic across runs.
pecha_files = sorted(
    path
    for path in OUTPUTS_DIR.rglob("*.json")
    if CHUNKS_DIR not in path.parents
)
# Read as UTF-8 explicitly: the chunk files are written as UTF-8, and the
# platform default encoding is not guaranteed to match.
pechas = [
    json.loads(pecha_file.read_text(encoding="utf-8"))
    for pecha_file in pecha_files
]

print(f"Number of pechas : {len(pechas)}")


Number of pechas : 268


## Chunking

In [24]:
from bo_sent_tokenizer import segment

def merge_chunks(chunks: list[str], max_count: int = 50) -> list[str]:
    """Join consecutive chunks into space-separated groups of ``max_count``.

    The input is walked in steps of ``max_count`` and each slice is joined
    with a single space. If the input length is not a multiple of
    ``max_count``, the trailing partial group is appended to the previous
    group rather than kept on its own, so the final group may hold up to
    ``2 * max_count - 1`` input chunks.

    Args:
        chunks: Strings to group, in order.
        max_count: Number of input chunks per group; must be at least 1.

    Returns:
        The merged strings, in input order. Empty when ``chunks`` is empty.

    Raises:
        ValueError: If ``max_count`` is less than 1.
    """
    # A negative step would make range() empty and silently drop every chunk;
    # a zero step already raises ValueError. Reject both up front.
    if max_count < 1:
        raise ValueError(f"max_count must be a positive integer, got {max_count}")

    res = [
        ' '.join(chunks[start:start + max_count])
        for start in range(0, len(chunks), max_count)
    ]

    # Fold a trailing partial group into the group before it.
    if len(res) > 1 and len(chunks) % max_count != 0:
        tail = res.pop()
        res[-1] = res[-1] + ' ' + tail

    return res


def chunk_text(text: str):
    """Segment ``text`` and return its non-empty lines as a flat list.

    ``text`` is passed to ``segment``; the returned string is split on line
    boundaries and empty lines are dropped. Presumably each remaining line is
    one sentence produced by the tokenizer (verify against bo_sent_tokenizer).

    Args:
        text: Raw text to segment.

    Returns:
        A flat list of non-empty line strings (not a nested list).
    """
    segmented = segment(text)
    return [line for line in segmented.splitlines() if line]


In [25]:

# Chunk every volume of every pecha and write one JSON file per pecha.
for pecha in tqdm(pechas, desc="Chunking"):
    pecha_id = pecha["pecha_id"]
    texts = pecha["texts"]

    # One entry per volume id. Every volume goes through the same path:
    # strip surrounding whitespace, segment, then merge into larger chunks.
    # Iterating over the items also covers a pecha with no volumes at all,
    # which then simply gets an empty "chunks" mapping instead of failing.
    pecha["chunks"] = {
        volume_id: merge_chunks(chunk_text(volume_text.strip()))
        for volume_id, volume_text in texts.items()
    }

    # Save the pecha (original fields plus "chunks") under outputs/chunks/.
    pecha_path = OUTPUTS_DIR / "chunks" / f"{pecha_id}.json"
    pecha_path.parent.mkdir(parents=True, exist_ok=True)
    pecha_path.write_text(json.dumps(pecha, ensure_ascii=False, indent=2), encoding="utf-8")

Chunking:   0%|          | 0/268 [00:00<?, ?it/s]

Chunking: 100%|██████████| 268/268 [00:43<00:00,  6.10it/s]
