In [1]:
from langchain_community.document_loaders import DirectoryLoader, JSONLoader


def metadata_func(record, metadata):
    """Copy selected fields of one JSONL record into a metadata dict.

    Args:
        record: Mapping for a single parsed JSON line; only ``.get`` is used.
        metadata: Dict to extend; it is mutated in place.

    Returns:
        The same ``metadata`` object, now carrying ``title``, ``section`` and
        ``chunk_index``. A field missing from ``record`` is stored as ``""``.
    """
    for field in ("title", "section", "chunk_index"):
        metadata[field] = record.get(field, "")
    return metadata

# Collect every JSONL part file under ./data that matches the glob. Each file
# is handed to JSONLoader, which reads the document text from the "text" key
# and enriches the metadata through metadata_func.
loader = DirectoryLoader(
    "./data",
    glob="attack_on_Titan_Namu_new_part*.jsonl",
    loader_cls=JSONLoader,
    loader_kwargs=dict(
        jq_schema=".",
        json_lines=True,
        content_key="text",
        metadata_func=metadata_func,
    ),
)


def prepend_metadata(doc):
    """Prefix a document's text with its title and section labels.

    The ``title`` and ``section`` metadata values are stripped of surrounding
    whitespace. Each non-empty one becomes a header line, and the header is
    separated from the original text by a blank line. When both values are
    missing or blank, the text is left untouched.

    Args:
        doc: Object exposing a ``metadata`` dict and a ``page_content`` string.

    Returns:
        The same ``doc`` object; ``page_content`` is modified in place.
    """
    title_text = (doc.metadata.get("title") or "").strip()
    section_text = (doc.metadata.get("section") or "").strip()

    candidates = (
        f"제목: {title_text}" if title_text else None,
        f"섹션: {section_text}" if section_text else None,
    )
    header_lines = [line for line in candidates if line is not None]

    # Nothing to prepend: return the document unchanged.
    if not header_lines:
        return doc

    doc.page_content = "\n".join(header_lines) + "\n\n" + doc.page_content
    return doc

# Load all documents and prefix each one's text with its title/section header.
document_list = list(map(prepend_metadata, loader.load()))


In [2]:
# The crawler that produced the JSONL files already enforces a maximum token
# count per chunk, so the documents are used as-is and are not re-split here.
# Note: split_docs is the same list object as document_list (no copy is made).
split_docs = document_list


In [None]:
# Debug helper: uncomment the next line to inspect the loaded documents.
# split_docs

In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
# The splitter is built only for its tiktoken-based length function; nothing
# is split with it in this cell. No encoding or model name is passed, so the
# library's default tiktoken encoding is what gets measured.
splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=500, chunk_overlap=90)


def _token_len(doc):
    """Return the length of ``doc.page_content`` as measured by ``splitter``."""
    return splitter._length_function(doc.page_content)


# Longest document by that measure, displayed with its metadata.
max_doc = max(split_docs, key=_token_len)
_token_len(max_doc), max_doc.metadata


(3832,
 {'source': '/mnt/e/one_piece/data/attack_on_Titan_Namu_new_part1.jsonl',
  'seq_num': 7508,
  'title': '엘빈 스미스/작중 행적 - 나무위키',
  'section': '3.7.1. 작가의 설명',
  'chunk_index': '45'})

In [4]:
from dotenv import load_dotenv
from langchain_upstage import UpstageEmbeddings

# Read variables from a .env file before the embedding client is created
# (presumably this is where the Upstage API key comes from — confirm).
load_dotenv()

# Upstage embedding client used to build the vector store.
# NOTE(review): embed_batch_size=1 presumably means one text per embedding
# request — confirm against the langchain_upstage documentation.
embedding = UpstageEmbeddings(model="solar-embedding-1-large", embed_batch_size=1)


In [5]:
import os
import shutil
from langchain_chroma import Chroma

# On-disk location of the Chroma store, relative to the working directory.
persist_directory = "./AoT"
# Start from a clean slate: anything already at this path is deleted, so
# re-running this cell rebuilds the store from split_docs instead of adding
# to a previous one.
if os.path.exists(persist_directory):
    shutil.rmtree(persist_directory)

# Build the "AoT" collection from split_docs using `embedding`, persisted
# under persist_directory.
database = Chroma.from_documents(
    documents=split_docs,
    embedding=embedding,
    collection_name="AoT",
    persist_directory=persist_directory,
)
