In [2]:
!pip install langchain

Collecting langchain
  Downloading langchain-0.3.26-py3-none-any.whl.metadata (7.8 kB)
Collecting langchain-core<1.0.0,>=0.3.66 (from langchain)
  Downloading langchain_core-0.3.68-py3-none-any.whl.metadata (5.8 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.8 (from langchain)
  Downloading langchain_text_splitters-0.3.8-py3-none-any.whl.metadata (1.9 kB)
Collecting langsmith>=0.1.17 (from langchain)
  Downloading langsmith-0.4.4-py3-none-any.whl.metadata (15 kB)
Collecting pydantic<3.0.0,>=2.7.4 (from langchain)
  Downloading pydantic-2.11.7-py3-none-any.whl.metadata (67 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain)
  Downloading sqlalchemy-2.0.41-cp311-cp311-macosx_10_9_x86_64.whl.metadata (9.6 kB)
Collecting tenacity!=8.4.0,<10.0.0,>=8.1.0 (from langchain-core<1.0.0,>=0.3.66->langchain)
  Downloading tenacity-9.1.2-py3-none-any.whl.metadata (1.2 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<1.0.0,>=0.3.66->langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import json
from pathlib import Path

In [29]:
def extract_chunks_from_json(json_data, source_path):
    title = json_data.get("title", "Unknown Title")
    infobox = json_data.get("infobox", {})
    sections = json_data.get("sections", {})

    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    all_chunks = []

    infobox_text = ""
    for key, value in infobox.items():
        infobox_text += f"{key.strip()} {value.strip()}\n"
    if infobox_text.strip():
        for chunk in splitter.split_text(infobox_text):
            all_chunks.append({
                "text": chunk,
                "metadata": {
                    "source": source_path,
                    "title": title,
                    "section": "infobox"
                }
            })

    for section_name, content in sections.items():
        if content and content.strip():
            section_chunks = splitter.split_text(f"{section_name}:\n{content.strip()}")
            for chunk in section_chunks:
                all_chunks.append({
                    "text": chunk,
                    "metadata": {
                        "source": source_path,
                        "title": title,
                        "section": section_name
                    }
                })

    return all_chunks

In [30]:
def process_json_folder(folder_path):
    all_chunks = []
    for file in Path(folder_path).rglob("*.json"):
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)
            source_path = str(file.relative_to(folder_path))
            chunks = extract_chunks_from_json(data, source_path=source_path)
            all_chunks.extend(chunks)
    return all_chunks

In [33]:
anime_chunks = process_json_folder("Case_Closed_Files/Anime")
character_chunks = process_json_folder("Case_Closed_Files/Characters")
gadget_chunks = process_json_folder("Case_Closed_Files/Gadgets")
gen_chunks = process_json_folder("Case_Closed_Files/General")

In [34]:
print(f"✅ Total Anime Chunks: {len(anime_chunks)}")

for i, chunk in enumerate(anime_chunks[:5]):
    print(f"\n--- Chunk {i+1} ---")
    print(chunk["text"])
    print("Metadata:", chunk["metadata"])

✅ Total Anime Chunks: 26359

--- Chunk 1 ---
Title: The 14th Round of the Matsue Tamatsukuri Linked Verse Contest
Japanese title: 松江玉造連句14番勝負  (Matsue Tamatsukuri Renku Jūyonban Shōbu)
Original airdate: October 29, 2001 (Part 1)  November 5, 2001 (Part 2)
Broadcast rating: 19.5%  18.6%
Filler case: #89
Season: 6
Manga source: TV Original
Cast: Conan Edogawa  Ran Mouri  Kogoro Mouri  Shinichi Kudo
Case solved by: Kogoro Mouri (via Conan)
Next Conan's Hint: Japanese maple (Part 1)  Hokku (Part 2)
Director: Yasuichiro Yamamoto
Metadata: {'source': 'The_14th_Round_of_the_Matsue_Tamatsukuri_Linked_Verse_Contest_(Part_1).json', 'title': 'The 14th Round of the Matsue Tamatsukuri Linked Verse Contest (Part 1)', 'section': 'infobox'}

--- Chunk 2 ---
Director: Yasuichiro Yamamoto
Screenplay: Kazunari Kouchi
Storyboard: 255: Yasuichiro Yamamoto  256: Murazou Sugisawa
Episode director: 255: Mashu Ito  256: Nana Harada
Animation director: 255: Hirobi Muranaka  256: Atsushi Aono
Character design: M

In [36]:
with open("anime_chunks.json", "w", encoding="utf-8") as f:
    json.dump(anime_chunks, f, ensure_ascii=False, indent=2)

with open("character_chunks.json", "w", encoding="utf-8") as f:
    json.dump(character_chunks, f, ensure_ascii=False, indent=2)

with open("gadget_chunks.json", "w", encoding="utf-8") as f:
    json.dump(gadget_chunks, f, ensure_ascii=False, indent=2)

with open("general_chunks.json", "w", encoding="utf-8") as f:
    json.dump(gen_chunks, f, ensure_ascii=False, indent=2)
