# 01 — Ingestion (Corpus + Index)

This notebook:
1) builds a processed corpus JSONL from `data/raw/`
2) builds a retrieval index (BM25 / FAISS / hybrid)
3) sanity-checks corpus stats and examples

In [None]:
from pathlib import Path
import json
import random
import pandas as pd

# Relative locations used by the rest of the notebook: the raw input
# directory, the processed corpus JSONL file, and the index directory.
RAW_DIR = Path("data", "raw")
CORPUS_PATH = Path("data", "processed", "corpus.jsonl")
INDEX_DIR = Path("data", "processed", "index")

# Echo the three paths as the cell output.
RAW_DIR, CORPUS_PATH, INDEX_DIR

In [None]:
# Show sample raw files (text-like, selected by file extension).
# Directory traversal order depends on the filesystem, so the matches are
# sorted to keep the ten-file preview stable across runs and machines.
exts = {".txt", ".md", ".json", ".jsonl"}
raw_files = sorted(
    p for p in RAW_DIR.rglob("*") if p.is_file() and p.suffix.lower() in exts
)
print("Raw files found:", len(raw_files))
raw_files[:10]

In [None]:
import subprocess, sys

# Run scripts/build_corpus.py as a child process with the interpreter that is
# executing this notebook. check=True makes a non-zero exit status raise
# subprocess.CalledProcessError instead of passing silently.
cmd = [sys.executable, "scripts/build_corpus.py"]
cmd += ["--raw_dir", str(RAW_DIR)]
cmd += ["--out_path", str(CORPUS_PATH)]
cmd += ["--chunk_chars", "2000"]
cmd += ["--overlap_chars", "200"]
cmd += ["--min_chars", "200"]

print("Running:", " ".join(cmd))
subprocess.run(cmd, check=True)

In [None]:
def load_jsonl(path: Path, max_rows: int = 5000):
    """Read up to ``max_rows`` JSON records from a JSON Lines file.

    Blank or whitespace-only lines (for example a trailing empty line) are
    skipped and do not count toward ``max_rows``.

    Args:
        path: Location of the UTF-8 encoded JSONL file.
        max_rows: Maximum number of records to return.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    rows = []
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            # Count parsed records rather than physical lines so that blank
            # lines cannot shrink the result below max_rows.
            if len(rows) >= max_rows:
                break
            line = line.strip()
            if not line:
                continue
            rows.append(json.loads(line))
    return rows

# Read at most the first 2000 corpus records for inspection.
rows = load_jsonl(CORPUS_PATH, max_rows=2000)
print("Loaded rows:", len(rows))

# Print up to three randomly chosen records: id, metadata, and the first
# 500 characters of text.
preview_count = min(3, len(rows))
divider = "=" * 80
for doc in random.sample(rows, k=preview_count):
    print(divider)
    print("ID:", doc["id"])
    print("META:", doc.get("meta", {}))
    print("TEXT (first 500 chars):")
    print(doc["text"][:500])

In [None]:
# Per-record summary table: record id, text length in characters, and the
# source filename taken from the record's metadata ("" when absent).
doc_ids = [row["id"] for row in rows]
text_lengths = [len(row["text"]) for row in rows]
source_files = [row.get("meta", {}).get("filename", "") for row in rows]

df = pd.DataFrame({"id": doc_ids, "chars": text_lengths, "source": source_files})
df.describe()

In [None]:
import subprocess, sys

# Run scripts/build_index.py over the corpus with the interpreter that is
# executing this notebook; a non-zero exit status raises
# subprocess.CalledProcessError because of check=True.
cmd = [sys.executable, "scripts/build_index.py"]
cmd.extend(["--corpus_path", str(CORPUS_PATH)])
cmd.extend(["--out_dir", str(INDEX_DIR)])
cmd.extend(["--backend", "bm25"])  # change to faiss / hybrid if you installed deps
print("Running:", " ".join(cmd))
subprocess.run(cmd, check=True)

# Show whatever entries now exist directly under the index directory.
[entry for entry in INDEX_DIR.glob("*")]

In [None]:
# Check whether bm25.json exists in the index directory and list the names of
# the entries found there.
# NOTE(review): presumably bm25.json is what the bm25 backend writes —
# confirm against scripts/build_index.py.
bm25_path = INDEX_DIR.joinpath("bm25.json")
bm25_found = bm25_path.exists()
print("BM25 exists:", bm25_found, bm25_path)

index_entry_names = [entry.name for entry in INDEX_DIR.glob("*")]
print("Index files:", index_entry_names)