In [7]:
# scripts/ingest.py
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import re
import json
import shutil
import tempfile
import subprocess

from pathlib import Path
from typing import List, Dict, Any
from langchain_community.document_loaders import (
    PyMuPDFLoader,
    UnstructuredWordDocumentLoader,
)
# from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.utils import embedding_functions
from tqdm import tqdm

# === –ö–û–ù–§–ò–ì–£–†–ê–¶–ò–Ø ===
DATA_DIR = Path("../data")
VECTOR_DB_PATH = Path("../vectorstore")
SAMPLES_PATH = Path("../samples/sample_chunks.json")

# –ü–æ–¥–¥–µ—Ä–∂–∏–≤–∞–µ–º—ã–µ —É—Ä–æ–≤–Ω–∏ (–¥–æ–ª–∂–Ω—ã –±—ã—Ç—å –ø–∞–ø–∫–∞–º–∏ –≤ DATA_DIR)
LEVELS = ["elementary", "middle_school", "high_school", "university"]

# –ü–∞—Ä–∞–º–µ—Ç—Ä—ã —á–∞–Ω–∫–∏–Ω–≥–∞ –ø–æ —É—Ä–æ–≤–Ω—é
CHUNK_PARAMS = {
    "elementary": {"chunk_size": 300, "chunk_overlap": 50},
    "middle_school": {"chunk_size": 400, "chunk_overlap": 60},
    "high_school": {"chunk_size": 500, "chunk_overlap": 80},
    "university": {"chunk_size": 700, "chunk_overlap": 100},
}

# –≠–º–±–µ–¥–¥–∏–Ω–≥-—Ñ—É–Ω–∫—Ü–∏—è (–ª–æ–∫–∞–ª—å–Ω–∞—è, –±–µ–∑ API)
# embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
#     model_name="all-MiniLM-L6-v2"
# )
embedding_model = SentenceTransformer("/home/sofya/all-MiniLM-L6-v2")

def extract_grade_from_path(file_path: Path, level_dir: Path) -> str:
    """
    –ò–∑–≤–ª–µ–∫–∞–µ—Ç –Ω–æ–º–µ—Ä –∫–ª–∞—Å—Å–∞ –∏–∑ –ø—É—Ç–∏, –µ—Å–ª–∏ –µ—Å—Ç—å –ø–æ–¥–ø–∞–ø–∫–∞ –≤–∏–¥–∞ '5 –∫–ª–∞—Å—Å', '10 –∫–ª–∞—Å—Å' –∏ —Ç.–¥.
    –ò—â–µ—Ç —Ç–æ–ª—å–∫–æ –≤–Ω—É—Ç—Ä–∏ level_dir.
    """
    try:
        rel_parts = file_path.relative_to(level_dir).parts
        for part in rel_parts:
            if "–∫–ª–∞—Å—Å" in part or 'course' in part:
                match = re.search(r'(\d+)', part)
                if match:
                    return match.group(1)
    except ValueError:
        pass
    return "general"  # –µ—Å–ª–∏ –Ω–µ –Ω–∞–π–¥–µ–Ω–æ

def get_all_document_files(base_dir: Path) -> List[Path]:
    """–†–µ–∫—É—Ä—Å–∏–≤–Ω–æ –Ω–∞—Ö–æ–¥–∏—Ç –≤—Å–µ –ø–æ–¥–¥–µ—Ä–∂–∏–≤–∞–µ–º—ã–µ —Ñ–∞–π–ª—ã –≤ –ø–∞–ø–∫–µ."""
    supported_ext = {".pdf", ".doc", ".docx", ".djvu"}
    files = []
    for file_path in base_dir.rglob("*"):
        if file_path.is_file() and file_path.suffix.lower() in supported_ext:
            # –ò–≥–Ω–æ—Ä–∏—Ä—É–µ–º —Å–∫—Ä—ã—Ç—ã–µ —Ñ–∞–π–ª—ã (–Ω–∞–ø—Ä–∏–º–µ—Ä, .DS_Store)
            if file_path.name.startswith("."):
                continue
            files.append(file_path)
    return files


def load_document(file_path: Path):
    """–ó–∞–≥—Ä—É–∂–∞–µ—Ç –¥–æ–∫—É–º–µ–Ω—Ç, –ø–æ–¥–¥–µ—Ä–∂–∏–≤–∞–µ—Ç PDF, DOC(X), DJVU (—á–µ—Ä–µ–∑ –∫–æ–Ω–≤–µ—Ä—Ç–∞—Ü–∏—é)."""
    ext = file_path.suffix.lower()
    
    if ext == ".pdf":
        return PyMuPDFLoader(str(file_path)).load()
    
    elif ext in (".doc", ".docx"):
        return UnstructuredWordDocumentLoader(str(file_path)).load()
    
    elif ext == ".djvu":
        # –ü—Ä–æ–≤–µ—Ä—è–µ–º, —É—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω –ª–∏ ddjvu
        if not shutil.which("ddjvu"):
            print("  ‚ö†Ô∏è –£—Ç–∏–ª–∏—Ç–∞ 'ddjvu' –Ω–µ –Ω–∞–π–¥–µ–Ω–∞. –£—Å—Ç–∞–Ω–æ–≤–∏—Ç–µ –ø–∞–∫–µ—Ç 'djvulibre'. DJVU-—Ñ–∞–π–ª –ø—Ä–æ–ø—É—â–µ–Ω.")
            return []
        
        try:
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
                tmp_pdf_path = Path(tmp_pdf.name)

            # –ö–æ–Ω–≤–µ—Ä—Ç–∏—Ä—É–µ–º DJVU ‚Üí PDF
            result = subprocess.run(
                ["ddjvu", "-format=pdf", str(file_path), str(tmp_pdf_path)],
                capture_output=True,
                text=True,
                timeout=120  # –º–∞–∫—Å 2 –º–∏–Ω—É—Ç—ã –Ω–∞ —Ñ–∞–π–ª
            )

            if result.returncode != 0:
                print(f"  ‚ö†Ô∏è –û—à–∏–±–∫–∞ –∫–æ–Ω–≤–µ—Ä—Ç–∞—Ü–∏–∏ DJVU ‚Üí PDF: {result.stderr}")
                tmp_pdf_path.unlink(missing_ok=True)
                return []

            # –ó–∞–≥—Ä—É–∂–∞–µ–º –ø–æ–ª—É—á–µ–Ω–Ω—ã–π PDF
            docs = PyMuPDFLoader(str(tmp_pdf_path)).load()
            tmp_pdf_path.unlink(missing_ok=True)  # —É–¥–∞–ª—è–µ–º –≤—Ä–µ–º–µ–Ω–Ω—ã–π —Ñ–∞–π–ª
            return docs

        except subprocess.TimeoutExpired:
            print(f"  ‚ö†Ô∏è –¢–∞–π–º–∞—É—Ç –∫–æ–Ω–≤–µ—Ä—Ç–∞—Ü–∏–∏ DJVU: {file_path.name}")
            tmp_pdf_path.unlink(missing_ok=True)
            return []
        except Exception as e:
            print(f"  ‚ö†Ô∏è –û—à–∏–±–∫–∞ –ø—Ä–∏ –æ–±—Ä–∞–±–æ—Ç–∫–µ DJVU: {e}")
            tmp_pdf_path.unlink(missing_ok=True)
            return []
    
    else:
        return []

def main():
    client = chromadb.PersistentClient(path=str(VECTOR_DB_PATH))
    all_sample_chunks = []

    for level in LEVELS:
        level_dir = DATA_DIR / level
        if not level_dir.exists():
            print(f"‚ö†Ô∏è –£—Ä–æ–≤–µ–Ω—å '{level}' –æ—Ç—Å—É—Ç—Å—Ç–≤—É–µ—Ç (–ø–∞–ø–∫–∞ {level_dir} –Ω–µ –Ω–∞–π–¥–µ–Ω–∞). –ü—Ä–æ–ø—É—Å–∫–∞–µ–º.")
            continue

        print(f"\nüìÇ –û–±—Ä–∞–±–æ—Ç–∫–∞ —É—Ä–æ–≤–Ω—è: {level}")
        file_paths = get_all_document_files(level_dir)
        print(f"  –ù–∞–π–¥–µ–Ω–æ —Ñ–∞–π–ª–æ–≤: {len(file_paths)}")

        if not file_paths:
            print(f"  ‚ö†Ô∏è –ù–µ—Ç –ø–æ–¥–¥–µ—Ä–∂–∏–≤–∞–µ–º—ã—Ö —Ñ–∞–π–ª–æ–≤ –≤ {level_dir}")
            continue

        # –ü–∞—Ä–∞–º–µ—Ç—Ä—ã —á–∞–Ω–∫–∏–Ω–≥–∞
        params = CHUNK_PARAMS.get(level, CHUNK_PARAMS["university"])
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=params["chunk_size"],
            chunk_overlap=params["chunk_overlap"],
            separators=["\n\n", "\n", ". ", " ", ""],
            length_function=len,
        )

        all_chunks = []
        all_metadatas = []
        all_texts = []
        all_ids = []

        for file_path in tqdm(file_paths, desc=f"  –ó–∞–≥—Ä—É–∑–∫–∞ —Ñ–∞–π–ª–æ–≤ ({level})"):
            if 'checkpoint' in file_path.name:
                continue
            try:
                documents = load_document(file_path)
                if not documents:
                    continue

                # –†–∞–∑–±–∏–≤–∞–µ–º –Ω–∞ —á–∞–Ω–∫–∏
                chunks = text_splitter.split_documents(documents)

                grade = extract_grade_from_path(file_path, level_dir)
                # –û—Ç–Ω–æ—Å–∏—Ç–µ–ª—å–Ω—ã–π –ø—É—Ç—å –¥–ª—è –∏—Å—Ç–æ—á–Ω–∏–∫–∞ (–±—É–¥–µ—Ç –ø–æ–∫–∞–∑–∞–Ω –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—é)
                source_rel_path = str(file_path.relative_to(DATA_DIR))

                for i, chunk in enumerate(chunks):
                    text = chunk.page_content.strip()
                    if not text:
                        continue

                    metadata = {
                        "level": level,
                        "grade": grade,
                        "source": source_rel_path,  # <-- —ç—Ç–æ –±—É–¥–µ—Ç –≤ –æ—Ç–≤–µ—Ç–µ –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—è!
                        "filename": file_path.name,
                    }
                    # –î–æ–±–∞–≤–ª—è–µ–º –æ—Ä–∏–≥–∏–Ω–∞–ª—å–Ω—ã–µ –º–µ—Ç–∞–¥–∞–Ω–Ω—ã–µ (–Ω–∞–ø—Ä–∏–º–µ—Ä, –Ω–æ–º–µ—Ä —Å—Ç—Ä–∞–Ω–∏—Ü—ã –∏–∑ PDF)
                    if hasattr(chunk, 'metadata') and isinstance(chunk.metadata, dict):
                        metadata.update({
                            k: v for k, v in chunk.metadata.items()
                            if isinstance(v, (str, int, float, bool)) and k not in metadata
                        })

                    chunk_id = f"{level}_{file_path.stem}_{i}"

                    all_texts.append(text)
                    all_metadatas.append(metadata)
                    all_ids.append(chunk_id)

                    # –°–æ–±–∏—Ä–∞–µ–º —Å—ç–º–ø–ª—ã (–Ω–µ –±–æ–ª–µ–µ 10 —á–∞–Ω–∫–æ–≤ –≤—Å–µ–≥–æ)
                    if len(all_sample_chunks) < 10:
                        all_sample_chunks.append({
                            "id": chunk_id,
                            "text": text[:200] + "..." if len(text) > 200 else text,
                            "metadata": metadata
                        })

            except Exception as e:
                print(f"\n  ‚ùå –û—à–∏–±–∫–∞ –ø—Ä–∏ –æ–±—Ä–∞–±–æ—Ç–∫–µ {file_path}: {e}")
                continue

        print(f"  –í—Å–µ–≥–æ —á–∞–Ω–∫–æ–≤ –¥–ª—è —É—Ä–æ–≤–Ω—è '{level}': {len(all_texts)}")

        if not all_texts:
            print(f"  ‚ö†Ô∏è –ù–µ—Ç —á–∞–Ω–∫–æ–≤ –¥–ª—è —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏—è –≤ {level}")
            continue

        # –°–æ—Ö—Ä–∞–Ω—è–µ–º –≤ Chroma
        # collection = client.get_or_create_collection(
        #     name=level,
        #     embedding_function=embedding_fn
        # )
        collection = client.get_or_create_collection(
            name=level,
            embedding_function=None  # –æ—Ç–∫–ª—é—á–∞–µ–º –∞–≤—Ç–æ-—ç–º–±–µ–¥–¥–∏–Ω–≥
        )
                # –ì–µ–Ω–µ—Ä–∏—Ä—É–µ–º —ç–º–±–µ–¥–¥–∏–Ω–≥–∏
        embeddings = embedding_model.encode(all_texts, convert_to_numpy=True, show_progress_bar=False)
        embeddings = embeddings.tolist()  # Chroma –æ–∂–∏–¥–∞–µ—Ç list[list[float]]
        
        collection.add(
            documents=all_texts,
            metadatas=all_metadatas,
            embeddings=embeddings,
            ids=all_ids
        )
        collection.add(
            documents=all_texts,
            metadatas=all_metadatas,
            ids=all_ids
        )

        print(f"  ‚úÖ –£—Ä–æ–≤–µ–Ω—å '{level}' —Å–æ—Ö—Ä–∞–Ω—ë–Ω –≤ Chroma.")

    # –°–æ—Ö—Ä–∞–Ω—è–µ–º —Å—ç–º–ø–ª—ã
    SAMPLES_PATH.parent.mkdir(parents=True, exist_ok=True)
    with open(SAMPLES_PATH, "w", encoding="utf-8") as f:
        json.dump(all_sample_chunks, f, indent=2, ensure_ascii=False, default=str)

    print(f"\nüéâ –ò–Ω–≥–µ—Å—Ç –∑–∞–≤–µ—Ä—à—ë–Ω. –°—ç–º–ø–ª—ã —á–∞–Ω–∫–æ–≤: {SAMPLES_PATH}")

# if __name__ == "__main__":
#     main()

In [9]:
VECTOR_DB_PATH

PosixPath('../vectorstore')

In [11]:
client = chromadb.PersistentClient(path=str(VECTOR_DB_PATH))
all_sample_chunks = []

for level in LEVELS:
    level_dir = DATA_DIR / level
    if not level_dir.exists():
        print(f"‚ö†Ô∏è –£—Ä–æ–≤–µ–Ω—å '{level}' –æ—Ç—Å—É—Ç—Å—Ç–≤—É–µ—Ç (–ø–∞–ø–∫–∞ {level_dir} –Ω–µ –Ω–∞–π–¥–µ–Ω–∞). –ü—Ä–æ–ø—É—Å–∫–∞–µ–º.")
        continue

    print(f"\nüìÇ –û–±—Ä–∞–±–æ—Ç–∫–∞ —É—Ä–æ–≤–Ω—è: {level}")
    file_paths = get_all_document_files(level_dir)
    print(f"  –ù–∞–π–¥–µ–Ω–æ —Ñ–∞–π–ª–æ–≤: {len(file_paths)}")

    if not file_paths:
        print(f"  ‚ö†Ô∏è –ù–µ—Ç –ø–æ–¥–¥–µ—Ä–∂–∏–≤–∞–µ–º—ã—Ö —Ñ–∞–π–ª–æ–≤ –≤ {level_dir}")
        continue

    # –ü–∞—Ä–∞–º–µ—Ç—Ä—ã —á–∞–Ω–∫–∏–Ω–≥–∞
    params = CHUNK_PARAMS.get(level, CHUNK_PARAMS["university"])
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=params["chunk_size"],
        chunk_overlap=params["chunk_overlap"],
        separators=["\n\n", "\n", ". ", " ", ""],
        length_function=len,
    )

    all_chunks = []
    all_metadatas = []
    all_texts = []
    all_ids = []

    for file_path in tqdm(file_paths, desc=f"  –ó–∞–≥—Ä—É–∑–∫–∞ —Ñ–∞–π–ª–æ–≤ ({level})"):
        if 'checkpoint' in file_path.name:
            continue
        try:
            documents = load_document(file_path)
            if not documents:
                continue

            # –†–∞–∑–±–∏–≤–∞–µ–º –Ω–∞ —á–∞–Ω–∫–∏
            chunks = text_splitter.split_documents(documents)

            grade = extract_grade_from_path(file_path, level_dir)
            # –û—Ç–Ω–æ—Å–∏—Ç–µ–ª—å–Ω—ã–π –ø—É—Ç—å –¥–ª—è –∏—Å—Ç–æ—á–Ω–∏–∫–∞ (–±—É–¥–µ—Ç –ø–æ–∫–∞–∑–∞–Ω –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—é)
            source_rel_path = str(file_path.relative_to(DATA_DIR))

            for i, chunk in enumerate(chunks):
                text = chunk.page_content.strip()
                if not text:
                    continue

                metadata = {
                    "level": level,
                    "grade": grade,
                    "source": source_rel_path,  # <-- —ç—Ç–æ –±—É–¥–µ—Ç –≤ –æ—Ç–≤–µ—Ç–µ –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—è!
                    "filename": file_path.name,
                }
                # –î–æ–±–∞–≤–ª—è–µ–º –æ—Ä–∏–≥–∏–Ω–∞–ª—å–Ω—ã–µ –º–µ—Ç–∞–¥–∞–Ω–Ω—ã–µ (–Ω–∞–ø—Ä–∏–º–µ—Ä, –Ω–æ–º–µ—Ä —Å—Ç—Ä–∞–Ω–∏—Ü—ã –∏–∑ PDF)
                if hasattr(chunk, 'metadata') and isinstance(chunk.metadata, dict):
                    metadata.update({
                        k: v for k, v in chunk.metadata.items()
                        if isinstance(v, (str, int, float, bool)) and k not in metadata
                    })

                chunk_id = f"{level}_{file_path.stem}_{i}"

                all_texts.append(text)
                all_metadatas.append(metadata)
                all_ids.append(chunk_id)

                # –°–æ–±–∏—Ä–∞–µ–º —Å—ç–º–ø–ª—ã (–Ω–µ –±–æ–ª–µ–µ 10 —á–∞–Ω–∫–æ–≤ –≤—Å–µ–≥–æ)
                if len(all_sample_chunks) < 10:
                    all_sample_chunks.append({
                        "id": chunk_id,
                        "text": text[:200] + "..." if len(text) > 200 else text,
                        "metadata": metadata
                    })

        except Exception as e:
            print(f"\n  ‚ùå –û—à–∏–±–∫–∞ –ø—Ä–∏ –æ–±—Ä–∞–±–æ—Ç–∫–µ {file_path}: {e}")
            continue

    print(f"  –í—Å–µ–≥–æ —á–∞–Ω–∫–æ–≤ –¥–ª—è —É—Ä–æ–≤–Ω—è '{level}': {len(all_texts)}")

    if not all_texts:
        print(f"  ‚ö†Ô∏è –ù–µ—Ç —á–∞–Ω–∫–æ–≤ –¥–ª—è —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏—è –≤ {level}")
        continue

    # –°–æ—Ö—Ä–∞–Ω—è–µ–º –≤ Chroma
    # collection = client.get_or_create_collection(
    #     name=level,
    #     embedding_function=embedding_fn
    # )
    collection = client.get_or_create_collection(
        name=level,
        embedding_function=None  # –æ—Ç–∫–ª—é—á–∞–µ–º –∞–≤—Ç–æ-—ç–º–±–µ–¥–¥–∏–Ω–≥
    )
            # –ì–µ–Ω–µ—Ä–∏—Ä—É–µ–º —ç–º–±–µ–¥–¥–∏–Ω–≥–∏
    embeddings = embedding_model.encode(all_texts, convert_to_numpy=True, show_progress_bar=False)
    embeddings = embeddings.tolist()  # Chroma –æ–∂–∏–¥–∞–µ—Ç list[list[float]]
    
    collection.add(
        documents=all_texts,
        metadatas=all_metadatas,
        embeddings=embeddings,
        ids=all_ids
    )
    collection.add(
        documents=all_texts,
        metadatas=all_metadatas,
        ids=all_ids
    )

    print(f"  ‚úÖ –£—Ä–æ–≤–µ–Ω—å '{level}' —Å–æ—Ö—Ä–∞–Ω—ë–Ω –≤ Chroma.")

# –°–æ—Ö—Ä–∞–Ω—è–µ–º —Å—ç–º–ø–ª—ã
SAMPLES_PATH.parent.mkdir(parents=True, exist_ok=True)
with open(SAMPLES_PATH, "w", encoding="utf-8") as f:
    json.dump(all_sample_chunks, f, indent=2, ensure_ascii=False, default=str)

print(f"\nüéâ –ò–Ω–≥–µ—Å—Ç –∑–∞–≤–µ—Ä—à—ë–Ω. –°—ç–º–ø–ª—ã —á–∞–Ω–∫–æ–≤: {SAMPLES_PATH}")

‚ö†Ô∏è –£—Ä–æ–≤–µ–Ω—å 'elementary' –æ—Ç—Å—É—Ç—Å—Ç–≤—É–µ—Ç (–ø–∞–ø–∫–∞ ../data/elementary –Ω–µ –Ω–∞–π–¥–µ–Ω–∞). –ü—Ä–æ–ø—É—Å–∫–∞–µ–º.

üìÇ –û–±—Ä–∞–±–æ—Ç–∫–∞ —É—Ä–æ–≤–Ω—è: middle_school
  –ù–∞–π–¥–µ–Ω–æ —Ñ–∞–π–ª–æ–≤: 39


  –ó–∞–≥—Ä—É–∑–∫–∞ —Ñ–∞–π–ª–æ–≤ (middle_school): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 39/39 [04:06<00:00,  6.33s/it]


  –í—Å–µ–≥–æ —á–∞–Ω–∫–æ–≤ –¥–ª—è —É—Ä–æ–≤–Ω—è 'middle_school': 977


InternalError: Query error: Database error: error returned from database: (code: 1032) attempt to write a readonly database

In [None]:
collection = client.get_or_create_collection(
    name=level,
    embedding_function=None  # –æ—Ç–∫–ª—é—á–∞–µ–º –∞–≤—Ç–æ-—ç–º–±–µ–¥–¥–∏–Ω–≥
)
        # –ì–µ–Ω–µ—Ä–∏—Ä—É–µ–º —ç–º–±–µ–¥–¥–∏–Ω–≥–∏
embeddings = embedding_model.encode(all_texts, convert_to_numpy=True, show_progress_bar=False)
embeddings = embeddings.tolist()  # Chroma –æ–∂–∏–¥–∞–µ—Ç list[list[float]]

collection.add(
    documents=all_texts,
    metadatas=all_metadatas,
    embeddings=embeddings,
    ids=all_ids
)
collection.add(
    documents=all_texts,
    metadatas=all_metadatas,
    ids=all_ids
)

In [2]:
all_sample_chunks

NameError: name 'all_sample_chunks' is not defined

In [4]:
# inspect_chunks.py
from pathlib import Path
import chromadb

# –ü—É—Ç—å –∫ –≤–∞—à–µ–π Chroma-–±–∞–∑–µ
VECTOR_DB_PATH = Path("./vectorstore")  # –∏–ª–∏ "../vectorstore", –≤ –∑–∞–≤–∏—Å–∏–º–æ—Å—Ç–∏ –æ—Ç —Ä–∞—Å–ø–æ–ª–æ–∂–µ–Ω–∏—è


client = chromadb.PersistentClient(path=str(VECTOR_DB_PATH))
collections = client.list_collections()

if not collections:
    print("‚ùå –ù–µ—Ç –∫–æ–ª–ª–µ–∫—Ü–∏–π –≤ –±–∞–∑–µ.")
    # return

print("üìö –î–æ—Å—Ç—É–ø–Ω—ã–µ –∫–æ–ª–ª–µ–∫—Ü–∏–∏:")
for i, col in enumerate(collections):
    print(f"  {i+1}. {col.name} (—á–∞–Ω–∫–æ–≤: {col.count()})")

print("\n" + "="*60)

# –ü—Ä–æ—Å–º–æ—Ç—Ä –≤—Å–µ—Ö –∫–æ–ª–ª–µ–∫—Ü–∏–π
for collection in collections:
    print(f"\nüîç –ö–æ–ª–ª–µ–∫—Ü–∏—è: {collection.name}")
    print("-" * 40)
    
    # –ü–æ–ª—É—á–∞–µ–º –ø–µ—Ä–≤—ã–µ 3 —á–∞–Ω–∫–∞
    try:
        data = collection.peek(limit=3)  # peek ‚Äî –±–µ–∑–æ–ø–∞—Å–Ω—ã–π –ø—Ä–æ—Å–º–æ—Ç—Ä
    except Exception as e:
        print(f"  ‚ö†Ô∏è –û—à–∏–±–∫–∞ –ø—Ä–∏ —á—Ç–µ–Ω–∏–∏: {e}")
        continue

    n = len(data['ids'])
    for i in range(n):
        print(f"\n[ID] {data['ids'][i]}")
        print(f"[–¢–µ–∫—Å—Ç] {data['documents'][i][:300]}{'...' if len(data['documents'][i]) > 300 else ''}")
        print(f"[–ú–µ—Ç–∞–¥–∞–Ω–Ω—ã–µ] {data['metadatas'][i]}")
    
    if n == 0:
        print("  (–ø—É—Å—Ç–æ)")



‚ùå –ù–µ—Ç –∫–æ–ª–ª–µ–∫—Ü–∏–π –≤ –±–∞–∑–µ.
üìö –î–æ—Å—Ç—É–ø–Ω—ã–µ –∫–æ–ª–ª–µ–∫—Ü–∏–∏:



In [5]:
collections

[]

In [6]:
# –ü—Ä–∏–º–µ—Ä: –Ω–∞–π—Ç–∏ —á–∞–Ω–∫–∏ –∏–∑ –∫–æ–Ω–∫—Ä–µ—Ç–Ω–æ–≥–æ —Ñ–∞–π–ª–∞
collection = client.get_collection("high_school")
results = collection.get(
    where={"source": "high_school/10-11 –∫–ª–∞—Å—Å—ã/–ê–ª–≥–µ–±—Ä–∞ –∏ –Ω–∞—á–∞–ª–∞ –º–∞—Ç. –∞–Ω–∞–ª–∏–∑–∞. 11–∫–ª. –ß.2. –ó–∞–¥–∞—á–Ω–∏–∫.doc"}
)
print(f"–ù–∞–π–¥–µ–Ω–æ —á–∞–Ω–∫–æ–≤: {len(results['ids'])}")
for i in range(min(2, len(results['ids']))):
    print("‚Äî", results['documents'][i][:200])

NotFoundError: Collection [high_school] does not exist

In [11]:

all_metadata = []
for level in LEVELS:
    level_dir = DATA_DIR / level
    if not level_dir.exists():
        print(f"‚ö†Ô∏è –£—Ä–æ–≤–µ–Ω—å '{level}' –æ—Ç—Å—É—Ç—Å—Ç–≤—É–µ—Ç (–ø–∞–ø–∫–∞ {level_dir} –Ω–µ –Ω–∞–π–¥–µ–Ω–∞). –ü—Ä–æ–ø—É—Å–∫–∞–µ–º.")
        continue

    print(f"\nüìÇ –û–±—Ä–∞–±–æ—Ç–∫–∞ —É—Ä–æ–≤–Ω—è: {level}")
    file_paths = get_all_document_files(level_dir)
    print(f"  –ù–∞–π–¥–µ–Ω–æ —Ñ–∞–π–ª–æ–≤: {len(file_paths)}")

    if not file_paths:
        print(f"  ‚ö†Ô∏è –ù–µ—Ç –ø–æ–¥–¥–µ—Ä–∂–∏–≤–∞–µ–º—ã—Ö —Ñ–∞–π–ª–æ–≤ –≤ {level_dir}")
        continue

    # –ü–∞—Ä–∞–º–µ—Ç—Ä—ã —á–∞–Ω–∫–∏–Ω–≥–∞
    params = CHUNK_PARAMS.get(level, CHUNK_PARAMS["university"])
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=params["chunk_size"],
        chunk_overlap=params["chunk_overlap"],
        separators=["\n\n", "\n", ". ", " ", ""],
        length_function=len,
    )

    all_chunks = []
    all_metadatas = []
    all_texts = []
    all_ids = []
    metadata = {}

    for file_path in tqdm(file_paths, desc=f"  –ó–∞–≥—Ä—É–∑–∫–∞ —Ñ–∞–π–ª–æ–≤ ({level})"):
        if 'checkpoint' in file_path.name:
            continue
        try:
            documents = load_document(file_path)
            if not documents:
                continue

            # –†–∞–∑–±–∏–≤–∞–µ–º –Ω–∞ —á–∞–Ω–∫–∏
            # chunks = text_splitter.split_documents(documents)

            grade = extract_grade_from_path(file_path, level_dir)
            # –û—Ç–Ω–æ—Å–∏—Ç–µ–ª—å–Ω—ã–π –ø—É—Ç—å –¥–ª—è –∏—Å—Ç–æ—á–Ω–∏–∫–∞ (–±—É–¥–µ—Ç –ø–æ–∫–∞–∑–∞–Ω –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—é)
            source_rel_path = str(file_path.relative_to(DATA_DIR))

            # for i, chunk in enumerate(chunks):
            #     text = chunk.page_content.strip()
            #     if not text:
            #         continue

            #     metadata = {
            #         "level": level,
            #         "grade": grade,
            #         "source": source_rel_path,  # <-- —ç—Ç–æ –±—É–¥–µ—Ç –≤ –æ—Ç–≤–µ—Ç–µ –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—è!
            #         "filename": file_path.name,
            #     }
            #     # –î–æ–±–∞–≤–ª—è–µ–º –æ—Ä–∏–≥–∏–Ω–∞–ª—å–Ω—ã–µ –º–µ—Ç–∞–¥–∞–Ω–Ω—ã–µ (–Ω–∞–ø—Ä–∏–º–µ—Ä, –Ω–æ–º–µ—Ä —Å—Ç—Ä–∞–Ω–∏—Ü—ã –∏–∑ PDF)
            #     if hasattr(chunk, 'metadata') and isinstance(chunk.metadata, dict):
            #         metadata.update({
            #             k: v for k, v in chunk.metadata.items()
            #             if isinstance(v, (str, int, float, bool)) and k not in metadata
            #         })

            #     chunk_id = f"{level}_{file_path.stem}_{i}"

            #     all_texts.append(text)
            #     all_metadatas.append(metadata)
            #     all_ids.append(chunk_id)

            #     # –°–æ–±–∏—Ä–∞–µ–º —Å—ç–º–ø–ª—ã (–Ω–µ –±–æ–ª–µ–µ 10 —á–∞–Ω–∫–æ–≤ –≤—Å–µ–≥–æ)
            #     if len(all_sample_chunks) < 10:
            #         all_sample_chunks.append({
            #             "id": chunk_id,
            #             "text": text[:200] + "..." if len(text) > 200 else text,
            #             "metadata": metadata
            #         })

        except Exception as e:
            print(f"\n  ‚ùå –û—à–∏–±–∫–∞ –ø—Ä–∏ –æ–±—Ä–∞–±–æ—Ç–∫–µ {file_path}: {e}")
            continue
            
        all_metadata.append(metadata)
    
    print(f"  –í—Å–µ–≥–æ —á–∞–Ω–∫–æ–≤ –¥–ª—è —É—Ä–æ–≤–Ω—è '{level}': {len(all_texts)}")

    if not all_texts:
        print(f"  ‚ö†Ô∏è –ù–µ—Ç —á–∞–Ω–∫–æ–≤ –¥–ª—è —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏—è –≤ {level}")
        continue

    # –°–æ—Ö—Ä–∞–Ω—è–µ–º –≤ Chroma
    # collection = client.get_or_create_collection(
    #     name=level,
    #     embedding_function=embedding_fn
    # )
    # collection.add(
    #     documents=all_texts,
    #     metadatas=all_metadatas,
    #     ids=all_ids
    # )
    print(f"  ‚úÖ –£—Ä–æ–≤–µ–Ω—å '{level}' —Å–æ—Ö—Ä–∞–Ω—ë–Ω –≤ Chroma.")

# –°–æ—Ö—Ä–∞–Ω—è–µ–º —Å—ç–º–ø–ª—ã
# SAMPLES_PATH.parent.mkdir(parents=True, exist_ok=True)
# with open(SAMPLES_PATH, "w", encoding="utf-8") as f:
#     json.dump(all_sample_chunks, f, indent=2, ensure_ascii=False, default=str)

print(f"\nüéâ –ò–Ω–≥–µ—Å—Ç –∑–∞–≤–µ—Ä—à—ë–Ω. –°—ç–º–ø–ª—ã —á–∞–Ω–∫–æ–≤: {SAMPLES_PATH}")

‚ö†Ô∏è –£—Ä–æ–≤–µ–Ω—å 'elementary' –æ—Ç—Å—É—Ç—Å—Ç–≤—É–µ—Ç (–ø–∞–ø–∫–∞ ../data/elementary –Ω–µ –Ω–∞–π–¥–µ–Ω–∞). –ü—Ä–æ–ø—É—Å–∫–∞–µ–º.

üìÇ –û–±—Ä–∞–±–æ—Ç–∫–∞ —É—Ä–æ–≤–Ω—è: middle_school
  –ù–∞–π–¥–µ–Ω–æ —Ñ–∞–π–ª–æ–≤: 39


  –ó–∞–≥—Ä—É–∑–∫–∞ —Ñ–∞–π–ª–æ–≤ (middle_school):  13%|‚ñà‚ñè       | 5/39 [01:50<12:30, 22.07s/it]


KeyboardInterrupt: 

In [12]:
all_metadata

[{}, {}, {}, {}, {}]