In [1]:
from pathlib import Path
import os
import ast
import re
from typing import Tuple, Dict, Any, List

from dotenv import load_dotenv
from langchain_core.documents import Document
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langdetect import detect
from deep_translator import GoogleTranslator


In [2]:
load_dotenv()

CHUNKS_DIR = Path(os.environ["MOODLE_CHUNKS_DIR"])
PERSIST_DIR = os.environ["MOODLE_CHROMA_DB_DIR"]
COLLECTION_NAME = os.environ.get("MOODLE_COLLECTION_NAME", "moodle_docs")


In [None]:
model_name = "BAAI/bge-base-en-v1.5"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
hf_embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

In [4]:

def parse_chunk_file(path: Path) -> Tuple[str, Dict[str, Any]]:
    text = path.read_text(encoding="utf-8").strip()

    # Ожидаем формат:
    # ---
    # {'doc_title': ..., ...}
    # ---
    # <content>
    m = re.match(r"^\s*---\s*\n(.*?)\n---\s*\n?(.*)$", text, flags=re.DOTALL)
    if not m:
        # если вдруг файл без меты
        return text, {"source_file": str(path)}

    meta_str = m.group(1).strip()
    content = m.group(2).strip()

    try:
        metadata = ast.literal_eval(meta_str)  # т.к. у тебя сейчас python dict строкой
        if not isinstance(metadata, dict):
            metadata = {}
    except Exception:
        metadata = {}

    metadata["source_file"] = str(path)
    return content, metadata


In [5]:
docs: List[Document] = []

for chunk_path in sorted(CHUNKS_DIR.rglob("chunk_*.md")):
    content, metadata = parse_chunk_file(chunk_path)
    if not content.strip():
        continue
    docs.append(Document(page_content=content, metadata=metadata))

print(f"Loaded documents: {len(docs)}")


Loaded documents: 2697


In [6]:
# убираем пустые списки в metadata иначе chroma падает

for d in docs:
    d.metadata = {
        k: v for k, v in d.metadata.items()
        if not (isinstance(v, list) and len(v) == 0)
    }

In [7]:
batch_size = 1000

vector_store = Chroma(
    collection_name=COLLECTION_NAME,
    embedding_function=hf_embeddings,
    persist_directory=PERSIST_DIR,
)

for i in range(0, len(docs), batch_size):
    vector_store.add_documents(docs[i:i + batch_size])

print("Indexed:", len(docs))


Indexed: 2697


In [8]:
query = "How do I create a new course in Moodle?"

results = vector_store.similarity_search_with_score(query, k=3)

for i, (doc, score) in enumerate(results, 1):
    sim = 1 - score

    print(f"\n=== Result {i} ===")
    print("Similarity:", sim)              # больше = лучше
    print("Metadata:", doc.metadata)
    print("Text preview:", doc.page_content[:400], "...")




=== Result 1 ===
Similarity: 0.7398324012756348
Metadata: {'chunk_index': 0, 'source_file': '/Users/sergey/Desktop/Moodle_RAG/data/moodle_docs/chunks_md/403__en__Adding_a_new_course/chunk_0000.md', 'youtube_links': ['https://youtu.be/MzK2jb-9SwE'], 'h2': 'Adding a course', 'h1': 'Adding a new course', 'source_links': ['https://docs.moodle.org/403/en/Adding_a_new_course', 'https://docs.moodle.org/403/en/File:26addcourse1.png', 'https://docs.moodle.org/403/en/File:26defaultcoursevalues.png', 'https://docs.moodle.org/403/en/File:coursesort.png', 'https://docs.moodle.org/403/en/File:template1.png', 'https://docs.moodle.org/403/en/File:template2.png', 'https://docs.moodle.org/403/en/images_en/1/14/26defaultcoursevalues.png', 'https://docs.moodle.org/403/en/images_en/6/60/coursesort.png', 'https://docs.moodle.org/403/en/images_en/f/f5/newcoursesavereturn.png', 'https://docs.moodle.org/403/en/images_en/thumb/1/14/26defaultcoursevalues.png/300px-26defaultcoursevalues.png', 'https://docs.moodl

In [9]:
def prepare_query(user_query: str):
    lang = detect(user_query)
    if lang == "ru":
        query_en = GoogleTranslator(source="ru", target="en").translate(user_query)
    else:
        query_en = user_query
    return query_en, lang


In [10]:
# короткая проверка translate -> retrieve + similarity

user_query = "Как создать новый курс в Moodle?"

query_en, user_lang = prepare_query(user_query)
results = vector_store.similarity_search_with_score(query_en, k=5)

print("user_lang:", user_lang)
print("query_en:", query_en)
print("retrieved:", len(results))

for i, (doc, score) in enumerate(results, 1):
    sim = 1 - score  # условная cosine similarity

    print(f"\n=== Result {i} ===")
    print("Similarity:", sim)          # больше = лучше
    print("Metadata:", doc.metadata)
    print("Text preview:", doc.page_content[:400], "...")


user_lang: ru
query_en: How to create a new course in Moodle?
retrieved: 5

=== Result 1 ===
Similarity: 0.7389518618583679
Metadata: {'youtube_links': ['https://youtu.be/MzK2jb-9SwE'], 'chunk_index': 0, 'h2': 'Adding a course', 'source_links': ['https://docs.moodle.org/403/en/Adding_a_new_course', 'https://docs.moodle.org/403/en/File:26addcourse1.png', 'https://docs.moodle.org/403/en/File:26defaultcoursevalues.png', 'https://docs.moodle.org/403/en/File:coursesort.png', 'https://docs.moodle.org/403/en/File:template1.png', 'https://docs.moodle.org/403/en/File:template2.png', 'https://docs.moodle.org/403/en/images_en/1/14/26defaultcoursevalues.png', 'https://docs.moodle.org/403/en/images_en/6/60/coursesort.png', 'https://docs.moodle.org/403/en/images_en/f/f5/newcoursesavereturn.png', 'https://docs.moodle.org/403/en/images_en/thumb/1/14/26defaultcoursevalues.png/300px-26defaultcoursevalues.png', 'https://docs.moodle.org/403/en/images_en/thumb/1/14/26defaultcoursevalues.png/450px-26default