In [None]:
# pip install sentence-transformers faiss-cpu
from sentence_transformers import SentenceTransformer
import numpy as np, faiss, json, re

# Name of the sentence-embedding model; it is passed verbatim to
# SentenceTransformer, which builds the encoder used for every embedding below.
model_name = "intfloat/multilingual-e5-base"
model = SentenceTransformer(model_name)

def _split_heading(chunk):
    """Split a stripped chunk into ``(title, body)`` if it opens with a heading.

    The heading is the first line when it starts with one or more ``#``
    characters and still contains text once those markers are removed.
    Returns ``None`` when the chunk has no usable heading line.
    """
    if not chunk.startswith("#"):
        return None
    heading_line, _, rest = chunk.partition("\n")
    # Drop every leading '#' (so "## Foo" yields "Foo") and collapse runs of
    # whitespace inside the title to single spaces.
    title = re.sub(r"\s+", " ", heading_line.lstrip("#")).strip()
    if not title:
        return None
    return title, rest.strip()


def parse_md_chunks(filepath):
    """Parse a Markdown file into a list of chunk records.

    The file is split on lines that consist solely of ``---`` (surrounding
    whitespace allowed). Empty chunks are skipped. Every remaining chunk
    becomes a dict with the keys:

    * ``id``    -- sequential identifier ``chunk_001``, ``chunk_002``, ...
    * ``title`` -- text of the leading ``#`` heading (any level), or the
      chunk id when the chunk does not start with a heading
    * ``body``  -- the text after the heading line (``""`` for a
      heading-only chunk), or the whole chunk when there is no heading

    Args:
        filepath: Path of the Markdown file to read.

    Returns:
        A list of ``{"id", "title", "body"}`` dicts in file order.
    """
    # "utf-8-sig" reads plain UTF-8 unchanged but also drops a leading BOM,
    # which would otherwise hide the first chunk's heading marker.
    with open(filepath, encoding="utf-8-sig") as f:
        content = f.read()

    docs = []
    for chunk in re.split(r'^\s*---\s*$', content, flags=re.MULTILINE):
        chunk = chunk.strip()
        if not chunk:
            continue
        # Ids are numbered over the kept chunks only, so skipped empty chunks
        # do not leave gaps in the sequence.
        doc_id = f"chunk_{len(docs) + 1:03}"
        heading = _split_heading(chunk)
        title, body = heading if heading is not None else (doc_id, chunk)
        docs.append({"id": doc_id, "title": title, "body": body})
    return docs

# Build and persist two FAISS inner-product indexes (one over chunk titles,
# one over chunk bodies) plus a JSON file with the chunk metadata.
docs = parse_md_chunks("data.md")
if not docs:
    # Fail with a clear message instead of an IndexError further down when
    # there is nothing to embed.
    raise ValueError("data.md contains no chunks; nothing to index")

# IMPORTANT: clean the strings in a separate step so that the f-strings below
# contain no backslash inside their expression part.
titles_clean = [re.sub(r"\s+", " ", d["title"]).strip() for d in docs]
bodies_clean = [re.sub(r"\s+", " ", d["body"]).strip()  for d in docs]

# E5 models are trained with exactly two input prefixes, "query: " and
# "passage: "; any other prefix is out of distribution and hurts retrieval.
# Both titles and bodies are indexed documents, so both get "passage: "
# (search-time queries must be encoded with "query: ").
titles_prepared = [f"passage: {t}" for t in titles_clean]
bodies_prepared = [f"passage: {b}" for b in bodies_clean]

# normalize_embeddings=True is requested so that the inner product computed by
# IndexFlatIP corresponds to cosine similarity.
E_title = model.encode(titles_prepared, batch_size=64, show_progress_bar=True, normalize_embeddings=True)
E_body  = model.encode(bodies_prepared,  batch_size=64, show_progress_bar=True, normalize_embeddings=True)

# Convert to float32 arrays before handing the vectors to FAISS.
E_title = np.asarray(E_title, dtype="float32")
E_body  = np.asarray(E_body,  dtype="float32")
if E_title.shape != E_body.shape:
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    raise RuntimeError(
        f"title/body embedding shapes differ: {E_title.shape} vs {E_body.shape}"
    )
dim = E_title.shape[1]

idx_title = faiss.IndexFlatIP(dim)
idx_body  = faiss.IndexFlatIP(dim)
idx_title.add(E_title)
idx_body.add(E_body)

# Row i of both indexes corresponds to docs[i], which is why the metadata is
# written in the same order next to the index files.
faiss.write_index(idx_title, "smartwop_title.faiss")
faiss.write_index(idx_body,  "smartwop_body.faiss")
with open("smartwop_docs_meta.json", "w", encoding="utf-8") as f:
    json.dump(docs, f, ensure_ascii=False, indent=2)

print(f"Готово: {idx_title.ntotal} title-векторов, {idx_body.ntotal} body-векторов")
