In [5]:
# pip install sentence-transformers faiss-cpu
from sentence_transformers import SentenceTransformer
import numpy as np, faiss, json, re, unicodedata

# Checkpoint identifier passed to SentenceTransformer below.
model_name = "intfloat/multilingual-e5-base"
# Instantiate the encoder once at module level; the embedding code further
# down reuses this `model` object for every encode() call.
model = SentenceTransformer(model_name)

# ---------- НОРМАЛИЗАЦИЯ ----------
# Invisible code points that are deleted outright during normalization.
_ZW_CHARS = [
    "\u200B",  # ZERO WIDTH SPACE
    "\u200C",  # ZERO WIDTH NON-JOINER
    "\u200D",  # ZERO WIDTH JOINER
    "\uFEFF",  # BOM
]


def normalize_text(s: str) -> str:
    """Return *s* cleaned up and flattened to a single line.

    Steps, in order:

    1. Unicode NFC composition.
    2. NBSP is replaced by a regular space.
    3. Zero-width characters listed in ``_ZW_CHARS`` are removed.
    4. Characters in a Unicode "C*" category (control, format, ...) are
       removed, except whitespace ones.
    5. Every run of whitespace (including newlines) collapses to one space
       and the result is stripped.

    Args:
        s: Text to normalize. Any falsy value (``""``, ``None``) is accepted.

    Returns:
        The normalized text, or ``""`` for falsy input.
    """
    if not s:
        return ""
    s = unicodedata.normalize("NFC", s)
    s = s.replace("\u00A0", " ")
    for z in _ZW_CHARS:
        s = s.replace(z, "")
    # Tabs, CR, form feed etc. are category "Cc" but are still whitespace.
    # They must survive this filter so the collapse below turns them into a
    # single space; deleting them here would glue neighbouring words together
    # ("a\tb" -> "ab").
    s = "".join(
        ch for ch in s
        if ch.isspace() or unicodedata.category(ch)[0] != "C"
    )
    return re.sub(r"\s+", " ", s).strip()

def parse_md_chunks(filepath):
    """Split a Markdown file into chunks separated by ``---`` lines.

    Each non-empty chunk becomes a dict with the keys ``"id"``, ``"title"``
    and ``"body"``. Ids are sequential (``chunk_001``, ``chunk_002``, ...)
    and count only non-empty chunks. When a chunk starts with a ``#`` heading
    line followed by a newline, the heading text is the title and the rest is
    the body; otherwise the id doubles as the title and the whole chunk is
    the body. Title and body are passed through ``normalize_text``.

    Args:
        filepath: Path of the Markdown file to read (UTF-8, with or without
            a byte-order mark).

    Returns:
        A list of chunk dicts in file order.
    """
    # "utf-8-sig" drops a leading BOM if present. With plain "utf-8" the BOM
    # stays in the text as U+FEFF, which neither strip() nor \s removes, so
    # the heading pattern would never match the first chunk of such a file.
    with open(filepath, encoding="utf-8-sig") as f:
        content = f.read()

    stripped_parts = (
        part.strip()
        for part in re.split(r'^\s*---\s*$', content, flags=re.MULTILINE)
    )
    docs = []
    # filter(None, ...) skips empty chunks so they do not consume an id.
    for idx, chunk in enumerate(filter(None, stripped_parts), start=1):
        chunk_id = f"chunk_{idx:03}"
        # NOTE: the pattern requires at least one newline after the heading,
        # so a chunk consisting of a heading line only takes the fallback
        # branch and keeps the heading inside its body.
        m = re.match(r'^\s*#\s*(.+?)\s*(?:\r?\n)+(.*)\Z', chunk, flags=re.DOTALL)
        if m:
            title = normalize_text(m.group(1))
            body = normalize_text(m.group(2))
        else:
            title = chunk_id
            body = normalize_text(chunk)
        docs.append({"id": chunk_id, "title": title, "body": body})
    return docs

docs = parse_md_chunks("data.md")
if not docs:
    # Nothing to embed or index; stop with a clear message instead of
    # failing later while reading the embedding dimension.
    raise ValueError("data.md contains no chunks to index")

# --- Prepare the texts with the prefixes the E5 models are trained on ---
# E5 checkpoints expect every input to start with "query: " or "passage: ".
# Titles and bodies are the indexed side, so both get "passage: ".
# Search-time queries against these indexes must be encoded with "query: ".
titles_prepared = [f"passage: {d['title']}" for d in docs]
bodies_prepared = [f"passage: {d['body']}" for d in docs]

# normalize_embeddings=True yields unit-length vectors, so the inner product
# computed by IndexFlatIP below equals cosine similarity.
E_title = model.encode(titles_prepared, batch_size=64, show_progress_bar=True, normalize_embeddings=True)
E_body  = model.encode(bodies_prepared, batch_size=64, show_progress_bar=True, normalize_embeddings=True)

# The arrays are converted to float32 before being added to the indexes.
E_title = np.asarray(E_title, dtype="float32")
E_body  = np.asarray(E_body,  dtype="float32")
assert E_title.shape == E_body.shape
dim = E_title.shape[1]

# One flat inner-product index per field; row i of each index corresponds to
# docs[i], which is why the metadata is saved in the same order.
idx_title = faiss.IndexFlatIP(dim)
idx_body  = faiss.IndexFlatIP(dim)
idx_title.add(E_title)
idx_body.add(E_body)

# Persist both indexes and the chunk metadata next to each other.
faiss.write_index(idx_title, "smartwop_title.faiss")
faiss.write_index(idx_body,  "smartwop_body.faiss")
with open("smartwop_docs_meta.json", "w", encoding="utf-8") as f:
    json.dump(docs, f, ensure_ascii=False, indent=2)

print(f"Готово: {idx_title.ntotal} title-векторов, {idx_body.ntotal} body-векторов")

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Готово: 259 title-векторов, 259 body-векторов
