In [None]:
%pip install -q chroma sentence_transformers ipywidgets pymupdf4llm pypandoc-binary

In [10]:
import logging
import os
import re
import subprocess
import uuid
from typing import Dict, List

import pymupdf4llm
from chromadb import ClientAPI, HttpClient
from sentence_transformers import SentenceTransformer

In [15]:
def office_to_pdf(file_path: str) -> str:
    """Convert an office document to PDF with headless LibreOffice.

    The PDF is written into the directory of the source file and keeps the
    source's base name with a ``.pdf`` extension.

    Args:
        file_path: Path to the document to convert.

    Returns:
        Path of the generated PDF file.

    Raises:
        FileNotFoundError: If the ``libreoffice`` executable cannot be started.
        RuntimeError: If LibreOffice exits with a non-zero status or the
            expected PDF does not exist after the conversion.
    """
    source_dir = os.path.dirname(file_path)
    base_name = os.path.basename(file_path)
    pdf_file = os.path.join(source_dir, os.path.splitext(base_name)[0] + ".pdf")

    # A bare file name has an empty dirname; pass the current directory
    # explicitly instead of handing LibreOffice an empty --outdir argument.
    output_dir = source_dir or "."

    result = subprocess.run(
        [
            "libreoffice",
            "--headless",
            "--convert-to",
            "pdf",
            file_path,
            "--outdir",
            output_dir,
        ],
        capture_output=True,
        text=True,
    )

    # Fail here with LibreOffice's own output rather than letting the caller
    # stumble over a missing PDF later.
    # NOTE(review): a stale PDF left by an earlier run would still pass the
    # existence check - confirm whether that matters for this pipeline.
    if result.returncode != 0 or not os.path.isfile(pdf_file):
        details = (result.stderr or result.stdout or "").strip()
        raise RuntimeError(
            f"LibreOffice failed to convert {file_path!r} to PDF: {details}"
        )
    return pdf_file


def _pdf_to_markdown(pdf_path: str) -> str:
    """Render a PDF to Markdown with the fixed pymupdf4llm settings used here."""
    return pymupdf4llm.to_markdown(
        pdf_path,
        write_images=False,
        embed_images=False,
        graphics_limit=None,
        margins=(0, 0, 0, 0),
        table_strategy="lines_strict",
        fontsize_limit=1,
        ignore_code=True,
        show_progress=False,
    )


def convert_to_markdown(file_path: str) -> Dict[str, object]:
    """Convert a supported document to Markdown and describe the conversion.

    ``.doc``, ``.docx`` and ``.rtf`` files are first converted to PDF with
    ``office_to_pdf``; ``.pdf`` files are read directly. The extension is
    matched case-insensitively and must match exactly.

    Args:
        file_path: Path to the document.

    Returns:
        A dict with two keys: ``"content"`` (the Markdown text) and
        ``"metadata"`` (a dict with ``original_file``, ``file_type`` and
        ``conversion_method``).

    Raises:
        ValueError: If the file extension is not one of the supported ones
            (this includes files without any extension).
    """
    _, ext = os.path.splitext(file_path)
    ext = ext.lower()

    # Compare whole extensions only: a substring test against ".pdf" would
    # also accept "", ".p" and ".pd".
    if ext in (".doc", ".docx", ".rtf"):
        content = _pdf_to_markdown(office_to_pdf(file_path))
        method = "office_to_pdf"
    elif ext == ".pdf":
        content = _pdf_to_markdown(file_path)
        method = "direct_pdf"
    else:
        raise ValueError(f"Unsupported file extension: {ext}")

    metadata = {
        "original_file": file_path,
        "file_type": ext[1:],  # strip the leading dot from the extension
        "conversion_method": method,
    }
    return {"content": content, "metadata": metadata}

def preprocess_markdown(markdown_content: str) -> str:
    """Normalise Markdown into lower-case plain text.

    The text is lower-cased, HTML comments are removed (including multi-line
    ones), every character that is not a word character, whitespace or one of
    ``.,;:?!-`` is replaced by a space, runs of whitespace are collapsed to a
    single space and the result is stripped.

    Args:
        markdown_content: Raw Markdown text.

    Returns:
        The cleaned text.
    """
    # (pattern, replacement, flags), applied in this order.
    substitutions = (
        (r'<!--.*?-->', '', re.DOTALL),
        (r'[^\w\s.,;:?!-]', ' ', 0),
        (r'\s+', ' ', 0),
    )
    text = markdown_content.lower()
    for pattern, replacement, flags in substitutions:
        text = re.sub(pattern, replacement, text, flags=flags)
    return text.strip()

def _chunk_record(content: str, start: int, end: int) -> dict:
    """Build one chunk entry from the ``content[start:end]`` slice."""
    return {"text": content[start:end], "start_index": start, "end_index": end}


def split_into_chunks(clean_content: str, chunk_size: int = 500) -> List[Dict[str, str]]:
    """Group sentences of ``clean_content`` into chunks of limited size.

    Sentences are delimited by whitespace that follows ``.``, ``!`` or ``?``.
    Sentences are appended to the current chunk until adding the next one
    would push the summed sentence lengths (separators not counted) above
    ``chunk_size``; a single sentence longer than ``chunk_size`` becomes a
    chunk of its own.

    Offsets are tracked while scanning, so every chunk satisfies
    ``clean_content[start_index:end_index] == text`` even when the same text
    occurs more than once or sentences are separated by several whitespace
    characters.

    Args:
        clean_content: Text to split.
        chunk_size: Soft upper bound on the summed sentence lengths per chunk.

    Returns:
        A list of dicts with ``text`` (str), ``start_index`` (int) and
        ``end_index`` (int, exclusive). Empty input yields an empty list.
    """
    # Locate each sentence as a (start, end) span of the original string.
    sentence_spans = []
    cursor = 0
    for boundary in re.finditer(r'(?<=[.!?])\s+', clean_content):
        sentence_spans.append((cursor, boundary.start()))
        cursor = boundary.end()
    sentence_spans.append((cursor, len(clean_content)))

    chunks = []
    chunk_start = None  # None means no sentence has been collected yet
    chunk_end = 0
    current_length = 0

    for start, end in sentence_spans:
        if start == end:
            # Empty span (empty input or trailing separator): nothing to store.
            continue
        sentence_length = end - start
        if chunk_start is not None and current_length + sentence_length > chunk_size:
            chunks.append(_chunk_record(clean_content, chunk_start, chunk_end))
            chunk_start = None
            current_length = 0
        if chunk_start is None:
            chunk_start = start
        chunk_end = end
        current_length += sentence_length

    if chunk_start is not None:
        chunks.append(_chunk_record(clean_content, chunk_start, chunk_end))

    return chunks

# Models already loaded in this kernel, keyed by model name, so that embedding
# several documents does not reload the same model for each one.
_EMBEDDING_MODELS = {}


def create_embeddings(chunks: List[Dict[str, str]], model_name: str = 'cointegrated/LaBSE-en-ru', batch_size: int = 8) -> List[List[float]]:
    """Embed the ``text`` of every chunk with a SentenceTransformer model.

    Args:
        chunks: Chunk dicts; only the ``"text"`` entry is read.
        model_name: Name of the SentenceTransformer model to load.
        batch_size: Batch size handed to ``model.encode``.

    Returns:
        One embedding per chunk, in input order, as plain lists of floats.
        An empty ``chunks`` list returns ``[]`` without loading the model.
    """
    if not chunks:
        return []

    model = _EMBEDDING_MODELS.get(model_name)
    if model is None:
        model = _EMBEDDING_MODELS[model_name] = SentenceTransformer(model_name)

    texts = [chunk["text"] for chunk in chunks]
    # encode() batches internally, so one call replaces the manual slicing and
    # shows a single progress bar for the whole document.
    vectors = model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
    )
    # Convert each row to a list so the result matches the declared return type.
    return [vector.tolist() for vector in vectors]

def save_to_chroma(
    embeddings: List[List[float]],
    chunks: List[Dict[str, str]],
    metadata: Dict[str, str],
    chroma_client: ClientAPI,
    collection_name: str = "my_collection",
    batch_size: int = 256,
):
    """Store chunks and their embeddings in a Chroma collection.

    Each chunk is stored under a fresh UUID. Its metadata is the document
    ``metadata`` extended with ``chunk_start``, ``chunk_end`` and ``text``.
    Records are sent in batches rather than one request per chunk.

    Args:
        embeddings: One embedding per chunk, in the same order as ``chunks``.
        chunks: Chunk dicts with ``text``, ``start_index`` and ``end_index``.
        metadata: Document-level metadata copied into every record.
        chroma_client: Client used to get or create the collection.
        collection_name: Name of the target collection.
        batch_size: Maximum number of records per ``collection.add`` call.

    Raises:
        ValueError: If ``embeddings`` and ``chunks`` differ in length or
            ``batch_size`` is smaller than 1.
    """
    if len(embeddings) != len(chunks):
        # zip() would silently drop the unmatched tail; treat it as a caller bug.
        raise ValueError(
            f"Got {len(embeddings)} embeddings for {len(chunks)} chunks"
        )
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    collection = chroma_client.get_or_create_collection(collection_name)

    for start in range(0, len(chunks), batch_size):
        stop = start + batch_size
        batch_chunks = chunks[start:stop]
        collection.add(
            ids=[str(uuid.uuid4()) for _ in batch_chunks],
            documents=[chunk["text"] for chunk in batch_chunks],
            embeddings=list(embeddings[start:stop]),
            metadatas=[
                {
                    **metadata,
                    "chunk_start": chunk["start_index"],
                    "chunk_end": chunk["end_index"],
                    "text": chunk["text"],
                }
                for chunk in batch_chunks
            ],
        )

def process_document(file_path: str, chroma_client: ClientAPI):
    """Run the full ingestion pipeline for a single document.

    The file is converted to Markdown, cleaned, split into chunks and
    embedded; the chunks are then stored through ``chroma_client`` together
    with the conversion metadata.

    Args:
        file_path: Path of the document to ingest.
        chroma_client: Client handed to ``save_to_chroma``.
    """
    converted = convert_to_markdown(file_path)
    raw_markdown, document_metadata = converted["content"], converted["metadata"]

    text_chunks = split_into_chunks(preprocess_markdown(raw_markdown))
    chunk_vectors = create_embeddings(text_chunks)
    save_to_chroma(chunk_vectors, text_chunks, document_metadata, chroma_client)

In [None]:
chroma_client = HttpClient(host='localhost', port=8000)
file_paths = ['./data/public/Готовность ОПОП 2017.docx', './data/public/Карты_компетенций ЗФ китайский.doc']
# process_document handles a single file, so ingest the paths one at a time.
for file_path in file_paths:
    process_document(file_path, chroma_client)

# Fetch the collection, preview a few stored chunks and report how many it holds.
collection = chroma_client.get_collection("my_collection")
results = collection.get(limit=5, include=["documents", "metadatas"])
# The root logger defaults to WARNING, so INFO output has to be enabled explicitly.
logging.basicConfig(level=logging.INFO, force=True)
logging.info(f"Number of documents in collection: {collection.count()}")