In [1]:
import os
import json
import time
import hashlib
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional, Callable, Any
import numpy as np
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# document.py
class Document:
    """Container for one PDF-derived document: its text, metadata and identifier."""

    def __init__(self, pageContent: str, metadata: dict, id: str):
        # Raw text extracted from the PDF.
        self.pageContent = pageContent
        # PDF metadata (author, date, ...); stored by reference, not copied.
        self.metadata: dict = metadata
        # Identifier of the PDF.
        self.id = id

In [None]:
# Configs
# Both locations are resolved relative to the notebook's working directory.
# The path segments must NOT start with "/": pathlib treats such a segment as
# absolute and discards everything before it, which would place the storage
# and model folders at the filesystem root instead of under THIS.
THIS = Path.cwd()
STORAGE_DIR = THIS.joinpath("embedding")
MODEL_PATH = THIS.joinpath("models", "Q4_K_M.gguf")

In [4]:
class EmbeddingAdapter:
    """Base interface for embedding backends.

    The base class carries no embedding logic of its own: calling
    ``embed_texts`` on it always raises ``NotImplementedError``.
    """

    def embed_texts(self, text: List[str]) -> List[np.ndarray]:
        """Placeholder to be overridden by concrete adapters.

        Raises:
            NotImplementedError: always.
        """
        message = "Must be implemented by sub-class"
        raise NotImplementedError(message)
    
class DummyAdapter(EmbeddingAdapter):
    """Deterministic simulated embeddings for testing.

    Each text is hashed with SHA-256 and the hash seeds a NumPy random
    generator, so the same text always produces the same vector.
    """

    def __init__(self, dim: int = 384):
        # Length of every vector produced by embed_texts.
        self.dim = dim

    def embed_texts(self, texts: List[str]) -> List[np.ndarray]:
        """Return one float vector of length ``self.dim`` per input text.

        Each vector is scaled to unit length whenever its norm is non-zero.
        """
        out = []
        for text in texts:
            # Seed comes from the first 16 hex digits of the hash, reduced
            # modulo 2**31 - 1 to keep it within a 32-bit seed range.
            h = hashlib.sha256(text.encode("utf-8")).hexdigest()[:16]
            seed = int(h, 16) % (2**31 - 1)
            rng = np.random.RandomState(seed=seed)
            vec = rng.normal(size=(self.dim,))
            norm = np.linalg.norm(vec)
            if norm > 0:
                vec = vec / norm
            out.append(vec.astype(float))
        return out


# The two adapters below are module-level classes. Defining them inside
# DummyAdapter's body would make them reachable only as attributes of
# DummyAdapter, which is not how sibling adapters are meant to be used.
class SentenceTransformerAdapter(EmbeddingAdapter):
    """Adapter for using sentence transformer"""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def embed_texts(self, texts: List[str]) -> List[np.ndarray]:
        """Encode ``texts`` with the wrapped model and return one float array per text."""
        arr = self.model.encode(texts, show_progress_bar=False)
        # A 2-D array is split row by row; any other result is converted
        # element by element.
        if isinstance(arr, np.ndarray) and arr.ndim == 2:
            return [arr[i].astype(float) for i in range(arr.shape[0])]
        return [np.asarray(a, dtype=float) for a in arr]


class LlamaCppAdapter(EmbeddingAdapter):
    """Adapter for using Llama_cpp_python (not implemented yet)."""

    def __init__(self, model_name: str = "Q4_K_M", model_path = MODEL_PATH):
        # Fail loudly at construction time: a bare `NotImplementedError`
        # expression is a no-op and would hand back an unusable adapter.
        raise NotImplementedError("LlamaCppAdapter is not implemented yet")
            

In [8]:
# Utils
def stable_id_from_text(text: str, prefix: str = "doc") -> str:
    """Derive a deterministic identifier from a piece of text.

    The identifier is ``"<prefix>_<digest>"`` where ``digest`` is the first
    16 hex characters of the SHA-256 hash of the UTF-8 encoded text, so the
    same text and prefix always give the same id.

    Args:
        text: Text to derive the identifier from.
        prefix: Label placed in front of the digest.

    Returns:
        The identifier string.
    """
    digest = hashlib.sha256(text.encode("utf-8")).hexdigest()[:16]
    return f"{prefix}_{digest}"

def chunked(iterable: List[Any], batch_size: int):
    """Yield consecutive slices of ``iterable`` with at most ``batch_size`` items each.

    Args:
        iterable: A sequence supporting ``len()`` and slicing.
        batch_size: Maximum number of items per yielded slice; must be > 0.

    Yields:
        Slices of ``iterable`` in order; the last one may be shorter.

    Raises:
        ValueError: If ``batch_size`` is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A negative step would make range() empty and silently drop every item,
    # so reject non-positive sizes explicitly.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    for start in range(0, len(iterable), batch_size):
        yield iterable[start:start + batch_size]

def ensure_storage_dir():
    """Create STORAGE_DIR, including missing parent folders, if it is not there yet."""
    target = STORAGE_DIR
    # exist_ok makes repeated calls harmless when the folder already exists.
    target.mkdir(exist_ok=True, parents=True)

In [6]:
# Init the embedding model

def initializeEmbeddingModel(use_dummy: bool = True, **kwargs) -> EmbeddingAdapter:
    """Build the embedding adapter to use.

    Args:
        use_dummy: When true, return a ``DummyAdapter``.
        **kwargs: Forwarded to the adapter constructor.

    Returns:
        The constructed adapter.

    Raises:
        NotImplementedError: If ``use_dummy`` is false; no other adapter is
            wired up here.
    """
    if not use_dummy:
        # Raise rather than return the exception object, so callers never
        # receive a non-adapter value from this function.
        raise NotImplementedError("No Adapters available")
    return DummyAdapter(**kwargs)

In [11]:
def generate_embeddings(adapter: EmbeddingAdapter, documents: List[Document],
                        batch_size: int = 8,
                        on_progress: Optional[Callable[[int, int], None]] = None) -> List[Dict[str, Any]]:
    """Embed ``documents`` in batches and return one record dict per document.

    The storage directory is created first via ``ensure_storage_dir()``.
    Each record holds the keys ``id``, ``content``, ``metadata``,
    ``embedding`` (a list of floats) and ``timestamp`` (milliseconds since
    the epoch at record creation).

    Args:
        adapter: Object whose ``embed_texts`` turns a list of texts into vectors.
        documents: Documents to embed.
        batch_size: Number of documents sent to the adapter per call.
        on_progress: Optional callback invoked after every document with
            ``(processed_so_far, total)``.

    Returns:
        The list of record dicts, in document order.
    """
    ensure_storage_dir()
    total = len(documents)
    done = 0
    results: List[Dict[str, Any]] = []

    for group in chunked(documents, batch_size):
        embedded = adapter.embed_texts([item.pageContent for item in group])

        for item, vector in zip(group, embedded):
            # Prefer the document's own id; otherwise derive one from its text.
            if getattr(item, "id", None):
                record_id = item.id
            else:
                record_id = stable_id_from_text(item.pageContent)

            # Plain Python floats keep the record JSON-friendly.
            values = [float(x) for x in np.asarray(vector).tolist()]
            results.append({
                "id": record_id,
                "content": item.pageContent,
                "metadata": item.metadata,
                "embedding": values,
                "timestamp": int(time.time() * 1000),
            })

            done += 1
            if on_progress:
                on_progress(done, total)

    return results

In [12]:
def save_embeddings_json(embeddings: List[Dict[str, Any]], filename: str = "embeddings_json") -> Dict[str, Any]:
    """Save embeddings to STORAGE_DIR/<filename> as a JSON document.

    The file holds a small header (``version``, ``model``, ``dimensions``,
    ``count``, ``created``) followed by the records under ``embeddings``.

    Args:
        embeddings: Record dicts, each with an ``"embedding"`` list.
        filename: Name of the file to write inside STORAGE_DIR.

    Returns:
        A dict with ``filepath`` (str) and ``size`` (bytes written to disk).
    """
    # Local import: the file's top-level imports only bring in `datetime`.
    from datetime import timezone

    ensure_storage_dir()
    filepath = STORAGE_DIR.joinpath(filename)

    # UTC timestamp in ISO-8601 form with a trailing "Z". datetime.now()
    # takes a tzinfo (not a float), and a datetime cannot be concatenated
    # with a str, so the value is formatted explicitly.
    created = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

    data = {
        "version": "1.0",
        # NOTE(review): the model label is hard-coded, not taken from the
        # adapter that produced the embeddings.
        "model": "dummy" if isinstance(embeddings, list) and len(embeddings) > 0 else "unknown",
        "dimensions": len(embeddings[0]["embedding"]) if embeddings else None,
        "count": len(embeddings),
        "created": created,
        "embeddings": embeddings
    }

    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)

    size = filepath.stat().st_size
    return {"filepath": str(filepath), "size": size}

In [None]:
def load_embeddings_json(filename: str) -> List[Dict[str, Any]]:
    """Load embeddings JSON and return the list at data["embeddings"].

    Args:
        filename: Name of the file to read inside STORAGE_DIR.

    Returns:
        The list stored under the ``"embeddings"`` key of the JSON document.

    Raises:
        FileNotFoundError: If the file does not exist.
        KeyError: If the document has no ``"embeddings"`` key.
    """
    filepath = STORAGE_DIR.joinpath(filename)
    with open(filepath, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data["embeddings"]