# Med-Diagnosis: Embedding Ingestion

**Input:** pre-chunked documents in LangChain format (JSON)  
```json
[
  {"page_content": "...", "metadata": {"source": "file.pdf", ...}},
  ...
]
```

**Outputs:**
- `corpus.json` → copy to `./backend/data/corpus.json`
- `chroma_export.zip` → extract as `./chroma_data/` in project root

On the next app startup, `corpus_loader.py` reads `corpus.json` for SQL records  
and skips Chroma embedding (vectors already present).

In [None]:
!pip install -q chromadb>=0.6.3 langchain-chroma>=0.0.5 langchain-huggingface langchain-openai sentence-transformers

In [None]:
# ── CONFIG ────────────────────────────────────────────────────────────────

# Input: pre-chunked documents in LangChain JSON format.
INPUT_JSON = "/content/chunks.json"

# Output locations (copied into the project after the run).
CHROMA_PATH = "/content/chroma_data"  # becomes ./chroma_data/ in the project
CORPUS_JSON = "/content/corpus.json"  # becomes ./backend/data/corpus.json

# Chroma collection name; keep in sync with CHROMA_COLLECTION_NAME in .env
# (default: "documents").
COLLECTION_NAME = "documents"

# ── Embeddings ──
# EMBEDDINGS_MODE selects the backend:
#   "huggingface" - local inference (must use the SAME model as TEI)
#   "openai"      - OpenAI-compatible API
EMBEDDINGS_MODE = "huggingface"
EMBEDDINGS_MODEL = "google/embeddinggemma-300m"  # must match TEI MODEL_ID
HF_TOKEN = "your-hf-token-here"

# OpenAI settings, read only when EMBEDDINGS_MODE == "openai".
OPENAI_API_KEY = ""
OPENAI_API_BASE = "https://api.openai.com/v1"
OPENAI_EMB_MODEL = "text-embedding-ada-002"

In [None]:
# ── Imports ───────────────────────────────────────────────────────────────
import hashlib, json, os, shutil
import chromadb
from langchain_chroma import Chroma
from langchain_core.documents import Document as LangchainDocument

# Make sure the Chroma output directory exists; exist_ok=True keeps a re-run of
# this cell from failing when the directory is already there.
os.makedirs(CHROMA_PATH, exist_ok=True)

In [None]:
# ── Load pre-chunked data (supports .json and .jsonl) ────────────────────
# ".jsonl" input is parsed as one JSON object per non-blank line; any other
# extension is parsed as a single JSON document (expected to be an array).
ext = os.path.splitext(INPUT_JSON)[1].lower()
with open(INPUT_JSON, encoding="utf-8") as f:
    if ext == ".jsonl":
        raw: list[dict] = [json.loads(line) for line in f if line.strip()]
    else:
        raw = json.load(f)

# Fail early with a readable message instead of an opaque IndexError/KeyError
# from the sample print below.
if not isinstance(raw, list):
    raise ValueError(
        f"{INPUT_JSON} must contain a JSON array of chunks, got {type(raw).__name__}"
    )
if not raw:
    raise ValueError(f"No chunks found in {INPUT_JSON}")

print(f"Loaded {len(raw)} chunks from {INPUT_JSON}")
print("Sample:", raw[0]["page_content"][:120], "...")

In [None]:
# ── Assign chunk IDs (must match corpus_loader.py formula) ───────────────
# chunk_id = SHA256(f"{source_file_name}:{page_content}")
for entry in raw:
    text = entry["page_content"]
    # Chunks without metadata or without a "source" key fall back to "corpus".
    meta = entry.get("metadata", {})
    origin = meta.get("source", "corpus")
    digest = hashlib.sha256(f"{origin}:{text}".encode())
    # Stored under a private key on the chunk dict itself.
    entry["_id"] = digest.hexdigest()

print("Chunk IDs assigned")

In [None]:
# ── Embeddings model ──────────────────────────────────────────────────────
# Builds `embeddings` for the backend named by EMBEDDINGS_MODE. Backend
# packages are imported inside the branch that uses them.
if EMBEDDINGS_MODE not in ("huggingface", "openai"):
    raise ValueError(f"Unknown EMBEDDINGS_MODE: {EMBEDDINGS_MODE}")

if EMBEDDINGS_MODE == "openai":
    from langchain_openai import OpenAIEmbeddings

    embeddings = OpenAIEmbeddings(
        api_key=OPENAI_API_KEY,
        base_url=OPENAI_API_BASE,
        model=OPENAI_EMB_MODEL,
    )
else:
    import torch
    from langchain_huggingface import HuggingFaceEmbeddings

    # The token is exported to the environment and also passed to the model.
    os.environ["HF_TOKEN"] = HF_TOKEN
    # Use the GPU when torch reports one, otherwise fall back to CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Device: {device}")
    hf_model_kwargs = {"device": device, "token": HF_TOKEN}
    hf_encode_kwargs = {"normalize_embeddings": True}
    embeddings = HuggingFaceEmbeddings(
        model_name=EMBEDDINGS_MODEL,
        model_kwargs=hf_model_kwargs,
        encode_kwargs=hf_encode_kwargs,
    )

# Embed a throwaway query to report the vector size.
probe_vector = embeddings.embed_query("test")
print(f"Embedding dim: {len(probe_vector)}")

In [None]:
# ── Chroma setup ──────────────────────────────────────────────────────────
# Open the on-disk Chroma store at CHROMA_PATH and bind the target collection
# to the embedding model configured for this run.
chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
vector_store  = Chroma(
    client=chroma_client,
    collection_name=COLLECTION_NAME,
    embedding_function=embeddings,
)
# Only the IDs are needed to detect already-embedded chunks. include=[] keeps
# get() from also loading every stored document and metadata record, which is
# what it returns by default.
existing_ids = set(vector_store._collection.get(include=[])["ids"])
print(f"Collection '{COLLECTION_NAME}': {len(existing_ids)} existing vectors")

In [None]:
# ── Embed & store (skips already-present chunks) ──────────────────────────
# Chunk IDs are content hashes, so identical chunks from the same source share
# one ID. Chroma rejects a batch that contains the same ID twice, so each new
# ID is queued only once; repeats within the input are counted and skipped.
queued_ids: set[str] = set()
to_embed: list[LangchainDocument] = []
already_present = 0
for c in raw:
    chunk_id = c["_id"]
    if chunk_id in existing_ids:
        already_present += 1
        continue
    if chunk_id in queued_ids:
        continue
    queued_ids.add(chunk_id)
    to_embed.append(
        LangchainDocument(
            page_content=c["page_content"],
            metadata=c.get("metadata", {}),
            id=chunk_id,
        )
    )

duplicates_skipped = len(raw) - already_present - len(to_embed)
print(f"{len(to_embed)} new chunks to embed  ({already_present} already present)")
if duplicates_skipped:
    print(f"  {duplicates_skipped} duplicate chunks in input skipped")

# Insert in fixed-size batches and report progress after each one.
BATCH = 100
for i in range(0, len(to_embed), BATCH):
    batch = to_embed[i:i+BATCH]
    vector_store.add_documents(batch, ids=[d.id for d in batch])
    print(f"  {min(i+BATCH, len(to_embed))}/{len(to_embed)}")

print(f"Done. Collection total: {vector_store._collection.count()} vectors")

In [None]:
# ── Save corpus.json (strip internal _id key) ─────────────────────────────
# Only "page_content" and "metadata" are written, so the helper "_id" key
# never reaches the output file.
corpus = []
for chunk in raw:
    record = {
        "page_content": chunk["page_content"],
        "metadata": chunk.get("metadata", {}),
    }
    corpus.append(record)

with open(CORPUS_JSON, "w", encoding="utf-8") as out_file:
    # ensure_ascii=False writes non-ASCII text as-is instead of \u escapes.
    json.dump(corpus, out_file, ensure_ascii=False, indent=2)
print(f"Saved {len(corpus)} chunks → {CORPUS_JSON}")

In [None]:
# ── Package & download ────────────────────────────────────────────────────
# The archive layout is derived from CHROMA_PATH so that changing the config
# still zips the directory that was actually written. The zip holds a single
# top-level folder named after the last component of CHROMA_PATH, and the
# archive is placed next to that folder.
chroma_dir = os.path.abspath(CHROMA_PATH)
export_root, chroma_dir_name = os.path.split(chroma_dir)
archive_path = shutil.make_archive(
    os.path.join(export_root, "chroma_export"), "zip", export_root, chroma_dir_name
)
print(f"Zipped → {archive_path}")

# Only the import is guarded: outside Colab the module is missing and the
# output paths are printed instead of triggering a browser download.
try:
    from google.colab import files
except ImportError:
    print(f"Files ready:\n  {CORPUS_JSON}\n  {archive_path}")
else:
    files.download(CORPUS_JSON)
    files.download(archive_path)

## Deploy
```bash
cp corpus.json ./backend/data/corpus.json
unzip -o chroma_export.zip -d .
docker compose -f docker-compose.dev.cpu.yml up -d --build
```