In [None]:
# ===== Versions pinned to match your notebook (embed) =====
# Environment setup cell: upgrades pip, removes the preinstalled numpy/pandas,
# installs the pinned ML stack, then kills the kernel so the new wheels are
# the ones imported by the following cells.
#
# pip may report resolver conflicts for preinstalled packages that require
# numpy>=2 (seen with opencv-python* and thinc); those packages are not
# imported anywhere in this notebook.
# NOTE(review): bitsandbytes, tqdm and the web-server packages further down are
# NOT pinned, so a fresh runtime can resolve different versions of them —
# TODO pin once known-good versions are confirmed.
%pip install -q --upgrade pip
%pip uninstall -y -q numpy pandas
%pip install -q \
  "numpy==1.26.4" \
  "pandas==2.2.2" \
  "transformers==4.40.2" \
  "sentence-transformers==2.7.0" \
  "accelerate==0.30.1" \
  "chromadb==1.0.20" \
  bitsandbytes \
  tqdm

# Web server bits
%pip install -q fastapi uvicorn nest_asyncio pyngrok

# ---- Hard restart so pinned wheels are actually used (Colab-specific) ----
# Signal 9 terminates the current kernel process immediately; nothing placed
# after os.kill() in this cell will run, and all kernel state is lost.
import os
print("🔁 Restarting runtime to load pinned wheels ...")
os.kill(os.getpid(), 9)


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.
opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
opencv-contrib-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.[0m[31m
[0m

In [None]:
from google.colab import userdata

# Load the bearer token that clients must present to the embedding server.
# The value is a secret: it is deliberately never printed, because cell output
# is saved with the notebook and would leak the token to anyone who can read it.
try:
    AUTH_TOKEN = userdata.get('EMBED_SERVER_TOKEN')
except userdata.SecretNotFoundError:
    # Keep the name defined so later cells fail with their own clear error
    # message instead of a NameError.
    AUTH_TOKEN = None
    print("Secret not found. Did you add EMBED_SERVER_TOKEN in the sidebar?")
else:
    print("EMBED_SERVER_TOKEN loaded from userdata (value hidden).")

Token from userdata: <redacted>


In [None]:
# =========================
# Colab Retrieval/Embedding Server (uses DB_DIR)
# =========================
%pip install -q fastapi uvicorn nest_asyncio

import os, torch
from sentence_transformers import SentenceTransformer, models
from huggingface_hub import snapshot_download
from transformers import BitsAndBytesConfig  # 8-bit quant
from google.colab import drive
import chromadb
from chromadb.config import Settings

# ---- Config ----
# HF_REPO_ID : Hugging Face repo the embedding model is downloaded from.
# CACHE_DIR  : Drive folder where the model snapshot is stored and loaded from.
# DB_DIR     : Drive folder of the persisted Chroma store.
# COLLECTION : collection name; overridable through the CHROMA_COLLECTION env var.
# MAX_LEN    : value assigned to st_model.max_seq_length below.
HF_REPO_ID = "Nimiii/nv-embedcode-7b-mine-modder-st"
CACHE_DIR  = "/content/drive/MyDrive/models/nv-embedcode-7b-mine-modder-st"
DB_DIR     = "/content/drive/MyDrive/chroma_db_custom_model"  # <- points to your persisted store
COLLECTION = os.environ.get("CHROMA_COLLECTION", "minecraft_mods_custom_v1")
MAX_LEN    = 2048

# Reuse the token you loaded in Cell 2
# AUTH_TOKEN must already exist in the kernel namespace (it is assigned by the
# secrets cell); this check only catches an empty/None value.
if not AUTH_TOKEN:
    raise RuntimeError("EMBED_SERVER_TOKEN not found (Colab Secrets).")

# ---- Mount Drive & cache model ----
# Mounting is skipped when /content/drive already exists as a directory.
if not os.path.isdir("/content/drive"):
    drive.mount("/content/drive")

# Only the existence of CACHE_DIR is checked, not its contents: if the folder
# exists the download is skipped entirely.
if not os.path.isdir(CACHE_DIR):
    snapshot_download(repo_id=HF_REPO_ID, local_dir=CACHE_DIR, local_dir_use_symlinks=False)

# ---- Load ST model (8-bit backbone, pooled + normalized) ----
device  = "cuda" if torch.cuda.is_available() else "cpu"
bnb_cfg = BitsAndBytesConfig(load_in_8bit=True)

# The backbone is loaded from the local snapshot with remote code enabled,
# automatic device placement and the 8-bit quantization config above.
word_embedding_model = models.Transformer(
    CACHE_DIR,
    model_args={
        "trust_remote_code": True,
        "device_map": "auto",
        "quantization_config": bnb_cfg,
    },
)
# Pooling receives only the embedding dimension, so it runs with the library's
# default pooling mode.
# NOTE(review): confirm that default matches how the stored collection was embedded.
pooling_model   = models.Pooling(word_embedding_model.get_word_embedding_dimension())
normalize_model = models.Normalize()

# Pipeline order: token embeddings -> pooling -> normalization.
st_model = SentenceTransformer(
    modules=[word_embedding_model, pooling_model, normalize_model],
    device=device,
)
st_model.max_seq_length = MAX_LEN
print(f"✅ Embed model ready on: {device}")

# ---- Chroma client (points to Drive; chromadb is pinned to 1.0.20 in Cell 1) ----
# Telemetry is disabled both through the environment variable and the Settings object.
os.environ["ANONYMIZED_TELEMETRY"] = "FALSE"
client = chromadb.PersistentClient(
    path=DB_DIR,
    settings=Settings(anonymized_telemetry=False),
)

# Require an existing collection (avoid silently creating an empty one)
names = [c.name for c in client.list_collections()]
if COLLECTION not in names:
    raise RuntimeError(
        f"Chroma collection '{COLLECTION}' not found in DB_DIR={DB_DIR}. "
        f"Available: {names}. Re-ingest or set CHROMA_COLLECTION env var."
    )
collection = client.get_collection(COLLECTION)
print(f"✅ Using Chroma collection: {COLLECTION} @ {DB_DIR}")


Mounted at /content/drive


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

✅ Embed model ready on: cuda
✅ Using Chroma collection: minecraft_mods_custom_v1 @ /content/drive/MyDrive/chroma_db_custom_model


In [None]:
# ---- API schema & app ----
import time, nest_asyncio, uvicorn
from typing import List, Optional, Literal
from fastapi import FastAPI, Header, HTTPException
from pydantic import BaseModel, Field
from google.colab import userdata
from pyngrok import ngrok, conf
import torch

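# st_model, collection, COLLECTION and AUTH_TOKEN are created by the earlier cells of this notebook;
# the commented-out lines below only sketch what a standalone script would have to provide.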
# from sentence_transformers import SentenceTransformer
# import chromadb
# st_model = SentenceTransformer(...)
# client = chromadb.Client(...)
# collection = client.get_collection(...)
# AUTH_TOKEN = "your_auth_token"


class EmbedRequest(BaseModel):
    """Request body for POST /embed."""
    # Texts to embed; one vector is returned per entry.
    texts: list[str]
    # Forwarded to SentenceTransformer.encode as `normalize_embeddings`.
    normalize: bool = True
    # The /embed handler never reads this field; vectors are always returned as floats.
    precision: str = "float32"  # accepted but ignored by SentenceTransformer

class Health(BaseModel):
    """Response body of GET /health."""
    # Always True when the handler returns a body (failures surface as HTTP errors).
    ok: bool
    # Name of the Chroma collection the server is bound to.
    collection: str
    # Value of collection.count() at request time.
    count: int
    # "cuda" or "cpu", derived from torch.cuda.is_available().
    device: str
    # Sentence-embedding dimension reported by the model.
    dim: int
    # chromadb.__version__, or "unknown" when the attribute is missing.
    chroma_version: str

class RetrieveRequest(BaseModel):
    """Request body for POST /retrieve."""
    query: str = Field(..., description="Natural language query")
    # Number of nearest neighbours requested from Chroma (validated to 1..100).
    top_k: int = Field(8, ge=1, le=100)
    # Passed to Chroma's `include`. "uris" and "data" are accepted here, but the
    # Hit model has no field that carries them back to the client.
    include: List[Literal["documents","metadatas","distances","uris","data"]] = Field(
        default_factory=lambda: ["documents","metadatas","distances"]
    )
    # Forwarded to SentenceTransformer.encode as `normalize_embeddings` for the query.
    normalize: bool = True
    filters: Optional[dict] = None          # maps to Chroma `where`
    where_document: Optional[dict] = None   # maps to Chroma `where_document`
    # When True, every record sharing a `source_file` with a kept hit is appended.
    expand_pages_from_top_k: bool = False   # expand by source_file
    # Hits whose similarity is below this value (or has no similarity) are dropped.
    similarity_threshold: Optional[float] = None  # applies only to the top_k band

class Hit(BaseModel):
    """One record returned by /retrieve."""
    # Chroma record id.
    id: str
    # Distance reported by Chroma; None for expanded pages or when not requested.
    distance: Optional[float] = None
    # Derived from `distance` by _l2_to_sim01; None whenever `distance` is None.
    similarity: Optional[float] = None
    # Stored document text, when it was included in the query.
    document: Optional[str] = None
    # Stored metadata; expanded pages additionally carry `_expanded_from_source_file`.
    metadata: Optional[dict] = None

class RetrieveResponse(BaseModel):
    """Response body of POST /retrieve."""
    # Top-k hits that passed the threshold, followed by any expanded pages.
    hits: List[Hit]
    # Number of hits kept after thresholding and before page expansion.
    applied_top_k: int
    # Echo of the request's `include` list.
    include: List[str]
    # Timings in milliseconds plus hit counts before/after expansion.
    stats: dict

# Single FastAPI application; the route decorators in this cell register their handlers on it.
app = FastAPI()

def _check_auth(h: Optional[str]):
    # In a real app, replace with your actual auth token logic
    if not AUTH_TOKEN or h != f"Bearer {AUTH_TOKEN}":
        raise HTTPException(status_code=401, detail="Unauthorized")

# --- helper to convert L2 distance between unit vectors to similarity in [0,1] ---
def _l2_to_sim01(d: Optional[float]) -> Optional[float]:
    if d is None:
        return None
    d = float(d)
    cos = 1.0 - (d * d) / 2.0           # cosine in [-1, 1]
    cos = max(-1.0, min(1.0, cos))
    return (cos + 1.0) / 2.0            # similarity in [0, 1]

@app.get("/health", response_model=Health)
def health(authorization: str = Header(None)):
    """Report liveness plus basic facts about the collection and the model.

    Args:
        authorization: ``Authorization`` header; must be ``Bearer <token>``.

    Returns:
        Health: collection name and record count, compute device, embedding
        dimension and the installed chromadb version.

    Raises:
        HTTPException: 401 on bad credentials, 500 when Chroma cannot be queried.
    """
    _check_auth(authorization)
    try:
        cnt = collection.count()
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Chroma error: {e}")

    # get_sentence_embedding_dimension() may either raise or return None when
    # the dimension is not known; in both cases fall back to measuring a
    # probe embedding, otherwise int(None) below would fail.
    try:
        dim = st_model.get_sentence_embedding_dimension()
    except Exception:
        dim = None
    if dim is None:
        dim = len(st_model.encode(["x"], normalize_embeddings=True)[0])

    import chromadb as _c
    return Health(
        ok=True,
        collection=COLLECTION,
        count=cnt,
        device=("cuda" if torch.cuda.is_available() else "cpu"),
        dim=int(dim),
        chroma_version=getattr(_c, "__version__", "unknown"),
    )

@app.post("/embed")
def embed(req: EmbedRequest, authorization: str = Header(None)):
    """Embed a batch of texts and return the vectors as nested float lists.

    Args:
        req: Texts to embed and whether to normalize the embeddings.
        authorization: ``Authorization`` header; must be ``Bearer <token>``.

    Returns:
        dict: ``{"vectors": [...], "dim": <vector length, or 0 when empty>}``.
    """
    _check_auth(authorization)
    with torch.no_grad():
        embeddings = st_model.encode(
            req.texts,
            convert_to_tensor=True,
            normalize_embeddings=req.normalize,
            show_progress_bar=False,
        )
        # Cast to float and move to host memory before converting to Python lists.
        vectors = embeddings.float().cpu().tolist()
    width = len(vectors[0]) if vectors else 0
    return {"vectors": vectors, "dim": width}

def _first_query_batch(res: dict, key: str, n: int) -> list:
    """Return the per-record list for the first query under ``key``.

    Chroma keeps non-included fields in the result as keys whose value is
    None, so a plain ``key in res`` test is not enough. Missing or None
    fields are replaced by ``[None] * n`` so callers can index them safely.
    """
    batches = res.get(key)
    if not batches:
        return [None] * n
    first = batches[0]
    return first if first is not None else [None] * n


@app.post("/retrieve", response_model=RetrieveResponse)
def retrieve(req: RetrieveRequest, authorization: str = Header(None)):
    """Embed the query, fetch nearest neighbours from Chroma, optionally expand.

    Steps:
      1. Encode ``req.query`` with the sentence-transformer model.
      2. Query the collection for ``req.top_k`` neighbours using the request's
         ``include``, ``filters`` (``where``) and ``where_document``.
      3. Keep hits whose similarity is >= ``similarity_threshold``. When a
         threshold is set, hits without a similarity are dropped.
      4. If ``expand_pages_from_top_k`` is set, append every record sharing a
         ``source_file`` metadata value with a kept hit. Expanded records have
         no distance or similarity. Expansion is best-effort: a failure is
         logged and the top-k hits are still returned.

    NOTE(review): similarity is derived with ``_l2_to_sim01``, which assumes
    plain L2 distances between unit vectors. Confirm the collection's distance
    metric, and that ``normalize`` is left True, for the score to be meaningful.

    Raises:
        HTTPException: 401 on bad credentials, 500 when the Chroma query fails.
    """
    _check_auth(authorization)

    t0 = time.perf_counter()
    with torch.no_grad():
        q_emb = st_model.encode(
            [req.query],
            convert_to_tensor=True,
            normalize_embeddings=req.normalize,
            show_progress_bar=False,
        ).float().cpu().tolist()
    t_embed = (time.perf_counter() - t0) * 1000

    t1 = time.perf_counter()
    try:
        res = collection.query(
            query_embeddings=q_emb,
            n_results=req.top_k,
            include=req.include,
            where=req.filters,
            where_document=req.where_document
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Chroma query failed: {e}")
    t_query = (time.perf_counter() - t1) * 1000

    # A single query was sent, so only the first batch of each field matters.
    ids       = (res.get("ids") or [[]])[0]
    distances = _first_query_batch(res, "distances", len(ids))
    documents = _first_query_batch(res, "documents", len(ids))
    metadatas = _first_query_batch(res, "metadatas", len(ids))

    # 1. Build initial top-k hits and apply similarity threshold
    top_hits: List[Hit] = []
    for i, _id in enumerate(ids):
        raw_dist = distances[i] if i < len(distances) else None
        dist = float(raw_dist) if raw_dist is not None else None
        sim = _l2_to_sim01(dist)

        if req.similarity_threshold is None or (sim is not None and sim >= req.similarity_threshold):
            top_hits.append(Hit(
                id=_id,
                distance=dist,
                similarity=sim,
                document=documents[i] if i < len(documents) else None,
                metadata=metadatas[i] if i < len(metadatas) else None,
            ))

    # 2. If expansion is requested, get all pages from the same source_file
    all_hits = list(top_hits)
    if req.expand_pages_from_top_k and top_hits:
        # dict.fromkeys de-duplicates while keeping first-seen order.
        source_files = list(dict.fromkeys(
            hit.metadata.get("source_file") for hit in top_hits if hit.metadata and hit.metadata.get("source_file")
        ))

        if source_files:
            expansion_where = {"source_file": {"$in": source_files}}
            if req.filters:
                # Expanded pages must still satisfy the caller's own filter.
                expansion_where = {"$and": [req.filters, expansion_where]}

            try:
                page_res = collection.get(
                    where=expansion_where,
                    include=["documents", "metadatas"]
                )

                page_ids = page_res.get("ids") or []
                page_docs = page_res.get("documents") or []
                page_metas = page_res.get("metadatas") or []

                # 3. Add expanded docs, avoiding duplicates
                existing_ids = {h.id for h in all_hits}
                for i, page_id in enumerate(page_ids):
                    if page_id not in existing_ids:
                        meta = page_metas[i] if i < len(page_metas) else {}
                        if meta:
                            # Copy before annotating so the dict returned by Chroma is not mutated.
                            meta = dict(meta)
                            meta["_expanded_from_source_file"] = meta.get("source_file")

                        all_hits.append(Hit(
                            id=page_id,
                            distance=None,
                            similarity=None, # Expanded pages have no similarity score
                            document=page_docs[i] if i < len(page_docs) else None,
                            metadata=meta,
                        ))
            except Exception as e:
                # Fail gracefully: log the error but return the top hits found so far
                print(f"Warning: Page expansion query failed: {e}")


    return RetrieveResponse(
        hits=all_hits,
        applied_top_k=len(top_hits), # This count is pre-expansion
        include=req.include,
        stats={
            "embed_ms": round(t_embed, 2),
            "query_ms": round(t_query, 2),
            "total_ms": round((time.perf_counter() - t0) * 1000, 2),
            "normalized": req.normalize,
            "top_k_before_expansion": len(top_hits),
            "total_hits_after_expansion": len(all_hits),
        },
    )

# ---- Expose via ngrok (if running in Colab) ----
# nest_asyncio lets uvicorn start its event loop inside the notebook's
# already-running loop.
nest_asyncio.apply()

# userdata.get raises SecretNotFoundError for a missing secret, so translate
# that into the same clear RuntimeError used for an empty value.
try:
    NGROK_TOKEN = userdata.get("NGROK_TOKEN")
except userdata.SecretNotFoundError:
    NGROK_TOKEN = None
if not NGROK_TOKEN:
    raise RuntimeError("NGROK_TOKEN not found in Colab Secrets.")
conf.get_default().region = "eu"
ngrok.set_auth_token(NGROK_TOKEN)

public_url = ngrok.connect(8000).public_url
print("🌍 Public URL (ngrok):", public_url)
print("🔐 Send header: Authorization: Bearer <EMBED_SERVER_TOKEN>")

# uvicorn.run blocks until the server stops. Always close the tunnel afterwards
# so re-running this cell does not leave a stale public endpoint behind.
try:
    uvicorn.run(app, host="0.0.0.0", port=8000)
finally:
    ngrok.disconnect(public_url)



INFO:     Started server process [2849]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


🌍 Public URL (ngrok): https://4bb0e08afa86.ngrok-free.app
🔐 Send header: Authorization: Bearer <EMBED_SERVER_TOKEN>
INFO:     85.250.132.139:0 - "POST /retrieve HTTP/1.1" 200 OK
