In [None]:
# Known-good combo for Colab + ChromaDB + ST
%pip install -q --upgrade pip
%pip uninstall -y numpy pandas -q
%pip install -q \
  "numpy==1.26.4" \
  "pandas==2.2.2" \
  "chromadb==0.4.24" \
  "transformers==4.40.2" \
  "sentence-transformers==2.7.0" \
  "accelerate==0.30.1" \
  tqdm

# Hard restart to load new wheels
import os
print("🔁 Restarting runtime to load pinned NumPy/pandas ...")
os.kill(os.getpid(), 9)


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
opencv-contrib-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.
opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.[0m[31m
[0m

In [1]:
import os, gc
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import chromadb
from google.colab import userdata, drive
from huggingface_hub import login
from sentence_transformers import SentenceTransformer, models
from transformers import AutoModel, AutoTokenizer
from tqdm.auto import tqdm

print("✅ Imports OK")

# ==============================
# Configuration
# ==============================
class CONFIG:
    """Static settings for the ingestion run, read as class attributes."""

    # --- Locations on the mounted Drive ---
    CSV_PATH = "/content/drive/MyDrive/final_train_data.csv"
    CHROMA_DB_PATH = "/content/drive/MyDrive/chroma_db_custom_model"
    MODEL_CHECKPOINT_PATH = "/content/drive/MyDrive/content_good_run/drive/MyDrive/nv-embed-code-lora-checkpoints/best_model_adapters.pth"

    # --- Base model and target collection ---
    BASE_MODEL_ID = "nvidia/NV-EmbedCode-7b-v1"
    COLLECTION_NAME = "minecraft_mods_custom_v1"

    # --- LoRA hyper-parameters handed to LoRAConfig ---
    LORA_R = 16
    LORA_ALPHA = 32
    LORA_DROPOUT = 0.05

    # --- Number of documents embedded and upserted per step ---
    EMBEDDING_BATCH_SIZE = 8


print("✅ Config set")

# ==============================
# Auth & Drive
# ==============================
# Log in to Hugging Face: prefer the Colab secret, fall back to the
# interactive widget, and finally continue without auth (best effort).
try:
    HF_TOKEN = userdata.get('HF_TOKEN')
    login(token=HF_TOKEN)
    print("✅ Hugging Face login via secret")
except Exception as secret_err:
    # Say why the secret path failed instead of swallowing it silently.
    # Only the exception type is printed so no token material can leak.
    print(
        "⚠️ HF secret login failed "
        f"({type(secret_err).__name__}); falling back to interactive login"
    )
    try:
        from huggingface_hub import notebook_login
        notebook_login()
    except Exception as e:
        print("⚠️ HF login skipped:", e)

# Probe for the Drive contents rather than the bare mount point: the
# '/content/drive' directory can exist without Drive being mounted, which
# would wrongly report "already mounted" and make the CSV load fail later.
if not os.path.isdir('/content/drive/MyDrive'):
    drive.mount('/content/drive')
    print("✅ Drive mounted")
else:
    print("✅ Drive already mounted")

# ==============================
# Load CSV
# ==============================
print(f"📄 Loading: {CONFIG.CSV_PATH}")
df = pd.read_csv(CONFIG.CSV_PATH)

# Build one id per row: "<object_id>_<row index>", both rendered as strings.
df["unique_id"] = (df["object_id"].astype(str) + "_") + df.index.astype(str)

# "query" and "level" are training-only columns; drop them if present.
df_for_ingestion = df.drop(["query", "level"], axis="columns", errors="ignore")
print("✅ Loaded:", df_for_ingestion.shape)


✅ Imports OK
✅ Config set
✅ Hugging Face login via secret
✅ Drive already mounted
📄 Loading: /content/drive/MyDrive/final_train_data.csv
✅ Loaded: (39756, 8)


In [2]:
# ---- LoRA helper modules ----
class LoRALinear(nn.Module):
    """Linear layer that adds a scaled low-rank correction to a wrapped layer.

    The wrapped layer's ``weight`` and ``bias`` are reused (not copied). Two
    extra parameters are created: ``lora_A`` with shape ``(r, in_features)``
    and ``lora_B`` with shape ``(out_features, r)``. ``lora_B`` starts at
    zero, so a freshly built instance returns exactly what the wrapped layer
    would return.

    Args:
        linear_layer: layer providing ``in_features``, ``out_features``,
            ``weight`` and ``bias``.
        r: rank of the low-rank update.
        alpha: numerator of the scaling factor ``alpha / r``.
        dropout: dropout probability applied to the input of the LoRA path.
        dtype: dtype of the two LoRA parameters.
    """

    def __init__(self, linear_layer, r, alpha, dropout, dtype):
        super().__init__()
        self.in_features = linear_layer.in_features
        self.out_features = linear_layer.out_features

        # Reuse the base layer's parameters directly.
        self.weight = linear_layer.weight
        self.bias = linear_layer.bias

        self.r = r
        self.alpha = alpha
        self.dropout = nn.Dropout(dropout)

        # A is re-initialised below; B stays zero.
        self.lora_A = nn.Parameter(torch.zeros(r, self.in_features, dtype=dtype))
        self.lora_B = nn.Parameter(torch.zeros(self.out_features, r, dtype=dtype))
        nn.init.kaiming_uniform_(self.lora_A, a=np.sqrt(5))

    def forward(self, x):
        """Return ``linear(x) + (alpha / r) * B(A(dropout(x)))``."""
        linear = nn.functional.linear
        base_out = linear(x, self.weight, self.bias)
        projected_down = linear(self.dropout(x), self.lora_A)
        projected_up = linear(projected_down, self.lora_B)
        return base_out + projected_up * (self.alpha / self.r)

class LoRAConfig:
    """Plain holder for the LoRA settings read by ``apply_lora``.

    Stores ``r``, ``alpha`` and ``dropout`` as given; ``dtype`` is always
    ``torch.bfloat16``.
    """

    def __init__(self, r, alpha, dropout):
        self.r = r
        self.alpha = alpha
        self.dropout = dropout
        self.dtype = torch.bfloat16

def apply_lora(model: nn.Module, config: LoRAConfig):
    """Replace every ``nn.Linear`` in ``model`` with a ``LoRALinear`` wrapper.

    Modules whose qualified name contains ``"lm_head"`` are left untouched.
    The replacement happens in place and the same ``model`` object is returned.

    Args:
        model: module tree to patch.
        config: supplies ``r``, ``alpha``, ``dropout`` and ``dtype`` for each
            ``LoRALinear`` that is created.

    Returns:
        ``model`` itself, after patching.
    """
    # Snapshot the targets first so the tree is not modified while the
    # ``named_modules()`` generator is still walking it.
    targets = [
        (name, module)
        for name, module in model.named_modules()
        if isinstance(module, nn.Linear) and "lm_head" not in name
    ]

    for name, module in targets:
        if not name:
            # The root itself is an nn.Linear; it has no parent to re-attach a
            # wrapper to, so it cannot be swapped in place.
            continue
        # rpartition copes with names that have no dot (direct children of
        # the root), where ``rsplit('.', 1)`` would fail to unpack.
        parent_name, _, child_name = name.rpartition(".")
        parent_module = model.get_submodule(parent_name) if parent_name else model
        setattr(
            parent_module,
            child_name,
            LoRALinear(module, config.r, config.alpha, config.dropout, config.dtype),
        )
    return model

class HFBackboneWrapper(nn.Module):
    """Adapter exposing a model/tokenizer pair through ``tokenize`` and ``forward``.

    Args:
        hf_model: callable invoked with the tokenized features as keyword
            arguments; its result must expose ``last_hidden_state``.
        tokenizer: callable that turns a list of texts into model inputs.
        max_length: truncation length passed to the tokenizer; stored as
            ``max_seq_length``.
    """

    def __init__(self, hf_model, tokenizer, max_length=2048):
        super().__init__()
        self.hf = hf_model
        self.tokenizer = tokenizer
        self.max_seq_length = max_length

    def tokenize(self, texts):
        """Tokenize ``texts`` with padding and truncation, returning "pt" tensors."""
        tokenizer_kwargs = dict(
            padding=True,
            truncation=True,
            max_length=self.max_seq_length,
            return_tensors="pt",
        )
        return self.tokenizer(texts, **tokenizer_kwargs)

    def forward(self, features):
        """Run the model and store its last hidden state under ``"token_embeddings"``.

        ``features`` is updated in place and the same object is returned.
        """
        model_out = self.hf(**features, output_hidden_states=True)
        features["token_embeddings"] = model_out.last_hidden_state
        return features

def build_st_model(hf_model, tokenizer):
    """Assemble a ``SentenceTransformer`` from backbone, pooling and normalisation.

    The pooling layer is sized from ``hf_model.config.hidden_size`` and uses
    the default settings of ``models.Pooling``.
    """
    pipeline = [
        HFBackboneWrapper(hf_model, tokenizer),
        models.Pooling(hf_model.config.hidden_size),
        models.Normalize(),
    ]
    return SentenceTransformer(modules=pipeline)

def load_lora_weights(model, checkpoint_path):
    """Load adapter weights from ``checkpoint_path`` into ``model``.

    The checkpoint is read on CPU. If it is a mapping with a
    ``"lora_state_dict"`` entry, that entry is used; otherwise the whole
    checkpoint is treated as the state dict. Loading is non-strict, so keys
    the model has but the checkpoint lacks are allowed.

    Args:
        model: module receiving the weights (modified in place).
        checkpoint_path: path of a file written with ``torch.save``.

    Returns:
        ``model`` itself.

    Raises:
        FileNotFoundError: ``checkpoint_path`` does not exist.
        RuntimeError: no key in the checkpoint matches a key of ``model``,
            i.e. nothing would be loaded.
    """
    import warnings  # local import: keeps this helper self-contained

    if not os.path.exists(checkpoint_path):
        raise FileNotFoundError(f"Checkpoint not found at: {checkpoint_path}")

    # NOTE(security): torch.load unpickles the file; only use checkpoints from
    # a trusted source (consider weights_only=True where the format allows).
    ckpt = torch.load(checkpoint_path, map_location="cpu")
    lora_state = ckpt.get("lora_state_dict", ckpt)

    # strict=False hides key mismatches, so inspect the result explicitly:
    # a checkpoint that matches nothing would otherwise "load" silently.
    result = model.load_state_dict(lora_state, strict=False)
    unexpected = set(result.unexpected_keys)
    loaded = [key for key in lora_state if key not in unexpected]

    if not loaded:
        raise RuntimeError(
            f"No keys from checkpoint {checkpoint_path!r} matched the model "
            f"({len(unexpected)} checkpoint keys were ignored); "
            "the adapter weights were NOT loaded."
        )
    if unexpected:
        preview = ", ".join(sorted(unexpected)[:5])
        warnings.warn(
            f"{len(unexpected)} checkpoint key(s) did not match the model and "
            f"were ignored (e.g. {preview})."
        )
    return model

# ---- Load & patch base model ----
# Use the GPU when one is visible, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

tokenizer = AutoTokenizer.from_pretrained(CONFIG.BASE_MODEL_ID, trust_remote_code=True)
base_model = AutoModel.from_pretrained(
    CONFIG.BASE_MODEL_ID,
    use_cache=False,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

# Wrap the linear layers, then load the adapter weights from the checkpoint.
lora_cfg = LoRAConfig(
    r=CONFIG.LORA_R,
    alpha=CONFIG.LORA_ALPHA,
    dropout=CONFIG.LORA_DROPOUT,
)
ft_model = apply_lora(base_model, lora_cfg)
ft_model = load_lora_weights(ft_model, CONFIG.MODEL_CHECKPOINT_PATH)

# Move to the target device and switch to inference mode.
ft_model.to(device)
ft_model.eval()

embedding_model = build_st_model(ft_model, tokenizer)
print("✅ Custom embedding model ready")


Device: cuda




Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

✅ Custom embedding model ready


In [3]:
# Init Chroma persistent DB (fully disable telemetry) + ingest with fp32 embeddings
import os, gc, torch, chromadb, logging
from chromadb.config import Settings
from tqdm.auto import tqdm

# Disable Chroma telemetry (env var + settings + quiet logger).
# Chroma's opt-out environment variable is ANONYMIZED_TELEMETRY (it mirrors
# the `anonymized_telemetry` setting passed below); the previously used
# CHROMA_TELEMETRY_DISABLED name is not read by Chroma.
os.environ["ANONYMIZED_TELEMETRY"] = "False"
logging.getLogger("chromadb.telemetry").setLevel(logging.CRITICAL)

# Persistent on-disk client rooted at the configured Drive folder.
os.makedirs(CONFIG.CHROMA_DB_PATH, exist_ok=True)
client = chromadb.PersistentClient(
    path=CONFIG.CHROMA_DB_PATH,
    settings=Settings(anonymized_telemetry=False)
)

# Create/get collection (cosine)
collection = client.get_or_create_collection(
    name=CONFIG.COLLECTION_NAME,
    metadata={"hnsw:space": "cosine"},
)

# Prepare batches: every column except the text and the id becomes metadata.
metadata_cols = [
    col for col in df_for_ingestion.columns if col not in ("content", "unique_id")
]
metadatas = df_for_ingestion.loc[:, metadata_cols].to_dict(orient="records")
ids = df_for_ingestion["unique_id"].to_list()
documents = df_for_ingestion["content"].astype(str).to_list()

print("🔄 Embedding + upserting ...")
for start in tqdm(range(0, len(documents), CONFIG.EMBEDDING_BATCH_SIZE), desc="Batches"):
    window = slice(start, start + CONFIG.EMBEDDING_BATCH_SIZE)
    chunk_docs = documents[window]
    chunk_ids = ids[window]
    chunk_meta = metadatas[window]

    # Encode to a tensor, then cast to float32 before converting to NumPy.
    chunk_tensor = embedding_model.encode(
        chunk_docs,
        batch_size=CONFIG.EMBEDDING_BATCH_SIZE,
        convert_to_tensor=True,
        normalize_embeddings=True,
        show_progress_bar=False,
    )
    chunk_vectors = chunk_tensor.to(torch.float32).cpu().numpy()

    # The raw text is stored alongside the vectors so queries can return it.
    collection.upsert(
        ids=chunk_ids,
        embeddings=chunk_vectors.tolist(),
        metadatas=chunk_meta,
        documents=chunk_docs,
    )

    # Release per-batch objects before the next iteration.
    del chunk_tensor, chunk_vectors, chunk_docs, chunk_ids, chunk_meta
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

print(f"✅ Ingestion complete. Count: {collection.count()}")
print("DB path:", CONFIG.CHROMA_DB_PATH, "| Collection:", CONFIG.COLLECTION_NAME)


🔄 Embedding + upserting ...


Batches:   0%|          | 0/4970 [00:00<?, ?it/s]

✅ Ingestion complete. Count: 39756
DB path: /content/drive/MyDrive/chroma_db_custom_model | Collection: minecraft_mods_custom_v1
