In [3]:
# build_index.py (Windows-safe)
import json
from pathlib import Path
from chromadb import PersistentClient
from chromadb.utils import embedding_functions

# Input corpus in JSON Lines form (one JSON object per line). The path is
# relative, so it is resolved against the working directory at run time.
DATA_PATH = Path("../data") / "facts.jsonl"
# Directory that holds the persistent Chroma store.
# NOTE(review): absolute, machine-specific path — the notebook will not run
# unchanged on another machine; consider deriving it from a shared base dir.
DB_DIR    = Path(r"C:\Users\yifan\OneDrive\桌面\yifanbot\data\db_chroma")
# Name of the Chroma collection that receives the facts.
COLL_NAME = "facts"

def load_jsonl(path: Path):
    """Yield one parsed JSON value per non-blank line of a JSON Lines file.

    The file is decoded as ``utf-8-sig`` so that a leading byte-order mark
    (which some Windows editors prepend) is dropped instead of being handed to
    ``json.loads``, which rejects it. Files without a BOM decode exactly as
    they would under plain UTF-8.

    Args:
        path: Location of the ``.jsonl`` file.

    Yields:
        The decoded JSON value of every line that is non-empty after
        surrounding whitespace has been stripped.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a line is not valid JSON. The message names
            the file and the 1-based line number of the offending line.
    """
    with path.open("r", encoding="utf-8-sig") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            # Parse outside the ``yield`` so that only decoding errors are
            # re-labelled, never exceptions thrown into the generator.
            try:
                record = json.loads(line)
            except json.JSONDecodeError as exc:
                raise json.JSONDecodeError(
                    f"{exc.msg} ({path}, line {lineno})", exc.doc, exc.pos
                ) from exc
            yield record

def main():
    """Build or refresh the Chroma collection from the JSONL fact file.

    Every record read from ``DATA_PATH`` contributes its ``"text"`` field as
    the document and all remaining fields as metadata. The record id is the
    ``"id"`` field, else the ``"uuid"`` field, else the record's position in
    the file. Records are upserted into the ``COLL_NAME`` collection stored
    under ``DB_DIR``; the collection is opened with a SentenceTransformer
    embedding function.

    The data file is read and validated before the embedding model is loaded
    or the store is touched, so malformed input fails fast.

    Raises:
        FileNotFoundError: If ``DATA_PATH`` does not exist.
        KeyError: If a record has no ``"text"`` field.
    """
    if not DATA_PATH.exists():
        raise FileNotFoundError(f"Missing {DATA_PATH.resolve()}")

    texts, ids, metadatas = [], [], []
    for i, row in enumerate(load_jsonl(DATA_PATH)):
        if "text" not in row:
            raise KeyError(f"record {i} in {DATA_PATH} has no 'text' field")

        # Only None / "" count as "no id": a plain truthiness test would also
        # discard a legitimate id of 0 and fall back to the position, which
        # can collide with another record's id.
        rid = next(
            (str(row[key]) for key in ("id", "uuid")
             if row.get(key) not in (None, "")),
            str(i),
        )
        meta = {k: v for k, v in row.items() if k != "text"}

        texts.append(row["text"])
        ids.append(rid)
        # Chroma's validation rejects an empty metadata dict; None is its way
        # of saying "no metadata for this record".
        metadatas.append(meta or None)

    if not ids:
        # Chroma's validation also rejects an upsert with no ids, so stop here
        # rather than surface that as an opaque error.
        print(f"No records found in {DATA_PATH.resolve()}; nothing to upsert.")
        return

    DB_DIR.mkdir(parents=True, exist_ok=True)

    ef = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    client = PersistentClient(path=str(DB_DIR))
    coll = client.get_or_create_collection(name=COLL_NAME, embedding_function=ef)

    coll.upsert(documents=texts, ids=ids, metadatas=metadatas)
    print(f"Upserted {len(ids)} docs into '{COLL_NAME}' → {DB_DIR.resolve()}")

# Run the build when this code is executed directly (script or notebook cell),
# but not when it is imported as a module.
if __name__ == "__main__":
    main()


Upserted 7 docs into 'facts' → C:\Users\yifan\OneDrive\桌面\yifanbot\data\db_chroma


In [4]:
# quick_test.py (Windows-safe)
from pathlib import Path
from chromadb import PersistentClient
from chromadb.utils import embedding_functions

# Directory of the persistent Chroma store, resolved against the working
# directory at run time.
# NOTE(review): the index-building cell writes to an absolute path ending in
# data\db_chroma; the two only refer to the same store when the working
# directory is that data folder — confirm.
DB_DIR    = Path("db_chroma")
# Name of the Chroma collection to query.
COLL_NAME = "facts"

def main(n_results: int = 3, preview_chars: int = 120):
    """Smoke-test the persisted collection with a few sample questions.

    Opens the Chroma store at ``DB_DIR``, prints the number of records in the
    ``COLL_NAME`` collection, then runs each sample query and prints the best
    matches with their metadata.

    Args:
        n_results: How many matches to request per query.
        preview_chars: Documents longer than this are cut to this many
            characters and suffixed with ``"..."`` when printed.

    Raises:
        FileNotFoundError: If ``DB_DIR`` is not an existing directory.
    """
    # Fail early with a clear message. NOTE(review): PersistentClient is
    # expected to create an empty store at a missing path, which would turn a
    # wrong DB_DIR into a confusing "collection does not exist" error.
    if not DB_DIR.is_dir():
        raise FileNotFoundError(
            f"Missing Chroma store at {DB_DIR.resolve()}; build the index first"
        )

    ef = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    client = PersistentClient(path=str(DB_DIR))
    coll = client.get_collection(name=COLL_NAME, embedding_function=ef)

    print("Count:", coll.count())

    queries = [
        "How many cats do i have",
        "Tell me about MicroTune.",
        "Which languages do I speak?"
    ]
    for q in queries:
        res = coll.query(query_texts=[q], n_results=n_results)
        print("\nQ:", q)
        # One query text was sent, so index 0 holds its result lists.
        hits = zip(res["documents"][0], res["metadatas"][0])
        for rank, (doc, meta) in enumerate(hits, 1):
            preview = (doc[:preview_chars] + "...") if len(doc) > preview_chars else doc
            print(f"  {rank}. {preview}")
            if meta:
                print("     meta:", meta)

# Run the smoke test when this code is executed directly (script or notebook
# cell), but not when it is imported as a module.
if __name__ == "__main__":
    main()


Count: 7

Q: How many cats do i have
  1. I have two cats and no dog
     meta: {'id': 'fact_005'}
  2. One of my cats is called Tuantuan, the other one is Jinbu
     meta: {'id': 'fact_004'}
  3. I speak Chinese natively, and I am fluent in English and French.
     meta: {'id': 'fact_003'}

Q: Tell me about MicroTune.
  1. I developed MicroTune, an RL-based dynamic RAM allocation system for MariaDB.
     meta: {'id': 'fact_002'}
  2. I am a CIFRE PhD student in Computer Science specializing in DBMS tuning and AI at University of Lille in France.
     meta: {'id': 'fact_001'}
  3. I speak Chinese natively, and I am fluent in English and French.
     meta: {'id': 'fact_003'}

Q: Which languages do I speak?
  1. I speak Chinese natively, and I am fluent in English and French.
     meta: {'id': 'fact_003'}
  2. I'm married, my wife is called Yuanyuan
     meta: {'id': 'fact_007'}
  3. I am a CIFRE PhD student in Computer Science specializing in DBMS tuning and AI at University of Lille in