In [None]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-1.1.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.9 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.37.0-py3-none-any.whl.metadata (2.4 kB)
Collecting pypika>=0.48.9 (from chromadb)
  Downloading PyPika-0.48.9.tar.gz (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?

In [None]:
# ingest.py
import os
import json
import chromadb
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
def load_json_knowledge_base(filepath="knowledge_base.json"):
    """
    Load pre-chunked documents with metadata from a single JSON file.

    The file must contain a JSON list of objects, where each object has:
    - "chunk_id": A unique string identifier.
    - "chunk_text": The text content of the chunk.
    - "metadata": A dictionary of metadata.

    Args:
        filepath: Path to the JSON knowledge-base file.

    Returns:
        The parsed list of chunk objects. An empty list is returned (after
        printing an explanatory message) when the file is missing, cannot be
        read, is not valid JSON, or its top-level value is not a list.
    """
    print(f"Loading knowledge from JSON file: '{filepath}'...")
    if not os.path.exists(filepath):
        print(f"Error: The file '{filepath}' was not found. Please create it.")
        return []

    try:
        # 'utf-8-sig' decodes plain UTF-8 (needed for Farsi characters) and also
        # strips a leading byte-order mark, which json.load would otherwise reject.
        with open(filepath, 'r', encoding='utf-8-sig') as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from {filepath}. Please check the file format. Details: {e}")
        return []
    except Exception as e:
        # Best-effort loader: report any other read failure instead of raising.
        print(f"An unexpected error occurred while reading {filepath}: {e}")
        return []

    # Callers iterate over the result and index each item by key, so anything
    # other than a list (e.g. a single top-level object) is rejected here with
    # a clear message rather than failing later with an opaque error.
    if not isinstance(data, list):
        print(
            f"Error: Expected a JSON list of chunks in {filepath}, "
            f"but found a top-level {type(data).__name__}."
        )
        return []

    print(f"Successfully loaded {len(data)} knowledge chunks.")
    return data

In [None]:
def run(
    filepath="knowledge_base.json",
    db_path="./chroma_db",
    collection_name="farsi_rag_collection",
    model_name="paraphrase-multilingual-mpnet-base-v2",
):
    """
    Main function to ingest documents from JSON and store them in ChromaDB.

    The pipeline loads the pre-chunked documents, embeds every chunk with a
    SentenceTransformer model, recreates the target collection from scratch,
    stores the chunks with their metadata, and finally runs one test query as
    a sanity check. Progress and verification details are printed throughout.

    Args:
        filepath: Path of the JSON knowledge-base file to ingest.
        db_path: Directory used by the persistent ChromaDB client.
        collection_name: Name of the collection to (re)create. Any existing
            collection with this name is deleted first.
        model_name: SentenceTransformer model used for both the document
            embeddings and the test-query embedding.

    Returns:
        None. Returns early, without touching the database, when no documents
        could be loaded.
    """
    print("--- Starting Knowledge Base Ingestion from JSON ---")

    # 1. Load documents from JSON. This data is already chunked.
    documents_data = load_json_knowledge_base(filepath)
    if not documents_data:
        print("No documents to ingest. Exiting.")
        return

    # Extract the necessary components for ChromaDB
    texts_to_embed = [doc["chunk_text"] for doc in documents_data]
    document_ids = [doc["chunk_id"] for doc in documents_data]
    metadatas = [doc["metadata"] for doc in documents_data]

    # --- VERIFICATION STEP 1: Inspect the loaded data ---
    # documents_data is known to be non-empty here, so index 0 always exists.
    print("\n--- Verification: Inspecting first loaded chunk ---")
    print(f"Chunk ID: {document_ids[0]}")
    print(f"Chunk Text: \"{texts_to_embed[0][:150]}...\"")
    print(f"Chunk Metadata: {metadatas[0]}")
    print("------------------------------------------------\n")

    # 2. Initialize the embedding model
    print("Initializing embedding model (this may take a moment)...")
    embedding_model = SentenceTransformer(model_name)
    print("Embedding model initialized.")

    # 3. Setup ChromaDB client
    client = chromadb.PersistentClient(path=db_path)

    # 4. Create or get the collection
    print(f"Setting up ChromaDB collection: '{collection_name}'...")
    # Delete the collection if it already exists to ensure a fresh start.
    # Depending on the installed chromadb release, list_collections() yields
    # either Collection objects or plain name strings; getattr covers both.
    existing_names = {getattr(c, "name", c) for c in client.list_collections()}
    if collection_name in existing_names:
        client.delete_collection(name=collection_name)
        print(f"Deleted existing collection '{collection_name}' for a fresh start.")
    # NOTE(review): the collection is created with ChromaDB's default settings
    # (including its default distance metric) -- confirm that suits this model.
    collection = client.get_or_create_collection(name=collection_name)
    print("Collection setup complete.")

    # 5. Generate embeddings for all document chunks
    print("Generating embeddings for all document chunks...")
    embeddings = embedding_model.encode(texts_to_embed, show_progress_bar=True)

    # --- VERIFICATION STEP 2: Inspect the embeddings ---
    print("\n--- Verification: Inspecting embeddings ---")
    print(f"Shape of embeddings array: {embeddings.shape}")
    print(f"Sample of first embedding vector: {embeddings[0][:5]}...")
    print("------------------------------------------\n")

    # 6. Add to ChromaDB collection together with the per-chunk metadata
    print("Adding documents with their metadata to ChromaDB...")
    collection.add(
        embeddings=embeddings.tolist(),
        documents=texts_to_embed,
        metadatas=metadatas,
        ids=document_ids
    )

    # --- VERIFICATION STEP 3: Test a query ---
    print("\n--- Verification: Performing a test query ---")
    test_query = "خدمات شرکت چیست؟"
    print(f"Querying with text: \"{test_query}\"")

    # The query is embedded with the same model as the documents so both live
    # in the same vector space.
    test_query_embedding = embedding_model.encode(test_query).tolist()

    results = collection.query(
        query_embeddings=[test_query_embedding],
        n_results=1
    )

    # Results are nested per query; [0][0] is the top hit of the single query.
    retrieved_doc = results['documents'][0][0]
    retrieved_meta = results['metadatas'][0][0]
    print(f"Retrieved document: \"{retrieved_doc[:150]}...\"")
    print(f"Retrieved metadata: {retrieved_meta}")
    print("--------------------------------------------\n")

    print("\n--- Ingestion Complete ---")
    print(f"Successfully added {len(documents_data)} documents to the '{collection_name}' collection.")
    print("You can now run 'app.py' to start the chatbot.")

In [None]:
# Execute the full ingestion pipeline. run() deletes any existing collection with
# the same name before adding the chunks, so re-running this cell rebuilds it.
run()

--- Starting Knowledge Base Ingestion from JSON ---
Loading knowledge from JSON file: 'knowledge_base.json'...
Successfully loaded 57 knowledge chunks.

--- Verification: Inspecting first loaded chunk ---
Chunk ID: contact_us_001
Chunk Text: "راه‌های تماس با ما به شرح زیر است:
- تلفن همراه: ۰۹۹-۸۱۸۱-۸۹۷۸
- تلفن ثابت: ۰۱۱-۴۲۴۳-۳۴۴۳
- فکس: ۰۱۱-۴۲۴۳-۳۴۴۴
- ایمیل: info@tabenergy.ir
- آدرس: مازن..."
Chunk Metadata: {'source_title': 'تماس با ما', 'source_url': 'tabenergy.ir/contact/'}
------------------------------------------------

Initializing embedding model (this may take a moment)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding model initialized.
Setting up ChromaDB collection: 'farsi_rag_collection'...
Collection setup complete.
Generating embeddings for all document chunks...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]


--- Verification: Inspecting embeddings ---
Shape of embeddings array: (57, 768)
Sample of first embedding vector: [-0.05331298 -0.04908168 -0.01405383 -0.02830934  0.12919362]...
------------------------------------------

Adding documents with their metadata to ChromaDB...

--- Verification: Performing a test query ---
Querying with text: "خدمات شرکت چیست؟"
Retrieved document: "خدمات شرکت تابان انرژی در حوزه سیستم‌های هیبرید شامل موارد زیر است:
- مشاوره و همراهی انرژی در محل
- طراحی بر اساس استانداردهای معتبر
- تامین تجهیزات ..."
Retrieved metadata: {'source_title': 'اجرای سیستم های خورشیدی ترکیبی و هیبرید', 'parent_category': 'خدمات', 'source_url': 'tabenergy.ir/خدمات/774/'}
--------------------------------------------


--- Ingestion Complete ---
Successfully added 57 documents to the 'farsi_rag_collection' collection.
You can now run 'app.py' to start the chatbot.
