In [6]:
# Install Required Packages
!pip install google-generativeai chromadb



In [7]:
# --- IMPORTS AND CONFIGURATION ---
import json
import os
import sys
import time
import glob
import chromadb
import google.generativeai as genai
from google.api_core.exceptions import GoogleAPICallError

# Directory containing all knowledge base JSON files
KNOWLEDGE_BASE_DIR = '../phase_1_knowledge_base/knowledge_base/'

# The directory where the vector database will be stored.
CHROMA_PATH = 'chroma_db'

# The Gemini model used to create the embeddings.
EMBEDDING_MODEL_NAME = 'models/embedding-001'

# The name of the collection within the ChromaDB database.
COLLECTION_NAME = 'pet_health_kb'

print("Configuration loaded.")

Configuration loaded.


In [8]:
# --- CONFIGURE GEMINI API ---
api_key = os.getenv("GOOGLE_API_KEY")
if not api_key:
    # A simple way to prompt for the key if it's not set as an environment variable
    # Note: In a shared or production environment, using environment variables is more secure.
    try:
        from google.colab import userdata
        api_key = userdata.get('GOOGLE_API_KEY')
        print("API Key loaded from Colab secrets.")
    except (ImportError, KeyError):
        api_key = input("Please enter your Google API Key: ")
        
genai.configure(api_key=api_key)

print(f"Gemini API configured using model: {EMBEDDING_MODEL_NAME}")

Gemini API configured using model: models/embedding-001


In [9]:
# --- HELPER FUNCTIONS ---

def load_and_combine_documents(directory):
    """Loads and combines all JSON documents from a directory."""
    all_docs = []
    json_files = glob.glob(os.path.join(directory, '*.json'))
    print(f"Found {len(json_files)} JSON files in {directory}.")
    for file_path in json_files:
        print(f"  Loading: {file_path}")
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                docs = json.load(f)
                if isinstance(docs, list):
                    all_docs.extend(docs)
                elif isinstance(docs, dict):
                    all_docs.append(docs)
        except Exception as e:
            print(f"    Error loading {file_path}: {e}")
    print(f"Total combined documents: {len(all_docs)}")
    return all_docs

def get_embedding(text, model_name):
    """Generates an embedding for a given text using the Gemini API."""
    try:
        return genai.embed_content(model=model_name, content=text)["embedding"]
    except Exception as e:
        print(f"Error embedding text: '{text[:50]}...'. Error: {e}")
        return None

print("Helper functions defined.")

Helper functions defined.


In [10]:
# --- MAIN SCRIPT LOGIC ---
print("--- Starting Phase 2: Building the Vector Database ---")

# 1. Load and combine all knowledge base documents
all_documents = load_and_combine_documents(KNOWLEDGE_BASE_DIR)
print(f"Loaded {len(all_documents)} document chunks from all JSON files.")

# 2. Initialize ChromaDB client and collection
client = chromadb.PersistentClient(path=CHROMA_PATH)
collection = client.get_or_create_collection(
    name=COLLECTION_NAME,
    metadata={"hnsw:space": "cosine"} # Using cosine distance is good for semantic search
)
print(f"ChromaDB collection '{COLLECTION_NAME}' is ready.")
print(f"Database will be saved in: {os.path.abspath(CHROMA_PATH)}")

# 3. Embed and store each document
print("\nProcessing and embedding documents...")
count = 0
for i, doc in enumerate(all_documents):
    chunk_text = doc.get('chunk')
    source_url = doc.get('source')
    doc_id = str(i)
    # Check if document already exists to avoid re-processing
    if len(collection.get(ids=[doc_id])['ids']) > 0:
        print(f"  Document ID {doc_id} already exists. Skipping.")
        continue
    if not chunk_text:
        print(f"  Skipping document {i} due to missing 'chunk' field.")
        continue
    embedding = get_embedding(chunk_text, EMBEDDING_MODEL_NAME)
    if embedding:
        collection.add(
            embeddings=[embedding],
            documents=[chunk_text],
            metadatas=[{"source": source_url}],
            ids=[doc_id]
        )
        count += 1
        print(f"  -> Added document {i+1}/{len(all_documents)} (ID: {doc_id})")
        time.sleep(0.1) # Small delay to be kind to the API
    else:
        print(f"  -> FAILED to process document {i+1}. Skipping.")

print("\n--- Phase 2 Complete ---")
print(f"Successfully added {count} new documents to the '{COLLECTION_NAME}' collection.")
print(f"Total documents in collection: {collection.count()}")

--- Starting Phase 2: Building the Vector Database ---
Found 3 JSON files in ../phase_1_knowledge_base/knowledge_base/.
  Loading: ../phase_1_knowledge_base/knowledge_base\final_pet_health_kb.json
  Loading: ../phase_1_knowledge_base/knowledge_base\pet_health_data.json
  Loading: ../phase_1_knowledge_base/knowledge_base\universal_pet_health_data.json
Total combined documents: 252
Loaded 252 document chunks from all JSON files.
ChromaDB collection 'pet_health_kb' is ready.
Database will be saved in: d:\mini project\pet_care_chatbot\phase_2_the_brain\chroma_db

Processing and embedding documents...
  Document ID 0 already exists. Skipping.
  Document ID 1 already exists. Skipping.
  Document ID 2 already exists. Skipping.
  Document ID 3 already exists. Skipping.
  Document ID 4 already exists. Skipping.
  Document ID 5 already exists. Skipping.
  Document ID 6 already exists. Skipping.
  Document ID 7 already exists. Skipping.
  Document ID 8 already exists. Skipping.
  Document ID 9 al

In [13]:
# Vector Database Build Complete

'''All knowledge base JSON files have been combined and embedded into the ChromaDB vector database.

**Next steps:**
- You can now use this vector database for semantic search or chatbot applications.
- Proceed to the next phase for retrieval or inference tasks.'''

'All knowledge base JSON files have been combined and embedded into the ChromaDB vector database.\n\n**Next steps:**\n- You can now use this vector database for semantic search or chatbot applications.\n- Proceed to the next phase for retrieval or inference tasks.'