In [1]:
!pip install sentence-transformers chromadb langchain langchain-text-splitters




[notice] A new release of pip available: 22.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# STEP 2: Build Vector Database for Course Materials
import os
from sentence_transformers import SentenceTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter
import chromadb

# === Locations on disk ===
# TEXT_DIR: plain-text files produced by step 1 (input to this step).
# DB_DIR:   directory in which Chroma persists the vector store.
TEXT_DIR = "../processed_texts"
DB_DIR = "../vectorstore"
os.makedirs(DB_DIR, exist_ok=True)  # no error if the directory already exists

# === Embedding model ===
# The same model must be used at query time so that query vectors and stored
# vectors live in the same embedding space.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
print("Loading embedding model...")
embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)

# === Chunking ===
# Separators are listed from coarsest to finest: paragraph break, line break,
# sentence end, space, and finally the empty string.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 100
CHUNK_SEPARATORS = ["\n\n", "\n", ".", " ", ""]
text_splitter = RecursiveCharacterTextSplitter(
    separators=CHUNK_SEPARATORS,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# === Chroma persistent store ===
# get_or_create_collection reuses the collection if DB_DIR already contains it,
# so re-running this cell attaches to existing data instead of starting empty.
COLLECTION_NAME = "course_rag"
chroma_client = chromadb.PersistentClient(path=DB_DIR)
collection = chroma_client.get_or_create_collection(
    metadata={"hnsw:space": "cosine"},  # cosine similarity
    name=COLLECTION_NAME,
)

# ----------------------------
# Process each text file
# ----------------------------
# For every .txt file in TEXT_DIR: read it, split it into chunks, embed the
# chunks, and write them to the Chroma collection under the IDs
# "<filename>-<chunk index>". The cell is safe to re-run: each file's previous
# chunks are replaced rather than added a second time.

total_chunks = 0

# os.listdir() returns entries in arbitrary order; sorting keeps the processing
# order (and the printed log) the same on every run.
for filename in sorted(os.listdir(TEXT_DIR)):
    if not filename.endswith(".txt"):
        continue

    filepath = os.path.join(TEXT_DIR, filename)

    print(f"\nProcessing {filename} ...")

    with open(filepath, "r", encoding="utf-8") as f:
        raw_text = f.read()

    # Split into chunks
    chunks = text_splitter.split_text(raw_text)
    print(f" → {len(chunks)} chunks created")

    # Drop whatever was stored for this file by an earlier run. Without this,
    # a file that now produces fewer chunks would leave its old, higher-numbered
    # chunks behind in the persisted collection.
    collection.delete(where={"source": filename})

    # Nothing to embed or store for a file that produced no chunks; skip it
    # rather than hand an empty batch to the embedder and to Chroma.
    if not chunks:
        continue

    # Generate embeddings (converted to plain nested lists for Chroma)
    embeddings = embedder.encode(chunks).tolist()

    # Deterministic per-chunk IDs: the same file always maps to the same IDs
    ids = [f"{filename}-{i}" for i in range(len(chunks))]

    # One metadata dict per chunk (a fresh dict each time, not one shared object)
    metadatas = [{"source": filename} for _ in chunks]

    # upsert instead of add: an ID that already exists is overwritten, so
    # re-running the cell against the persisted collection cannot collide.
    collection.upsert(
        ids=ids,
        embeddings=embeddings,
        documents=chunks,
        metadatas=metadatas
    )

    total_chunks += len(chunks)

print("\n-------------------------------------------")
print("✅ Vector database built successfully!")
print(f"Total chunks stored: {total_chunks}")
print(f"Saved in: {DB_DIR}")
print("-------------------------------------------")


Loading embedding model...

Processing 1_DL_Setting_the_Scene.txt ...
 → 13 chunks created

Processing 2_DL_Adv_Deep_Learning.txt ...
 → 18 chunks created

Processing 3_DL_ComputerVision_Classification.txt ...
 → 37 chunks created

Processing 4_DL_CV_Object_Detection.txt ...
 → 48 chunks created

-------------------------------------------
✅ Vector database built successfully!
Total chunks stored: 116
Saved in: ../vectorstore
-------------------------------------------
