In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pandas as pd

# Load the filtered complaints table
data = pd.read_csv('../data/filtered_complaints.csv')

# pandas reads empty CSV cells back as NaN (a float), and the splitter only
# accepts str. Drop missing narratives and coerce the rest so a single blank
# row cannot abort the whole split.
complaint_texts = data['clean_text'].dropna().astype(str).tolist()

# Split into chunks (size 300, overlap 50, measured by the splitter's
# default length function)
splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=50
)
chunks = splitter.create_documents(complaint_texts)  # Define chunks here
print(f"Created {len(chunks)} chunks")

Created 825338 chunks


In [5]:

%pip install -q sentence-transformers
from sentence_transformers import SentenceTransformer

# Initialize embedding model
embedder = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight but effective

# Embed chunks
chunk_texts = [chunk.page_content for chunk in chunks]
embeddings = embedder.encode(chunk_texts, show_progress_bar=True)

print(f"Embeddings shape: {embeddings.shape}")  # (num_chunks, 384)


Note: you may need to restart the kernel to use updated packages.


  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 25792/25792 [34:30<00:00, 12.46it/s]  


Embeddings shape: (825338, 384)


In [6]:
import chromadb
import numpy as np
from tqdm import tqdm
import shutil
import time
import os
import gc

# Verify variables exist
assert all(var in globals() for var in ['chunks', 'embeddings']), "Run Cells 1 & 2 first!"
# They must also come from the same run: ids, texts and vectors are paired by
# position during insertion, so a stale `embeddings` array of a different
# length would silently attach the wrong vectors to the wrong texts.
assert len(chunks) == len(embeddings), (
    f"chunks ({len(chunks)}) and embeddings ({len(embeddings)}) are out of sync; "
    "re-run Cells 1 & 2"
)

# --- 1. Enhanced Nuclear Reset Option ---
def reset_chroma_store(store_path="../vector_store"):
    """Wipe the on-disk vector store and leave an empty directory in its place.

    Steps, in order:
      1. If a global ``client`` exists, call ``client.reset()`` and remove the
         name from globals. A failing ``reset()`` is reported but does not
         stop the wipe; the stale reference is dropped either way so it
         cannot keep the store open.
      2. Run the garbage collector.
      3. On Windows only, try to kill ``chromadb-server.exe`` and wait 5 s.
      4. Remove ``store_path`` (up to 5 attempts, 3 s apart).
      5. Recreate ``store_path`` as an empty directory.

    Args:
        store_path: Directory that holds the vector store. Defaults to
            "../vector_store", the location used by the rest of this cell.

    Raises:
        Exception: if the directory still cannot be removed after 5 attempts
            (chained to the last removal error), or if any other step fails.
            The error is printed before being re-raised.
    """
    try:
        # Step 1: Close and delete all existing Chroma clients
        if 'client' in globals():
            try:
                client.reset()  # Reset the client
            except Exception as e:
                print(f"Error closing client: {e}")
            finally:
                # The directory is about to be deleted, so never keep a
                # client that still points at it.
                globals().pop('client', None)

        # Step 2: Force garbage collection to release memory
        gc.collect()

        # Step 3: Kill ChromaDB server processes. `taskkill` and the `nul`
        # device only exist on Windows; elsewhere the command would fail,
        # create a stray file called "nul" and waste the 5 s wait.
        if os.name == "nt":
            os.system("taskkill /f /im chromadb-server.exe 2> nul")
            time.sleep(5)  # Increased wait time for process termination

        # Step 4: Remove directory with retries
        if os.path.exists(store_path):
            last_error = None
            for attempt in range(5):
                try:
                    shutil.rmtree(store_path)
                    print(f"Successfully removed {store_path}")
                    break
                except PermissionError as pe:
                    last_error = pe
                    print(f"PermissionError on attempt {attempt + 1}: {pe}. Retrying...")
                    time.sleep(3)
                except Exception as e:
                    last_error = e
                    print(f"Error removing directory on attempt {attempt + 1}: {e}")
                    time.sleep(3)
            else:
                # Loop finished without `break`: every attempt failed.
                raise Exception(f"Failed to remove {store_path} after 5 attempts") from last_error

        # Step 5: Create fresh directory
        os.makedirs(store_path, exist_ok=True)
        print(f"Created fresh directory at {store_path}")

    except Exception as e:
        print(f"Reset error: {e}")
        raise

# --- 2. Force Fresh Start ---
# Always begin from an empty store; if the wipe fails, report it and abort
# the cell by re-raising.
try:
    reset_chroma_store()
except Exception as reset_error:
    print(f"Reset failed: {reset_error}")
    raise

# --- 3. Initialize with Clean Settings ---
def _open_chroma_client(path):
    """Open a persistent Chroma client rooted at ``path``.

    The first attempt and the retry both go through this helper so the two
    calls can never drift apart in their settings.
    """
    return chromadb.PersistentClient(
        path=path,
        settings=chromadb.Settings(
            allow_reset=True,  # reset_chroma_store() calls client.reset()
            is_persistent=True,
            persist_directory=path,
            anonymized_telemetry=False  # Explicitly disable telemetry to avoid settings mismatch
        )
    )


# Use absolute path to avoid relative path issues. Resolved outside the try
# block so the retry branch can always rely on `store_path` being defined.
store_path = os.path.abspath("../vector_store")
print(f"Initializing Chroma client with path: {store_path}")

try:
    client = _open_chroma_client(store_path)
except Exception as e:
    print(f"Client initialization failed: {e}")
    print("Attempting reset and retry...")
    # Wipe the store once more, then make a single second attempt; a second
    # failure propagates and stops the cell.
    reset_chroma_store()
    client = _open_chroma_client(store_path)

# --- 4. Create Collection ---
try:
    # Explicitly delete any existing collection so the insert starts empty
    try:
        client.delete_collection("complaints")
        print("Deleted existing 'complaints' collection")
    except Exception:
        # Best effort: presumably the collection just does not exist yet.
        # `Exception` (not a bare except) keeps Ctrl-C / SystemExit working.
        pass

    collection = client.get_or_create_collection(
        name="complaints",
        metadata={"hnsw:space": "cosine"}
    )
    print("Successfully created or retrieved 'complaints' collection")
except Exception as e:
    print(f"Collection creation failed: {e}")
    reset_chroma_store()
    # reset_chroma_store() removes the global `client` and wipes the store
    # directory, so a new client must be opened before it can be used again.
    store_path = os.path.abspath("../vector_store")
    client = chromadb.PersistentClient(
        path=store_path,
        settings=chromadb.Settings(
            allow_reset=True,
            is_persistent=True,
            persist_directory=store_path,
            anonymized_telemetry=False
        )
    )
    collection = client.create_collection(
        name="complaints",
        metadata={"hnsw:space": "cosine"}
    )

# --- 5. Batch Insert with Error Handling ---
batch_size = 1000  # Reduced for stability
successful_batches = 0
failed_batches = []  # indices of batches whose insert raised

with tqdm(total=len(chunks)) as pbar:
    for i in range(0, len(chunks), batch_size):
        batch_end = min(i + batch_size, len(chunks))
        try:
            # ids are the chunk positions as strings; texts and vectors are
            # taken from the same positions so the three lists stay aligned.
            batch_ids = [str(j) for j in range(i, batch_end)]
            batch_texts = [doc.page_content for doc in chunks[i:batch_end]]
            batch_embeddings = embeddings[i:i + batch_size].tolist()

            collection.add(
                ids=batch_ids,
                documents=batch_texts,
                embeddings=batch_embeddings
            )
            successful_batches += 1
            pbar.update(len(batch_ids))
        except Exception as e:
            # Best effort: remember the failure, pause, move on to the next
            # batch. The failed batch is NOT retried.
            failed_batches.append(i // batch_size)
            print(f"\nBatch {i//batch_size} failed: {str(e)[:200]}...")
            time.sleep(5)

# Only claim success when every batch made it into the collection.
if failed_batches:
    print(
        f"⚠️ Completed {successful_batches} batches, but {len(failed_batches)} failed "
        f"(first failed batch indices: {failed_batches[:20]}). Vector store is incomplete!"
    )
else:
    print(f"✅ Completed {successful_batches} batches. Vector store ready!")

Created fresh directory at ../vector_store
Initializing Chroma client with path: c:\Users\Simbo\Desktop\week6\week6\vector_store
Successfully created or retrieved 'complaints' collection


100%|██████████| 825338/825338 [27:01<00:00, 509.13it/s] 

✅ Completed 826 batches. Vector store ready!



