In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pandas as pd

# Load data
data = pd.read_csv('../data/filtered_complaints.csv')

# Split into chunks
splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=50
)
chunks = splitter.create_documents(data['clean_text'].tolist())  # Define chunks here
print(f"Created {len(chunks)} chunks")

Created 825338 chunks


In [9]:
from sentence_transformers import SentenceTransformer

# Initialize embedding model
embedder = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight but effective

# Embed chunks
chunk_texts = [chunk.page_content for chunk in chunks]
embeddings = embedder.encode(chunk_texts, show_progress_bar=True)

print(f"Embeddings shape: {embeddings.shape}")  # (num_chunks, 384)


Batches: 100%|██████████| 25792/25792 [35:01<00:00, 12.27it/s]  


Embeddings shape: (825338, 384)


In [10]:
import chromadb
import numpy as np
from tqdm import tqdm
import shutil
import time
import os
import gc

# Verify variables exist
assert all(var in globals() for var in ['chunks', 'embeddings']), "Run Cells 1 & 2 first!"

# --- 1. Enhanced Nuclear Reset Option ---
def reset_chroma_store():
    """Completely wipe and recreate the vector store with thorough cleanup"""
    try:
        # Step 1: Close and delete all existing Chroma clients
        if 'client' in globals():
            try:
                client.reset()  # Reset the client
                del globals()['client']  # Remove from globals
            except Exception as e:
                print(f"Error closing client: {e}")
        
        # Step 2: Force garbage collection to release memory
        gc.collect()

        # Step 3: Kill ChromaDB server processes (Windows-specific)
        os.system("taskkill /f /im chromadb-server.exe 2> nul")
        time.sleep(5)  # Increased wait time for process termination

        # Step 4: Remove directory with retries
        store_path = "../vector_store"
        if os.path.exists(store_path):
            for attempt in range(5):
                try:
                    shutil.rmtree(store_path)
                    print(f"Successfully removed {store_path}")
                    break
                except PermissionError as pe:
                    print(f"PermissionError on attempt {attempt + 1}: {pe}. Retrying...")
                    time.sleep(3)
                except Exception as e:
                    print(f"Error removing directory on attempt {attempt + 1}: {e}")
                    time.sleep(3)
            else:
                raise Exception(f"Failed to remove {store_path} after 5 attempts")

        # Step 5: Create fresh directory
        os.makedirs(store_path, exist_ok=True)
        print(f"Created fresh directory at {store_path}")

    except Exception as e:
        print(f"Reset error: {e}")
        raise

# --- 2. Force Fresh Start ---
try:
    reset_chroma_store()
except Exception as e:
    print(f"Reset failed: {e}")
    raise

# --- 3. Initialize with Clean Settings ---
try:
    # Use absolute path to avoid relative path issues
    store_path = os.path.abspath("../vector_store")
    print(f"Initializing Chroma client with path: {store_path}")
    
    client = chromadb.PersistentClient(
        path=store_path,
        settings=chromadb.Settings(
            allow_reset=True,
            is_persistent=True,
            persist_directory=store_path,
            anonymized_telemetry=False  # Explicitly disable telemetry to avoid settings mismatch
        )
    )
except Exception as e:
    print(f"Client initialization failed: {e}")
    print("Attempting reset and retry...")
    reset_chroma_store()
    client = chromadb.PersistentClient(
        path=store_path,
        settings=chromadb.Settings(
            allow_reset=True,
            is_persistent=True,
            persist_directory=store_path,
            anonymized_telemetry=False
        )
    )

# --- 4. Create Collection ---
try:
    # Explicitly delete any existing collection
    try:
        client.delete_collection("complaints")
        print("Deleted existing 'complaints' collection")
    except:
        pass

    collection = client.get_or_create_collection(
        name="complaints",
        metadata={"hnsw:space": "cosine"}
    )
    print("Successfully created or retrieved 'complaints' collection")
except Exception as e:
    print(f"Collection creation failed: {e}")
    reset_chroma_store()
    collection = client.create_collection(
        name="complaints",
        metadata={"hnsw:space": "cosine"}
    )

# --- 5. Batch Insert with Error Handling ---
batch_size = 1000  # Reduced for stability
successful_batches = 0

with tqdm(total=len(chunks)) as pbar:
    for i in range(0, len(chunks), batch_size):
        try:
            batch_ids = [str(j) for j in range(i, min(i + batch_size, len(chunks)))]
            batch_texts = [chunks[j].page_content for j in range(i, min(i + batch_size, len(chunks)))]
            batch_embeddings = embeddings[i:i + batch_size].tolist()
            
            collection.add(
                ids=batch_ids,
                documents=batch_texts,
                embeddings=batch_embeddings
            )
            successful_batches += 1
            pbar.update(len(batch_ids))
        except Exception as e:
            print(f"\nBatch {i//batch_size} failed: {str(e)[:200]}...")
            time.sleep(5)
            continue

print(f"✅ Completed {successful_batches} batches. Vector store ready!")

Created fresh directory at ../vector_store
Initializing Chroma client with path: c:\Users\Simbo\Desktop\week6\week6\vector_store
Successfully created or retrieved 'complaints' collection


100%|██████████| 825338/825338 [35:58<00:00, 382.33it/s]

✅ Completed 826 batches. Vector store ready!





In [18]:
import chromadb
import os
import psutil
from chromadb.utils import embedding_functions
import time

# --- 1. Nuclear Cleanup ---
def kill_chroma_processes():
    """Force kill all Chroma-related processes"""
    for proc in psutil.process_iter(['name']):
        if 'chroma' in proc.info['name'].lower():
            try:
                proc.kill()
            except:
                pass
    time.sleep(2)  # Wait for cleanup

kill_chroma_processes()

# --- 2. Initialize with EXACT Task 2 Settings ---
store_path = os.path.abspath("../vector_store")
client = chromadb.PersistentClient(
    path=store_path,
    settings=chromadb.Settings(
        allow_reset=True,
        anonymized_telemetry=False,
        is_persistent=True
    )
)

# --- 3. Load Collection (No Embedding Function) ---
try:
    collection = client.get_collection("complaints")
    print(f"✅ Loaded collection with {collection.count()} entries")
except Exception as e:
    print(f"❌ Collection error: {e}")
    print("Available collections:", [col.name for col in client.list_collections()])
    raise

# --- 4. Query Using Manual Embeddings ---
def query_complaints(search_text: str, n_results=5):
    try:
        # 1. Verify collection is properly loaded
        print(f"\nℹ️ Collection contains {collection.count()} entries")
        
        # 2. Initialize embedder (MUST match Task 2)
        embedder = embedding_functions.SentenceTransformerEmbeddingFunction(
            "all-MiniLM-L6-v2"
        )
        
        # 3. Generate query embedding
        query_embedding = embedder([search_text])
        print(f"ℹ️ Query embedding shape: {len(query_embedding[0])} dimensions")
        
        # 4. Basic query without filters first
        results = collection.query(
            query_embeddings=query_embedding,
            n_results=n_results,
            include=["documents", "distances"]
        )
        
        # 5. Verify results
        if not results['documents'][0]:
            print("⚠️ No results found. Trying fallback methods...")
            
            # Fallback 1: Try without embeddings
            results = collection.query(
                query_texts=[search_text],
                n_results=n_results
            )
            
            # Fallback 2: Show random samples if still empty
            if not results['documents'][0]:
                samples = collection.peek()['documents']
                print("\nℹ️ Sample documents in collection:")
                for i, doc in enumerate(samples[:3]):
                    print(f"{i+1}. {doc[:100]}...")
                return None
        
        # 6. Display results
        print(f"\n🔍 Top {n_results} matches for '{search_text}':")
        for idx, (doc, dist) in enumerate(zip(results["documents"][0], results["distances"][0])):
            print(f"\n#{idx+1} (Score: {1-dist:.3f}):")
            print(doc[:500] + ("..." if len(doc) > 500 else ""))
            
        return results
        
    except Exception as e:
        print(f"❌ Query failed: {str(e)}")
        return None

# # Test query
# query_complaints("late delivery")

# Test with different queries
query_complaints("late delivery complaint")
query_complaints("package never arrived")
query_complaints("refund not processed")

✅ Loaded collection with 825338 entries

ℹ️ Collection contains 825338 entries
ℹ️ Query embedding shape: 384 dimensions

🔍 Top 5 matches for 'late delivery complaint':

#1 (Score: 0.526):
late fee

#2 (Score: 0.526):
late fee

#3 (Score: 0.526):
late fee

#4 (Score: 0.526):
late fee

#5 (Score: 0.526):
late fee

ℹ️ Collection contains 825338 entries
ℹ️ Query embedding shape: 384 dimensions

🔍 Top 5 matches for 'package never arrived':

#1 (Score: 0.369):
problem with customer service

#2 (Score: 0.369):
problem with customer service

#3 (Score: 0.369):
problem with customer service

#4 (Score: 0.226):
other service problem

#5 (Score: 0.226):
other service problem

ℹ️ Collection contains 825338 entries
ℹ️ Query embedding shape: 384 dimensions

🔍 Top 5 matches for 'refund not processed':

#1 (Score: 0.469):
charged fees or interest you didnt expect

#2 (Score: 0.469):
charged fees or interest you didnt expect

#3 (Score: 0.469):
charged fees or interest you didnt expect

#4 (Score: 0.46

{'ids': [['715264', '715327', '741908', '742099', '742973']],
 'embeddings': None,
 'documents': [['charged fees or interest you didnt expect',
   'charged fees or interest you didnt expect',
   'charged fees or interest you didnt expect',
   'charged fees or interest you didnt expect',
   'charged fees or interest you didnt expect']],
 'uris': None,
 'included': ['documents', 'distances'],
 'data': None,
 'metadatas': None,
 'distances': [[0.5307751297950745,
   0.5307751297950745,
   0.5307751297950745,
   0.5307751297950745,
   0.5307751297950745]]}

In [1]:
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
import chromadb
from chromadb.utils import embedding_functions
from tqdm.auto import tqdm
import os
import shutil

# 1. Load data with validation
print("➡️ Loading data...")
data = pd.read_csv('../data/filtered_complaints.csv')
print(f"Raw CSV contains {len(data)} rows")

# Basic data validation
print("\n🔍 Data sample (first 3 rows):")
print(data[['clean_text']].head(3).to_string())

# 2. Extract texts with null checking
texts = data['clean_text'].dropna().astype(str).tolist()
print(f"\nAfter removing empty texts: {len(texts)}")

# 3. Minimal deduplication (temporarily disabled)
print("\n🚀 Testing without deduplication...")
sample_texts = texts[:1000]  # Start with smaller sample for debugging
print(f"First 3 sample texts:\n1. {sample_texts[0]}\n2. {sample_texts[1]}\n3. {sample_texts[2]}")

# 4. Chunking test
splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=50,
    separators=["\n\n", "\n", ". ", "! ", "? ", " ", ""]
)

test_chunks = splitter.create_documents(sample_texts)
print(f"\nGenerated {len(test_chunks)} chunks from first 1000 texts")
print("Sample chunk:", test_chunks[0].page_content[:100] + "...")

# 5. Full processing (only proceed if test looks good)
if len(test_chunks) > 50:  # Sanity check
    print("\n✨ Proceeding with full processing...")
    chunks = []
    for text in tqdm(texts, desc="Chunking"):  # Process all texts
        chunks.extend(splitter.create_documents([text]))
    
    print(f"\nTotal chunks generated: {len(chunks)}")
    
    # Vector store creation
    store_path = "../vector_store"
    if os.path.exists(store_path):
        shutil.rmtree(store_path)
    
    client = chromadb.PersistentClient(path=store_path)
    embedder = embedding_functions.SentenceTransformerEmbeddingFunction("all-MiniLM-L6-v2")
    
    collection = client.create_collection(
        name="complaints",
        embedding_function=embedder,
        metadata={"hnsw:space": "cosine"}
    )
    
    # Batch insert
    batch_size = 1000
    for i in tqdm(range(0, len(chunks), batch_size), desc="Inserting"):
        batch = chunks[i:i+batch_size]
        collection.add(
            documents=[chunk.page_content for chunk in batch],
            ids=[str(j) for j in range(i, i+len(batch))]
        )
    
    print(f"\n✅ Final collection count: {collection.count()}")
else:
    print("\n❌ Insufficient chunks generated. Please check:")
    print("1. Are texts being properly split?")
    print("2. Is the CSV format correct?")
    print("3. Are texts being truncated too aggressively?")

  from .autonotebook import tqdm as notebook_tqdm


➡️ Loading data...
Raw CSV contains 825338 rows

🔍 Data sample (first 3 rows):
                                                       clean_text
0  problem with a companys investigation into an existing problem
1                                             managing an account
2                                              closing an account

After removing empty texts: 825338

🚀 Testing without deduplication...
First 3 sample texts:
1. problem with a companys investigation into an existing problem
2. managing an account
3. closing an account

Generated 1000 chunks from first 1000 texts
Sample chunk: problem with a companys investigation into an existing problem...

✨ Proceeding with full processing...


Chunking: 100%|██████████| 825338/825338 [00:34<00:00, 24099.23it/s]



Total chunks generated: 825338


Inserting: 100%|██████████| 826/826 [1:03:36<00:00,  4.62s/it]



✅ Final collection count: 825338


In [2]:
print(f"Collection contains {collection.count()} documents")
print("Sample documents:")
print(collection.peek())  # Verify content looks correct

Collection contains 825338 documents
Sample documents:
{'ids': ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'], 'embeddings': array([[-0.08353022,  0.0883428 ,  0.02909802, ..., -0.01343659,
         0.06147716,  0.03614562],
       [-0.00793637, -0.0282068 , -0.05061107, ...,  0.02778311,
        -0.00288957, -0.05993639],
       [ 0.03076069, -0.00791154, -0.00195179, ...,  0.00922667,
        -0.01095176, -0.08378277],
       ...,
       [-0.06894812, -0.0487364 , -0.02612704, ..., -0.00656051,
        -0.0271311 , -0.00746608],
       [ 0.03076069, -0.00791154, -0.00195179, ...,  0.00922667,
        -0.01095176, -0.08378277],
       [-0.02496498,  0.01398851, -0.01496419, ...,  0.02838119,
         0.05266776, -0.04888583]], shape=(10, 384)), 'documents': ['problem with a companys investigation into an existing problem', 'managing an account', 'closing an account', 'problem with a companys investigation into an existing problem', 'problem with a companys investigation into an ex

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# 1. Use these optimized splitting parameters
splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,          # Reduced from 300
    chunk_overlap=30,        # Reduced from 50  
    separators=["\n\n", "\n", "(?<=\. )", "(?<=\! )", "(?<=\? )", " ", ""],  # Added regex lookbehinds
    keep_separator=True      # Keep the punctuation
)

# 2. Test with your sample
test_text = "This is a test complaint. It has multiple sentences. Each should become a separate chunk."
test_docs = splitter.create_documents([test_text])

print(f"Generated {len(test_docs)} chunks:")
for i, doc in enumerate(test_docs):
    print(f"\nChunk {i+1}:")
    print(doc.page_content)

Generated 1 chunks:

Chunk 1:
This is a test complaint. It has multiple sentences. Each should become a separate chunk.


  separators=["\n\n", "\n", "(?<=\. )", "(?<=\! )", "(?<=\? )", " ", ""],  # Added regex lookbehinds
  separators=["\n\n", "\n", "(?<=\. )", "(?<=\! )", "(?<=\? )", " ", ""],  # Added regex lookbehinds
  separators=["\n\n", "\n", "(?<=\. )", "(?<=\! )", "(?<=\? )", " ", ""],  # Added regex lookbehinds


In [12]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import re

# 1. First split into sentences using regex
def split_into_sentences(text):
    sentences = re.split(r'(?<=[.!?])\s+', text)  # Split after punctuation
    return [s.strip() for s in sentences if s.strip()]

# 2. Then use this as the first splitting stage
splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=30,
    separators=[
        "\n\n",  # Paragraphs first
        "\n",    # Then lines
        "(?<=[.!?]) +",  # Then sentences (notice the space after)
        " ",      # Then words
        ""        # Final fallback
    ],
    keep_separator=True
)

# 3. Test with your sample
test_text = "This is a test complaint. It has multiple sentences! Each should become a separate chunk?"
test_docs = splitter.create_documents([test_text])

print(f"Generated {len(test_docs)} chunks:")
for i, doc in enumerate(test_docs):
    print(f"\nChunk {i+1}:")
    print(doc.page_content)

Generated 1 chunks:

Chunk 1:
This is a test complaint. It has multiple sentences! Each should become a separate chunk?


In [13]:
print(split_into_sentences(test_text))

['This is a test complaint.', 'It has multiple sentences!', 'Each should become a separate chunk?']
