In [11]:
import pandas as pd
import chromadb
from chromadb.utils import embedding_functions
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
from google.colab import drive
from pathlib import Path

drive.mount('/content/drive')
print("Google Drive mounted successfully!")

BASE_DIR = Path("/content/drive/MyDrive/Colab_Notebooks/")
# Configuration
DATA_PATH = BASE_DIR /"filtered_complaints.csv"
CHROMA_PATH = BASE_DIR / "vector_store"
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
SAMPLE_SIZE = 15000  # Target size for development [cite: 402]

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully!


In [14]:
# 1. Load Cleaned Data
print("Loading processed data...")
df = pd.read_csv(DATA_PATH)
print(f"Full Dataset Shape: {df.shape}")

# 2. Stratified Sampling
# We group by 'Mapped_Product' and sample proportionally
print(f"Creating a stratified sample of {SAMPLE_SIZE} complaints...")

# Calculate the fraction needed to get ~10k rows
frac = SAMPLE_SIZE / len(df)

# Sample
df_sample = df.groupby('canonical_product', group_keys=False).apply(lambda x: x.sample(frac=frac, random_state=42))

print(f"Sampled Dataset Shape: {df_sample.shape}")
print("\nSample Distribution:")
print(df_sample['canonical_product'].value_counts())

Loading processed data...
Full Dataset Shape: (1752720, 20)
Creating a stratified sample of 15000 complaints...
Sampled Dataset Shape: (15001, 20)

Sample Distribution:
canonical_product
Savings account              9069
Buy Now, Pay Later (BNPL)    3993
Credit card                  1687
Personal loan                 239
Money transfers                13
Name: count, dtype: int64


  df_sample = df.groupby('canonical_product', group_keys=False).apply(lambda x: x.sample(frac=frac, random_state=42))


In [16]:
# Initialize Splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=len,
    separators=["\n\n", "\n", ". ", " ", ""] # Try to split at natural pauses
)

chunks = []
metadatas = []
ids = []

print("Chunking narratives...")

for index, row in df_sample.iterrows():
    narrative = row['cleaned_narrative']

    # Skip if narrative is too short to matter
    if len(str(narrative)) < 50:
        continue

    # Split text
    splits = text_splitter.split_text(str(narrative))

    for i, split in enumerate(splits):
        # Store Chunk
        chunks.append(split)

        # Store Metadata (Crucial for filtering later!) [cite: 414]
        metadatas.append({
            "complaint_id": str(row.get('Complaint ID', f"id_{index}")), # Fallback if ID missing
            "product": row['canonical_product'],
            "original_index": index,
            "chunk_index": i
        })

        # Create a unique ID for Chroma
        ids.append(f"{index}_{i}")

print(f"Created {len(chunks)} chunks from {len(df_sample)} complaints.")
print(f"Average chunks per complaint: {len(chunks)/len(df_sample):.2f}")

Chunking narratives...
Created 39193 chunks from 15001 complaints.
Average chunks per complaint: 2.61


In [19]:
print("Initializing ChromaDB...")
# Create persistent client
client = chromadb.PersistentClient(path=CHROMA_PATH)

# Setup Embedding Function (using sentence-transformers) [cite: 409]
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBEDDING_MODEL_NAME
)

# Create (or get) the collection
collection = client.get_or_create_collection(
    name="complaint_vectors",
    embedding_function=embedding_func,
    metadata={"hnsw:space": "cosine"} # Use cosine similarity for text search
)

# Add data in batches (Chroma handles batching, but good to be explicit for large data)
BATCH_SIZE = 5000
total_batches = (len(chunks) // BATCH_SIZE) + 1

print(f"Indexing {len(chunks)} chunks into ChromaDB...")

for i in range(0, len(chunks), BATCH_SIZE):
    batch_end = min(i + BATCH_SIZE, len(chunks))

    print(f"Processing batch {i} to {batch_end}...")

    collection.add(
        documents=chunks[i:batch_end],
        metadatas=metadatas[i:batch_end],
        ids=ids[i:batch_end]
    )

print("✅ Success! Data indexed in ChromaDB.")
print(f"Collection count: {collection.count()}")

Initializing ChromaDB...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Indexing 39193 chunks into ChromaDB...
Processing batch 0 to 5000...
Processing batch 5000 to 10000...
Processing batch 10000 to 15000...
Processing batch 15000 to 20000...
Processing batch 20000 to 25000...
Processing batch 25000 to 30000...
Processing batch 30000 to 35000...
Processing batch 35000 to 39193...
✅ Success! Data indexed in ChromaDB.
Collection count: 39193
