In [1]:
# %% [markdown]
# # Task 2: Text Chunking, Embedding, and Vector Store Indexing
# 
# ## 🎯 Objective
# Convert cleaned text narratives into a format suitable for efficient semantic search.
# 
# Since the challenge provides pre-built embeddings, we'll:
# 1. Analyze the pre-built embeddings structure
# 2. Create a small sample for learning purposes
# 3. Document our chunking and embedding strategy
# 

# %%
# Import libraries
import pandas as pd
import numpy as np
import os
import json
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Set style
# Order matters here: matplotlib is first put back on its built-in 'default'
# style, and only then does seaborn apply its "whitegrid" axes style on top.
plt.style.use('default')
sns.set_style("whitegrid")

# Visible confirmation that the import/configuration cell ran to completion.
print("‚úÖ Libraries imported")


‚úÖ Libraries imported


In [2]:
# %% [markdown]
# ## Step 1: Check Available Files

# %%
print("üìÅ Checking available data files...")
data_files = []
for root, dirs, files in os.walk("../data"):
    for file in files:
        if file.endswith(('.csv', '.parquet')):
            full_path = os.path.join(root, file)
            size_mb = os.path.getsize(full_path) / (1024**2)
            data_files.append((file, size_mb, full_path))

print(f"Found {len(data_files)} data files:")
for file, size_mb, path in sorted(data_files):
    print(f"  ‚Ä¢ {file}: {size_mb:.1f} MB")

üìÅ Checking available data files...
Found 4 data files:
  ‚Ä¢ complaint_embeddings.parquet: 2289.7 MB
  ‚Ä¢ complaints.csv: 5762.3 MB
  ‚Ä¢ filtered_complaints.csv: 0.1 MB
  ‚Ä¢ filtered_complaints_sample.csv: 0.1 MB


In [3]:
# ## Step 2: Load Pre-built Embeddings Metadata

# %%
print("\nüì• Loading pre-built embeddings metadata...")

embeddings_path = "../data/raw/complaint_embeddings.parquet"

if os.path.exists(embeddings_path):
    try:
        # Read just metadata columns (fast, no vectors)
        metadata_cols = [
            'complaint_id', 'product_category', 'product', 'issue',
            'sub_issue', 'company', 'state', 'date_received',
            'chunk_index', 'total_chunks'
        ]
        
        # Use pyarrow for fast parquet reading
        df_embeddings = pd.read_parquet(
            embeddings_path,
            columns=metadata_cols,
            engine='pyarrow'
        )
        
        print(f"‚úÖ Metadata loaded successfully!")
        print(f"üìä Total chunks in embeddings: {len(df_embeddings):,}")
        
        # Show sample
        print("\nüëÄ First 3 rows of metadata:")
        display(df_embeddings.head(3))
        
    except Exception as e:
        print(f"‚ùå Error: {e}")
        print("Creating sample data for demonstration...")
        df_embeddings = None
else:
    print("‚ùå Embeddings file not found at:", embeddings_path)
    df_embeddings = None


üì• Loading pre-built embeddings metadata...
‚ùå Error: No match for FieldRef.Name(complaint_id) in id: string
document: string
embedding: list<element: double>
metadata: struct<chunk_index: int64, company: string, complaint_id: string, date_received: string, issue: string, product: string, product_category: string, state: string, sub_issue: string, total_chunks: int64>
__fragment_index: int32
__batch_index: int32
__last_in_fragment: bool
__filename: string
Creating sample data for demonstration...


In [4]:
# ## Step 3: Analyze Embeddings Structure

# %%
print("\n" + "="*60)
print("ANALYZING EMBEDDINGS STRUCTURE")
print("="*60)


def chunks_per_complaint(chunks_df):
    """Return one ``total_chunks`` value per complaint, indexed by ``complaint_id``.

    The table has one row per chunk, so a complaint split into k chunks shows
    up k times. Collapsing to one row per complaint first keeps long
    complaints from being over-weighted in averages and distributions.
    """
    return chunks_df.groupby('complaint_id')['total_chunks'].first()


def find_category_candidates(category_labels, product):
    """Suggest category labels sharing a keyword with ``product``.

    Used only when a case-insensitive substring search for the full product
    name found nothing; words of one or two characters are ignored as noise.
    Returns a sorted list of labels (possibly empty).
    """
    keywords = [word for word in product.lower().split() if len(word) > 2]
    return sorted(
        label for label in category_labels
        if any(word in label.lower() for word in keywords)
    )


if df_embeddings is not None:
    available_cols = set(df_embeddings.columns)

    # Product distribution
    if 'product_category' in available_cols:
        print("üìä Product Category Distribution:")
        product_counts = df_embeddings['product_category'].value_counts()
        top_products = product_counts.head(10)

        # Nothing to draw (and max() would fail) when every category is null.
        if not top_products.empty:
            plt.figure(figsize=(12, 6))
            plt.barh(range(len(top_products)), top_products.values)
            plt.yticks(range(len(top_products)), top_products.index)
            plt.xlabel('Number of Chunks')
            plt.title('Top 10 Product Categories (by chunk count)')
            plt.grid(True, alpha=0.3, axis='x')

            # Add labels just right of each bar; the offset is loop-invariant.
            label_offset = top_products.max() * 0.01
            for position, value in enumerate(top_products.values):
                plt.text(value + label_offset,
                         position,
                         f'{value:,}',
                         va='center',
                         fontsize=9)

            plt.tight_layout()
            plt.show()

        # Print counts
        print("\nüìà Product counts:")
        for product, count in top_products.items():
            percentage = count / len(df_embeddings) * 100
            print(f"  ‚Ä¢ {product}: {count:,} chunks ({percentage:.1f}%)")

    # Chunk statistics (complaint_id is required too: it is read below)
    if {'complaint_id', 'chunk_index', 'total_chunks'} <= available_cols:
        per_complaint = chunks_per_complaint(df_embeddings)

        print(f"\nüìù Chunking Statistics:")
        print(f"  ‚Ä¢ Total complaints: {df_embeddings['complaint_id'].nunique():,}")
        print(f"  ‚Ä¢ Total chunks: {len(df_embeddings):,}")
        # Averaged over complaints, not over chunk rows.
        print(f"  ‚Ä¢ Average chunks per complaint: {per_complaint.mean():.2f}")
        print(f"  ‚Ä¢ Max chunks per complaint: {per_complaint.max()}")
        print(f"  ‚Ä¢ Min chunks per complaint: {per_complaint.min()}")

        # Distribution of chunks per complaint: the 20 most common chunk
        # counts, shown in ascending chunk-count order for readability.
        chunk_dist = per_complaint.value_counts().head(20).sort_index()
        if not chunk_dist.empty:
            plt.figure(figsize=(10, 5))
            chunk_dist.plot(kind='bar')
            plt.title('Distribution of Chunks per Complaint')
            plt.xlabel('Number of Chunks')
            plt.ylabel('Frequency')
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.show()

    # Check target products
    target_products = ['Credit Card', 'Personal Loan', 'Savings Account', 'Money transfers']
    print(f"\nüéØ Checking target products:")
    if 'product_category' in available_cols:
        categories = df_embeddings['product_category']
        known_labels = categories.dropna().astype(str).unique()
        for product in target_products:
            # Case-insensitive literal search (product names are not regexes)
            mask = categories.str.contains(product, case=False, na=False, regex=False)
            count = mask.sum()
            if count > 0:
                print(f"  ‚Ä¢ {product}: {count:,} chunks found")
            else:
                print(f"  ‚Ä¢ {product}: Not found (checking variations...)")
                # Exact-case variants cannot match once the case-insensitive
                # search failed, so look for labels sharing a keyword instead.
                for label in find_category_candidates(known_labels, product):
                    print(f"    Possible match: '{label}'")
else:
    print("‚ö†Ô∏è No embeddings data to analyze")



ANALYZING EMBEDDINGS STRUCTURE
‚ö†Ô∏è No embeddings data to analyze
