# Task 1: Exploratory Data Analysis and Data Preprocessing

This notebook performs EDA and preprocessing on the CFPB complaint dataset.


In [None]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Add src to path
# Resolve to an absolute path so the entry stays valid even if the working directory
# changes later, and only append when it is missing so that re-running this cell does
# not keep growing sys.path with duplicate entries.
src_dir = str(Path('../src').resolve())
if src_dir not in sys.path:
    sys.path.append(src_dir)
from data_processing import ComplaintDataProcessor

# Set style
# Apply the seaborn grid theme first, then set the default figure size for all plots.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)


## Load Data

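First, we'll load the CFPB complaint dataset. Make sure the file is placed at `data/raw/complaints.csv` under the project root; this notebook reads it through the relative path `../data/raw/complaints.csv`.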


In [None]:
# Location of the raw complaints file, relative to the notebook's working directory
data_path = '../data/raw/complaints.csv'

# Build the processor around that path, then ask it to load the dataset
processor = ComplaintDataProcessor(data_path)
df = processor.load_data()

# Report the frame's dimensions and column names (one call, same output as two prints)
print(
    f"\nDataset shape: {df.shape}",
    f"Columns: {df.columns.tolist()}",
    sep="\n",
)


## Exploratory Data Analysis

Let's perform comprehensive EDA to understand the data structure, distributions, and quality.


In [None]:
# Perform EDA
# Keep the returned statistics: the final summary cell reads 'total_records',
# 'mean_word_count' and 'median_word_count' from this object.
# NOTE(review): any plots or printed output come from ComplaintDataProcessor.perform_eda
# itself — confirm there what it displays.
eda_stats = processor.perform_eda()


## Filter Data

Filter the dataset to include only the target products and non-empty narratives.


In [None]:
# Filter data
# Run the processor's filtering step and keep the result; its row count is reported
# again in the final summary cell.
# NOTE(review): the actual criteria (target products, non-empty narratives) are defined
# in ComplaintDataProcessor.filter_data, not in this notebook — confirm there.
filtered_df = processor.filter_data()
print(f"\nFiltered dataset shape: {filtered_df.shape}")


## Preprocess Data

Clean the text narratives to improve embedding quality.


In [None]:
# Preprocess data
# Run the processor's text-cleaning step and keep the result; the final summary cell
# reports its row count and, if present, its 'Product' distribution.
# NOTE(review): no frame is passed in, so this presumably works on the filtered data held
# inside the processor — confirm the filter cell must run first.
preprocessed_df = processor.preprocess_data()
print(f"\nPreprocessed dataset shape: {preprocessed_df.shape}")


## Save Cleaned Data

Save the cleaned and filtered dataset for use in Task 2.


In [None]:
# Save cleaned data
# Create the output directory up front so the save does not depend on it already
# existing (harmless if save_cleaned_data creates the directory itself).
output_file = '../data/processed/filtered_complaints.csv'
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
processor.save_cleaned_data(output_file)
print("\n✅ Task 1 Complete!")


In [None]:
# Display final summary statistics
# Recaps the row counts at each stage using `eda_stats`, `filtered_df` and
# `preprocessed_df` produced by the cells above.
separator = "=" * 60
print("\n" + separator)
print("FINAL SUMMARY")
print(separator)
print(f"\nTotal records processed: {eda_stats['total_records']:,}")
print(f"Records after filtering: {len(filtered_df):,}")
print(f"Records after preprocessing: {len(preprocessed_df):,}")

# Word-count statistics are optional. Compare against None rather than relying on
# truthiness so a genuine value of 0.0 is still reported, and look each key up
# separately so a missing median cannot raise KeyError.
mean_words = eda_stats.get('mean_word_count')
median_words = eda_stats.get('median_word_count')
if mean_words is not None:
    print(f"Mean narrative length: {mean_words:.2f} words")
if median_words is not None:
    print(f"Median narrative length: {median_words:.2f} words")

# Per-product counts with their share of the preprocessed rows.
if 'Product' in preprocessed_df.columns:
    print("\nProduct distribution:")
    product_dist = preprocessed_df['Product'].value_counts()
    total_rows = len(preprocessed_df)  # hoisted: constant across the loop
    for product, count in product_dist.items():
        print(f"  - {product}: {count:,} ({count/total_rows*100:.1f}%)")
print("\n" + separator)


In [None]:
# Save cleaned data
# NOTE(review): this cell repeats the save already performed in the "Save Cleaned Data"
# section earlier in the notebook — consider keeping only one of the two.
# Create the output directory up front so the save does not depend on it already
# existing (harmless if save_cleaned_data creates the directory itself).
output_file = '../data/processed/filtered_complaints.csv'
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
processor.save_cleaned_data(output_file)
print("\n✅ Task 1 Complete!")


## Summary

**Key Findings** (copy the values from the `FINAL SUMMARY` output printed above):
- Total records processed: [Fill in from EDA]
- Records after filtering: [Fill in]
- Records after preprocessing: [Fill in]
- Mean narrative length: [Fill in] words
- Product distribution: [Fill in]

**Next Steps:**
- Proceed to Task 2: Text Chunking and Embedding
