In [4]:
import re
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Load full dataset (adjust path if needed)
# Assumed columns: complaint_id, product, consumer_complaint_narrative, etc.
df = pd.read_csv('data/raw/complaints.csv')
print(f"Dataset shape: {df.shape}")
print(df.columns.tolist())
print(df.head())

# Later cells index df['product'] and df['consumer_complaint_narrative'] by
# name. Fail here, with the actual header list in the message, instead of
# letting a bare KeyError surface several cells further down.
required_columns = {'product', 'consumer_complaint_narrative'}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise KeyError(
        f"complaints.csv is missing expected columns {sorted(missing_columns)}; "
        f"found {df.columns.tolist()}"
    )


ModuleNotFoundError: No module named 'pandas'

In [None]:
# Products this analysis cares about. Matching below is exact, so adjust these
# to the values actually present in the 'product' column (the dataset may use
# longer labels such as 'Credit card or prepaid card').
relevant_products = ['Credit card', 'Personal loan', 'Savings account', 'Money transfer']

# Complaint count per relevant product, most frequent first.
is_relevant_product = df['product'].isin(relevant_products)
product_dist = df.loc[is_relevant_product, 'product'].value_counts()

# Bar chart of the counts, drawn on an explicit Axes and written to disk.
fig, ax = plt.subplots(figsize=(10, 6))
product_dist.plot(kind='bar', ax=ax)
ax.set_title('Complaint Distribution by Product')
ax.set_xlabel('Product')
ax.set_ylabel('Number of Complaints')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
fig.savefig('notebooks/product_dist.png')
plt.show()

print(product_dist)

In [None]:
# Keep only complaints whose narrative is present and not the empty string,
# then record each narrative's length in whitespace-separated words.
narrative_col = df['consumer_complaint_narrative']
df_with_narr = df.loc[narrative_col.notna() & (narrative_col != '')].copy()
df_with_narr['narrative_length'] = (
    df_with_narr['consumer_complaint_narrative'].str.split().str.len()
)

word_counts = df_with_narr['narrative_length']
median_words = word_counts.median()

# Histogram of word counts with the median marked as a dashed red line.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(word_counts, bins=50, edgecolor='black')
ax.set_title('Distribution of Narrative Lengths (Word Count)')
ax.set_xlabel('Word Count')
ax.set_ylabel('Frequency')
ax.axvline(median_words, color='r', linestyle='--', label='Median')
ax.legend()
fig.tight_layout()
fig.savefig('notebooks/narrative_lengths.png')
plt.show()

# Coverage summary: how many complaints carry a narrative at all.
total_complaints = len(df)
with_narrative = len(df_with_narr)
print(f"Median length: {median_words}")
print(f"Complaints with narratives: {with_narrative}")
print(f"Total complaints: {total_complaints}")
print(f"Without narratives: {total_complaints - with_narrative}")

# Identify very short (<10 words) or very long (>500 words)
short_narr = df_with_narr.loc[word_counts < 10]
long_narr = df_with_narr.loc[word_counts > 500]
print(f"Very short narratives: {len(short_narr)}")
print(f"Very long narratives: {len(long_narr)}")

In [None]:
# Build the working set: rows whose product is one of relevant_products and
# whose narrative is neither missing nor the empty string.
complaint_text = df['consumer_complaint_narrative']
product_mask = df['product'].isin(relevant_products)
text_mask = complaint_text.notna() & (complaint_text != '')

# .copy() so later column assignments do not write into a view of df.
filtered_df = df.loc[product_mask & text_mask].copy()

print(f"Filtered shape: {filtered_df.shape}")
print(filtered_df['product'].value_counts())

In [None]:
def clean_text(text):
    """Normalize a complaint narrative for downstream text processing.

    Steps, in order:
      1. Missing values (None / NaN / pd.NA) become the empty string.
      2. The value is converted to ``str`` and lowercased.
      3. Every character that is not an ASCII letter, digit, or whitespace is
         replaced by a space.
      4. Whitespace runs are collapsed to single spaces.
      5. A fixed list of boilerplate phrases is removed.
      6. Whitespace is collapsed and trimmed again, so removing a phrase does
         not leave a leading space or a double space behind.

    Args:
        text: A narrative string, or a missing value. Other non-null scalars
            are converted with ``str()`` before cleaning.

    Returns:
        The cleaned string; ``""`` for missing input.
    """
    if pd.isna(text):
        return ""
    # str() guards against stray non-string cells (e.g. a numeric value).
    text = str(text).lower()
    # Already lowercase, so a-z is sufficient for the letter class.
    text = re.sub(r'[^a-z0-9\s]', ' ', text)  # Remove special chars
    # Normalize before matching so the single-space patterns below still match
    # when the original had punctuation or repeated spaces inside the phrase.
    text = re.sub(r'\s+', ' ', text).strip()
    # Optional: Remove boilerplate (customize based on common phrases)
    boilerplate_patterns = [
        r'i am writing to file a complaint',
        r'this is a complaint about',
        r'please help me with'
    ]
    for pattern in boilerplate_patterns:
        text = re.sub(pattern, '', text)
    # Removing a phrase leaves its surrounding spaces behind; clean them up.
    return re.sub(r'\s+', ' ', text).strip()

# Store the normalized narrative alongside the raw text.
filtered_df['cleaned_narrative'] = filtered_df['consumer_complaint_narrative'].apply(clean_text)

# Save. DataFrame.to_csv does not create missing parent directories, so make
# sure data/processed exists before writing.
output_path = Path('data/processed/filtered_complaints.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
filtered_df.to_csv(output_path, index=False)
print("Saved filtered_complaints.csv")