In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
nltk.download('punkt')

from pathlib import Path


In [None]:
# Read the raw complaints export into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../data/raw_complaints.csv')

# Cell output: the (rows, columns) shape together with the column index.
(df.shape, df.columns)


In [None]:
# Only the final expression of a cell is rendered automatically, so the row
# preview is handed to display() explicitly; as a bare statement in the middle
# of the cell its result would be silently discarded.
# (display() is injected into the notebook namespace by IPython/Jupyter.)
display(df.head())

# info() prints its summary to stdout itself, so it needs no display() call.
df.info()

# Final expression: summary statistics across all columns, not just numeric ones.
df.describe(include='all')


In [None]:
# Bar chart of how many complaints fall under each product label.
fig, ax = plt.subplots(figsize=(10, 6))
product_counts = df['Product'].value_counts()
product_counts.plot(kind='bar', ax=ax)
ax.set_title('Distribution of Complaints by Product')
plt.xticks(rotation=45)  # slant the category labels so they do not overlap
fig.tight_layout()
plt.show()


In [None]:
# Word count per complaint narrative. Missing narratives are treated as empty
# strings, so they contribute a count of 0 rather than NaN.
# The vectorized .str accessors replace a per-row Python lambda.
df['NarrativeLength'] = (
    df['Consumer complaint narrative']
    .fillna("")
    .str.split()   # split on any run of whitespace
    .str.len()     # number of tokens per narrative
)

# Histogram (with KDE overlay) of the word counts computed above.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['NarrativeLength'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Narrative Word Counts')
ax.set_xlabel('Word Count')
plt.show()


In [None]:
# Count how many complaints do / do not carry a free-text narrative.
has_narrative = df['Consumer complaint narrative'].notna()
n_with = has_narrative.sum()
n_without = (~has_narrative).sum()
print(f"With narratives: {n_with}, Without narratives: {n_without}")


In [None]:
# Product labels that are kept for the downstream analysis.
target_products = [
    'Credit card',
    'Personal loan',
    'Buy Now, Pay Later',
    'Savings account',
    'Money transfer, virtual currency, or money service',
]

# isin() compares the label text exactly (case-sensitive, no partial matches).
# NOTE(review): confirm these labels against df['Product'].unique() — TODO verify.
in_scope = df['Product'].isin(target_products)

# .copy() gives an independent frame rather than a slice of df.
df_filtered = df.loc[in_scope].copy()


In [None]:
# Keep only complaints that actually include a narrative. The explicit .copy()
# makes the result an independent frame, so adding columns to df_filtered
# afterwards does not raise pandas' SettingWithCopyWarning.
df_filtered = df_filtered.dropna(subset=['Consumer complaint narrative']).copy()


In [None]:
# Patterns used by clean_text, compiled once so they are reused for every row.
_BOILERPLATE_INTRO = re.compile(r'\bi am writing.*?\.')
_SPECIAL_CHARS = re.compile(r'[^a-zA-Z0-9\s]')
_WHITESPACE_RUN = re.compile(r'\s+')


def clean_text(text):
    """Normalise a complaint narrative for downstream text processing.

    Steps, in order:
      1. lowercase the text;
      2. drop a boilerplate intro that starts with "i am writing" and runs up
         to the first following period;
      3. remove every character that is not a letter, digit or whitespace;
      4. collapse whitespace runs to a single space and trim both ends.

    Args:
        text: The raw narrative. Anything that is not a ``str`` (for example
            ``None`` or a NaN float from a missing cell) is treated as empty.

    Returns:
        The cleaned narrative as a ``str``; ``''`` for empty or non-string input.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # The boilerplate pattern is anchored on a literal period, so it has to run
    # BEFORE punctuation is stripped; afterwards no period is left to match.
    text = _BOILERPLATE_INTRO.sub('', text)
    text = _SPECIAL_CHARS.sub('', text)
    return _WHITESPACE_RUN.sub(' ', text).strip()

# Build the normalised narrative column by running clean_text over every row.
df_filtered['Cleaned_Narrative'] = (
    df_filtered['Consumer complaint narrative'].map(clean_text)
)


In [None]:
# Write the filtered frame to disk (row index omitted) and report the location.
output_path = Path('../data/filtered_complaints.csv')
df_filtered.to_csv(path_or_buf=output_path, index=False)
print(f"Filtered dataset saved to {output_path}")
