In [1]:
# Cell 1: Importing Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# For text processing
import re
from wordcloud import STOPWORDS

# Set some plot style settings for visualization
sns.set(style="whitegrid")


In [2]:
# Cell 2: Load the Dataset

# Load the raw complaints dataset
file_path = '../data/raw/complaints.csv'  # Update this path if needed

complaints_df = pd.read_csv(file_path)

# Display the first few rows of the dataset
complaints_df.head()


  complaints_df = pd.read_csv(file_path)


Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,2025-06-20,Credit reporting or other personal consumer re...,Credit reporting,Incorrect information on your report,Information belongs to someone else,,,Experian Information Solutions Inc.,FL,32092,,,Web,2025-06-20,In progress,Yes,,14195687
1,2025-06-20,Debt collection,Telecommunications debt,Attempts to collect debt not owed,Debt is not yours,,Company can't verify or dispute the facts in t...,"Eastern Account Systems of Connecticut, Inc.",FL,342XX,,,Web,2025-06-20,Closed with explanation,Yes,,14195688
2,2025-06-20,Credit reporting or other personal consumer re...,Credit reporting,Improper use of your report,Reporting company used your report improperly,,,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",AZ,85225,,,Web,2025-06-20,In progress,Yes,,14195689
3,2025-06-20,Credit reporting or other personal consumer re...,Credit reporting,Improper use of your report,Reporting company used your report improperly,,,Experian Information Solutions Inc.,AZ,85225,,,Web,2025-06-20,In progress,Yes,,14195690
4,2025-06-20,Credit reporting or other personal consumer re...,Credit reporting,Incorrect information on your report,Account status incorrect,,,Experian Information Solutions Inc.,IL,60628,,,Web,2025-06-20,In progress,Yes,,14195692


In [3]:
# Cell 3: Exploratory Data Analysis (EDA)

# 1. Check for missing values
missing_values = complaints_df.isnull().sum()

# 2. Distribution of complaints by product
product_distribution = complaints_df['Product'].value_counts()

# 3. Length of Consumer complaint narrative
complaints_df['narrative_length'] = complaints_df['Consumer complaint narrative'].apply(lambda x: len(str(x).split()))

# Summary of findings
missing_values, product_distribution, complaints_df[['narrative_length']].describe()


(Date received                         0
 Product                               0
 Sub-product                      235295
 Issue                                 6
 Sub-issue                        839522
 Consumer complaint narrative    6629041
 Company public response         4770207
 Company                               0
 State                             54516
 ZIP code                          30228
 Tags                            8981029
 Consumer consent provided?      1649561
 Submitted via                         0
 Date sent to company                  0
 Company response to consumer         20
 Timely response?                      0
 Consumer disputed?              8841498
 Complaint ID                          0
 dtype: int64,
 Product
 Credit reporting or other personal consumer reports                             4834855
 Credit reporting, credit repair services, or other personal consumer reports    2163857
 Debt collection                                            

In [4]:
# Cell 4: Data Cleaning and Preprocessing

# 1. Filter the dataset to include only the 5 specified products
required_products = [
    "Credit card", "Personal loan", "Buy Now, Pay Later (BNPL)", "Savings account", "Money transfers"
]
filtered_df = complaints_df[complaints_df['Product'].isin(required_products)]

# 2. Remove records with empty 'Consumer complaint narrative'
filtered_df = filtered_df.dropna(subset=['Consumer complaint narrative'])

# 3. Normalize text (lowercase and remove special characters)
def clean_text(text):
    # Convert to lowercase
    text = str(text).lower()
    # Remove non-alphabetic characters (punctuation, digits, etc.)
    text = re.sub(r'[^a-z\s]', '', text)
    return text

filtered_df['cleaned_narrative'] = filtered_df['Consumer complaint narrative'].apply(clean_text)

# Show the cleaned data sample
filtered_df[['Product', 'Consumer complaint narrative', 'cleaned_narrative']].head()


Unnamed: 0,Product,Consumer complaint narrative,cleaned_narrative
12237,Credit card,A XXXX XXXX card was opened under my name by a...,a xxxx xxxx card was opened under my name by a...
13280,Credit card,"Dear CFPB, I have a secured credit card with c...",dear cfpb i have a secured credit card with ci...
13506,Credit card,I have a Citi rewards cards. The credit balanc...,i have a citi rewards cards the credit balance...
13955,Credit card,b'I am writing to dispute the following charge...,bi am writing to dispute the following charges...
14249,Credit card,"Although the account had been deemed closed, I...",although the account had been deemed closed i ...


In [6]:
# Cell 5: Save the Cleaned Data

# Save the filtered and cleaned dataset to a new file
output_path = '../data/filtered/filtered_complaints.csv'  # Adjust the path if needed
filtered_df.to_csv(output_path, index=False)

print(f"Cleaned data saved to: {output_path}")


Cleaned data saved to: ../data/filtered/filtered_complaints.csv
