In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set display and plot styles
pd.set_option('display.max_columns', 100)
sns.set(style='whitegrid')
%matplotlib inline

# Load the filtered dataset
df = pd.read_csv("../data/amex_credit_card_complaints.csv")

print("Dataset Loaded!")
print("Shape:", df.shape)
df.head()


Dataset Loaded!
Shape: (14456, 18)


Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,2025-07-19,credit card,General-purpose credit card or charge card,Trouble using your card,Can't use card to make purchases,,,american express company,NJ,7748,,,Web,2025-07-19,In progress,Yes,,14759032
1,2023-09-15,credit card,General-purpose credit card or charge card,"Other features, terms, or problems",Problem with rewards from credit card,I am writing to file a formal complaint with t...,,american express company,GA,30319,,Consent provided,Web,2023-09-15,Closed with explanation,Yes,,7555986
2,2025-05-20,credit card,General-purpose credit card or charge card,Closing your account,Company closed your account,American Express Closed my business credit ( c...,,american express company,MA,2478,,Consent provided,Web,2025-05-20,Closed with explanation,Yes,,13615367
3,2024-04-17,credit card,General-purpose credit card or charge card,Getting a credit card,Application denied,"On XXXX XXXX, I applied for an American Expres...",,american express company,FL,33026,,Consent provided,Web,2024-04-17,Closed with explanation,Yes,,8798047
4,2024-03-25,credit card,General-purpose credit card or charge card,Problem with a company's investigation into an...,Their investigation did not fix an error on yo...,I am a victim of identity theft I have two cas...,,american express company,GA,30032,,Consent provided,Web,2024-03-27,Closed with non-monetary relief,Yes,,8624856


In [2]:
# Normalise the headers: lower-case them and turn spaces into underscores.
# Other punctuation (e.g. "?" or "-") is deliberately left untouched.
df.columns = df.columns.map(lambda header: header.lower().replace(" ", "_"))

# Parse both date columns into datetime values.
for date_col in ("date_received", "date_sent_to_company"):
    df[date_col] = pd.to_datetime(df[date_col])

print("Column names cleaned & date columns parsed.")


Column names cleaned & date columns parsed.


In [3]:
# Share of missing values per column (only columns that have any), largest first.
missing = df.isnull().mean().sort_values(ascending=False)
missing = missing[missing > 0]
print("Columns with missing values:\n", missing)

# Drop rows with no narrative (required for NLP later).
# .copy() makes the filtered frame independent of the original, so the column
# assignments below cannot trigger pandas' SettingWithCopyWarning.
df = df.dropna(subset=['consumer_complaint_narrative']).copy()

# Placeholder used for each categorical column that gets a default.
# Columns not listed here are intentionally left as-is (NaN stays NaN).
fill_defaults = {
    'consumer_disputed?': "Unknown",
    'timely_response?': "Unknown",
    'tags': "Not Provided",
    'consumer_consent_provided?': "Not Provided",
    'company_public_response': "No response",
}
for column, default in fill_defaults.items():
    df[column] = df[column].fillna(default)

print("Missing values handled.")

# Be explicit about what was NOT filled, so the message above is not read as
# "the frame is now free of missing values".
still_missing = df.columns[df.isnull().any()].tolist()
if still_missing:
    print("Columns left with missing values (not filled):", still_missing)


Columns with missing values:
 company_public_response         1.000000
tags                            0.853210
consumer_complaint_narrative    0.623547
consumer_disputed?              0.538669
sub-issue                       0.461608
sub-product                     0.461331
consumer_consent_provided?      0.356253
state                           0.008301
zip_code                        0.001937
dtype: float64
Missing values handled.


In [4]:
# A narrative must be strictly longer than this many characters to be kept.
MIN_NARRATIVE_CHARS = 30

# Drop duplicates (same narrative + issue); the first occurrence is kept.
df = df.drop_duplicates(subset=['consumer_complaint_narrative', 'issue'])

# Remove complaints that are too short. Length is measured after stripping
# leading/trailing whitespace so padding does not count as content.
narrative_length = df['consumer_complaint_narrative'].str.strip().str.len()
df = df[narrative_length > MIN_NARRATIVE_CHARS]

# Renumber rows 0..n-1 after the filtering above.
df = df.reset_index(drop=True)

print("Removed duplicates and too-short complaints.")


Removed duplicates and too-short complaints.


In [5]:
# Quick sanity summary of the cleaned frame.
received = df["date_received"]
overview = [
    ("Number of rows:", len(df)),
    ("Unique issues:", df["issue"].nunique()),
    ("Unique submission channels:", df["submitted_via"].unique()),
]
for label, value in overview:
    print(label, value)
print("Date range:", received.min(), "to", received.max())


Number of rows: 5409
Unique issues: 44
Unique submission channels: ['Web']
Date range: 2015-03-19 00:00:00 to 2025-06-05 00:00:00


In [6]:
# Persist the cleaned frame; the row index is not written to the file.
cleaned_path = "../data/amex_cleaned_complaints.csv"
df.to_csv(cleaned_path, index=False)
print(f"Cleaned dataset saved to: {cleaned_path}")


Cleaned dataset saved to: ../data/amex_cleaned_complaints.csv
