In [1]:
import pandas as pd

# Location of the zipped raw complaints CSV (hosted on consumerfinance.gov).
url = "https://files.consumerfinance.gov/ccdb/complaints.csv.zip"

# pandas can download and decompress the archive in a single read_csv call.
print("Loading dataset... this may take a minute.")
try:
    df = pd.read_csv(url, compression='zip')
except Exception as e:
    # Report the failure, then re-raise. Swallowing the error here would leave
    # `df` undefined, and the next cell would fail with an unrelated NameError
    # instead of pointing at the real cause (download / parse failure).
    print(f"Error loading dataset: {e}")
    raise
else:
    print("Dataset loaded successfully!")

    # Preview the first 5 rows.
    print(df.head())

    # List the available columns.
    print("\nColumn Names:")
    print(df.columns)

Loading dataset... this may take a minute.
Dataset loaded successfully!
  Date received                                            Product  \
0    2020-07-06  Credit reporting, credit repair services, or o...   
1    2025-10-14  Credit reporting or other personal consumer re...   
2    2025-10-10  Credit reporting or other personal consumer re...   
3    2025-10-15  Credit reporting or other personal consumer re...   
4    2025-10-17  Credit reporting or other personal consumer re...   

        Sub-product                                 Issue  \
0  Credit reporting  Incorrect information on your report   
1  Credit reporting  Incorrect information on your report   
2  Credit reporting  Incorrect information on your report   
3  Credit reporting  Incorrect information on your report   
4  Credit reporting  Incorrect information on your report   

                                           Sub-issue  \
0                Information belongs to someone else   
1  Information is missing th

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# 1. Keep only the complaints that actually carry narrative text
print("Original data shape:", df.shape)
narrative_present = df['Consumer complaint narrative'].notna()
df_cleaned = df[narrative_present]
print("Shape after dropping empty complaints:", df_cleaned.shape)

# 2. The 4 product categories the classifier is trained on (taken from the PDF)
# NOTE(review): the sample rows printed when the dataset was loaded show a
# 'Product' value starting "Credit reporting or other personal consumer re..."
# which is not in this list, so such rows are excluded below -- confirm that
# this is intended.
target_categories = [
    "Credit reporting, credit repair services, or other personal consumer reports",
    "Debt collection",
    "Consumer Loan",
    "Mortgage"
]

# 3. Restrict the frame to those categories; .copy() gives an independent frame
in_target_category = df_cleaned['Product'].isin(target_categories)
df_filtered = df_cleaned.loc[in_target_category].copy()
print("Shape after filtering for 4 categories:", df_filtered.shape)

# 4. Features (X) are the narratives, raw labels (y_text) are the product names
X, y_text = df_filtered['Consumer complaint narrative'], df_filtered['Product']

# 5. Turn the product names into integer class ids
encoder = LabelEncoder()
y = encoder.fit_transform(y_text)

# --- Verification ---
print("\nData is ready for processing.")
print("Total samples:", len(y))

# Print which integer the encoder assigned to each category.
# The numbering is decided by the encoder, so it may not follow the list order above.
print("\nCategory to Number Mapping:")
for i, class_name in enumerate(encoder.classes_):
    print(f"'{class_name}': {i}")

Original data shape: (11535877, 18)
Shape after dropping empty complaints: (3416745, 18)
Shape after filtering for 4 categories: (1323496, 18)

Data is ready for processing.
Total samples: 1323496

Category to Number Mapping:
'Consumer Loan': 0
'Credit reporting, credit repair services, or other personal consumer reports': 1
'Debt collection': 2
'Mortgage': 3


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# Status message: the pre-processing pass over every narrative below can take a while.
print("Starting text pre-processing and data split...")

# 1. Define a pre-processing function
def preprocess_text(text):
    """Normalise one complaint narrative for vectorisation.

    The text is lowercased, every character that is not a lowercase ASCII
    letter or whitespace is deleted (digits and punctuation are removed, not
    replaced by a space, so "don't" becomes "dont"), runs of whitespace are
    collapsed to a single space, and leading/trailing whitespace is trimmed.

    Args:
        text: The raw narrative as a str.

    Returns:
        The cleaned string; empty if nothing but digits, punctuation or
        whitespace was present.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    single_spaced = re.sub(r'\s+', ' ', letters_only)
    return single_spaced.strip()

# 2. Clean every narrative in X with preprocess_text
# (one Python call per row, so this might take a moment)
X_preprocessed = X.apply(preprocess_text)
print("Text pre-processing complete.")

# 3. Hold out 20% of the samples for testing and train on the other 80%.
#    random_state pins the shuffle so the split is the same on every run;
#    stratify=y keeps the category balance in both sets.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(
    X_preprocessed, y, **split_options
)

print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")

# 4. TF-IDF vectorizer: converts the text into a numeric matrix.
#    stop_words='english' removes common English words and max_features
#    limits the vocabulary to 10000 terms.
tfidf_vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')

print("\nStarting TF-IDF vectorization (Feature Engineering)...")

# The vocabulary is learned from the training split only; the test split is
# merely transformed with that vocabulary.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print("Feature engineering complete.")
print(f"Shape of TF-IDF matrix for training data: {X_train_tfidf.shape}")

Starting text pre-processing and data split...
Text pre-processing complete.
Training samples: 1058796
Testing samples: 264700

Starting TF-IDF vectorization (Feature Engineering)...
Feature engineering complete.
Shape of TF-IDF matrix for training data: (1058796, 10000)


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Imported here because this cell's import header only brings in the estimators.
from sklearn.multiclass import OneVsRestClassifier

print("--- Step 4: Model Selection & Training ---")

# --- Model 1: Logistic Regression ---
print("\nTraining Logistic Regression model...")
# One-vs-rest is requested by wrapping the estimator in OneVsRestClassifier
# instead of passing multi_class='ovr': that argument is deprecated in
# scikit-learn 1.5, and the wrapper is the replacement the library documents
# for keeping a one-vs-rest scheme with the liblinear solver.
lr_model = OneVsRestClassifier(
    LogisticRegression(solver='liblinear', random_state=42)
)
lr_model.fit(X_train_tfidf, y_train)
print("Logistic Regression training complete.")

# --- Model 2: Multinomial Naive Bayes ---
print("\nTraining Naive Bayes model...")
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)
print("Naive Bayes training complete.")

print("\n--- Step 5: Model Comparison & Evaluation ---")

# Human-readable class names, in the order of the encoder's integer ids
category_names = encoder.classes_


def evaluate_model(title, model):
    """Print the accuracy and per-class report of `model` on the test split.

    Args:
        title: Display name used in the section header line.
        model: A fitted classifier exposing ``predict``.

    Returns:
        The predictions made for ``X_test_tfidf``.
    """
    print(f"\n--- {title} Evaluation ---")
    y_pred = model.predict(X_test_tfidf)
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=category_names))
    return y_pred


# Both models go through the same evaluation so their reports are comparable.
y_pred_lr = evaluate_model("Logistic Regression", lr_model)
y_pred_nb = evaluate_model("Naive Bayes", nb_model)

--- Step 4: Model Selection & Training ---

Training Logistic Regression model...




Logistic Regression training complete.

Training Naive Bayes model...
Naive Bayes training complete.

--- Step 5: Model Comparison & Evaluation ---

--- Logistic Regression Evaluation ---
Accuracy: 0.9091

Classification Report:
                                                                              precision    recall  f1-score   support

                                                               Consumer Loan       0.73      0.30      0.42      1892
Credit reporting, credit repair services, or other personal consumer reports       0.92      0.95      0.93    161456
                                                             Debt collection       0.88      0.84      0.86     74380
                                                                    Mortgage       0.92      0.93      0.93     26972

                                                                    accuracy                           0.91    264700
                                                             

In [5]:
import numpy as np

print("--- Step 6: Prediction ---")

# Model used for inference; point this at nb_model instead if Naive Bayes scored better.
best_model = lr_model

# 1. Hand-written example complaints to classify
new_complaints = [
    "I checked my credit report and there is an account that does not belong to me!",
    "A company keeps calling my cell phone trying to collect a debt that I already paid off.",
    "My application for a car loan was denied, and I don't know why.",
    "I am having an issue with my mortgage escrow account, the payment is wrong."
]

print(f"Predicting categories for {len(new_complaints)} new complaints...\n")

# 2-5. Send the examples through the same pipeline as the training data:
#      clean the text -> TF-IDF with the fitted vectorizer -> predict class ids
#      -> map the ids back to category names.
processed_complaints = list(map(preprocess_text, new_complaints))
new_complaints_tfidf = tfidf_vectorizer.transform(processed_complaints)
predictions = best_model.predict(new_complaints_tfidf)
predicted_categories = encoder.inverse_transform(predictions)

# --- Display the results ---
for complaint, category in zip(new_complaints, predicted_categories):
    print(f"Complaint: \"{complaint}\"")
    print(f"--> Predicted Category: {category}\n")

--- Step 6: Prediction ---
Predicting categories for 4 new complaints...

Complaint: "I checked my credit report and there is an account that does not belong to me!"
--> Predicted Category: Credit reporting, credit repair services, or other personal consumer reports

Complaint: "A company keeps calling my cell phone trying to collect a debt that I already paid off."
--> Predicted Category: Debt collection

Complaint: "My application for a car loan was denied, and I don't know why."
--> Predicted Category: Credit reporting, credit repair services, or other personal consumer reports

Complaint: "I am having an issue with my mortgage escrow account, the payment is wrong."
--> Predicted Category: Mortgage

