In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('complaints.csv')
print(df.head())

  df = pd.read_csv('complaints.csv')


  Date received                                            Product  \
0    2025-05-23                                    Debt collection   
1    2025-05-23                                    Debt collection   
2    2025-04-18  Credit reporting or other personal consumer re...   
3    2025-06-08                                    Debt collection   
4    2025-06-08  Credit reporting or other personal consumer re...   

        Sub-product                                              Issue  \
0     I do not know                  Attempts to collect debt not owed   
1  Credit card debt                    Written notification about debt   
2  Credit reporting               Incorrect information on your report   
3     I do not know                  Attempts to collect debt not owed   
4  Credit reporting  Problem with a company's investigation into an...   

                                          Sub-issue  \
0                 Debt was result of identity theft   
1  Didn't receive enough

In [3]:
# Get basic information about the dataset: data types, non-null counts, memory usage
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9408224 entries, 0 to 9408223
Data columns (total 18 columns):
 #   Column                        Dtype 
---  ------                        ----- 
 0   Date received                 object
 1   Product                       object
 2   Sub-product                   object
 3   Issue                         object
 4   Sub-issue                     object
 5   Consumer complaint narrative  object
 6   Company public response       object
 7   Company                       object
 8   State                         object
 9   ZIP code                      object
 10  Tags                          object
 11  Consumer consent provided?    object
 12  Submitted via                 object
 13  Date sent to company          object
 14  Company response to consumer  object
 15  Timely response?              object
 16  Consumer disputed?            object
 17  Complaint ID                  int64 
dtypes: int64(1), object(17)
memory usage: 1.3+

In [4]:
# Summary statistics for numerical columns
print(df.describe())

       Complaint ID
count  9.408224e+06
mean   7.963796e+06
std    3.709877e+06
min    1.000000e+00
25%    4.922783e+06
50%    8.326272e+06
75%    1.111860e+07
max    1.396403e+07


In [5]:
# Check for missing values in each column
print(df.isnull().sum())

Date received                         0
Product                               0
Sub-product                      235295
Issue                                 6
Sub-issue                        837100
Consumer complaint narrative    6474487
Company public response         4671635
Company                               0
State                             54276
ZIP code                          30228
Tags                            8784880
Consumer consent provided?      1625136
Submitted via                         0
Date sent to company                  0
Company response to consumer         20
Timely response?                      0
Consumer disputed?              8639924
Complaint ID                          0
dtype: int64


In [6]:
# Check for duplicate rows
print(f"Number of duplicate rows: {df.duplicated().sum()}")

Number of duplicate rows: 0


In [7]:
categorical_cols = ['Product','Sub-product','Issue','Sub-issue','Company complaint narrative','Company public response','Company','State','ZIP code','Tags','Consumer consent provided?','Submitted via','Company response to consumer','Timely response?','Consumer disputed?']

for col in categorical_cols:
    if col in df.columns:
        print(f"Unique values in '{col}':")
        print(df[col].unique())
        print('-' * 40)

Unique values in 'Product':
['Debt collection' 'Credit reporting or other personal consumer reports'
 'Money transfer, virtual currency, or money service' 'Mortgage'
 'Checking or savings account' 'Credit card' 'Vehicle loan or lease'
 'Prepaid card' 'Debt or credit management' 'Student loan'
 'Payday loan, title loan, personal loan, or advance loan'
 'Credit reporting, credit repair services, or other personal consumer reports'
 'Consumer Loan' 'Credit card or prepaid card' 'Bank account or service'
 'Payday loan' 'Credit reporting'
 'Payday loan, title loan, or personal loan' 'Money transfers'
 'Other financial service' 'Virtual currency']
----------------------------------------
Unique values in 'Sub-product':
['I do not know' 'Credit card debt' 'Credit reporting' 'Virtual currency'
 'Conventional home mortgage' 'Reverse mortgage' 'FHA mortgage'
 'Other debt' 'Checking account' 'Mobile or digital wallet'
 'Store credit card' 'Loan' 'Foreign currency exchange' 'Auto debt'
 'General-p

In [8]:
# Drop columns with more than 50% missing values
threshold = len(df) * 0.5
df = df.dropna(axis=1, thresh=threshold)

In [9]:
print(df.isnull().sum()) 

Date received                         0
Product                               0
Sub-product                      235295
Issue                                 6
Sub-issue                        837100
Company public response         4671635
Company                               0
State                             54276
ZIP code                          30228
Consumer consent provided?      1625136
Submitted via                         0
Date sent to company                  0
Company response to consumer         20
Timely response?                      0
Complaint ID                          0
dtype: int64


In [10]:
# Impute missing values in categorical columns with the most frequent value
from sklearn.impute import SimpleImputer

for col in categorical_cols:
    if col in df.columns:
        imputer = SimpleImputer(strategy='most_frequent')
        # Use .ravel() to flatten the 2D array to 1D 
        df[col] = imputer.fit_transform(df[[col]]).ravel()


In [11]:
print(df.isnull().sum())

Date received                   0
Product                         0
Sub-product                     0
Issue                           0
Sub-issue                       0
Company public response         0
Company                         0
State                           0
ZIP code                        0
Consumer consent provided?      0
Submitted via                   0
Date sent to company            0
Company response to consumer    0
Timely response?                0
Complaint ID                    0
dtype: int64


In [12]:
# Get number of rows and columns
rows, columns = df.shape
print(f"Number of rows: {rows}")
print(f"Number of columns: {columns}") 

Number of rows: 9408224
Number of columns: 15


In [13]:
product_mapping = {
    'Credit reporting or other personal consumer report': 0,
    'Debt collection': 1,
    'Consumer Loan': 2,
    'Mortgage': 3
}
df['Category'] = df['Product'].map(product_mapping)
df = df.dropna(subset=['Category'])
df['Category'] = df['Category'].astype(int)


In [14]:
df['text'] = df['Issue'].fillna('') + ' ' + df['Sub-issue'].fillna('')

In [15]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-letter characters
    text = re.sub(r'\s+', ' ', text).strip() # Remove extra spaces
    return text

df['cleaned_text'] = df['text'].apply(clean_text)


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_text = vectorizer.fit_transform(df['cleaned_text'])

# During training, after fitting the vectorizer:
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl') # Save the vectorizer


['tfidf_vectorizer.pkl']

In [17]:
print(df.head())

    Date received          Product       Sub-product  \
0      2025-05-23  Debt collection     I do not know   
1      2025-05-23  Debt collection  Credit card debt   
3      2025-06-08  Debt collection     I do not know   
88     2025-06-07  Debt collection     I do not know   
143    2025-05-10  Debt collection  Credit card debt   

                                 Issue  \
0    Attempts to collect debt not owed   
1      Written notification about debt   
3    Attempts to collect debt not owed   
88   Attempts to collect debt not owed   
143              Communication tactics   

                                             Sub-issue  \
0                    Debt was result of identity theft   
1     Didn't receive enough information to verify debt   
3                    Debt was result of identity theft   
88                                   Debt is not yours   
143  You told them to stop contacting you, but they...   

                               Company public response  \
0  

In [18]:
print(df.shape)

(1241493, 18)


In [19]:
from sklearn.model_selection import train_test_split
# Target variable
y = df['Category']

X_train, X_test, y_train, y_test = train_test_split(
    X_text, y, test_size=0.3, random_state=42, stratify=y
)


In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
import warnings
import joblib
warnings.filterwarnings("ignore")

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000,penalty='l1', solver='liblinear', C=0.0001),
    'Multinomial Naive Bayes': MultinomialNB(alpha=1000.0),
    'Linear SVC': LinearSVC(C=0.001, tol=1, max_iter=1000)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    print(f"{name} trained.")

joblib.dump(models['Logistic Regression'], 'logistic_regression_model.pkl')
joblib.dump(models['Multinomial Naive Bayes'], 'multinomial_nb_model.pkl')
joblib.dump(models['Linear SVC'], 'linear_svc_model.pkl')

Logistic Regression trained.
Multinomial Naive Bayes trained.
Linear SVC trained.


['linear_svc_model.pkl']

In [22]:
from sklearn.metrics import classification_report, accuracy_score

for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"--- {name} ---")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))


--- Logistic Regression ---
Accuracy: 0.9398520061860984
              precision    recall  f1-score   support

           1       0.95      1.00      0.97    236563
           2       0.00      0.00      0.00      9472
           3       0.92      0.90      0.91    126413

    accuracy                           0.94    372448
   macro avg       0.62      0.63      0.63    372448
weighted avg       0.92      0.94      0.93    372448

--- Multinomial Naive Bayes ---
Accuracy: 0.9743991107483461
              precision    recall  f1-score   support

           1       1.00      1.00      1.00    236563
           2       0.00      0.00      0.00      9472
           3       0.93      1.00      0.96    126413

    accuracy                           0.97    372448
   macro avg       0.64      0.67      0.65    372448
weighted avg       0.95      0.97      0.96    372448

--- Linear SVC ---
Accuracy: 0.9693541111779362
              precision    recall  f1-score   support

           1     

In [23]:
import pandas as pd

sample_data = pd.DataFrame({
    'Issue': [
        'Attempts to collect debt not owed',
        'Incorrect information on your report',
        'Communication tactics'
    ],
    'Sub-issue': [
        'Debt was result of identity theft',
        'Information belongs to someone else',
        'You told them to stop contacting you, but they continue'
    ],
    'Company public response': [
        'Company has responded to the consumer and the CFPB',
        'Company has responded to the consumer and the CFPB',
        'Company has responded to the consumer and the CFPB'
    ],
    'Company': [
        'CCS Financial Services, Inc.',
        'Experian Information Solutions Inc.',
        'ZWICKER & ASSOCIATES'
    ],
    'State': ['TX', 'TX', 'TX'],
    'ZIP code': ['75006', '76114', '78254'],
    'Consumer consent provided?': [
        'Consent not provided',
        'Consent not provided',
        'Consent not provided'
    ],
    'Submitted via': ['Web', 'Web', 'Web'],
    'Date sent to company': ['2025-05-23', '2025-05-22', '2025-05-22'],
    'Company response to consumer': [
        'Untimely response',
        'In progress',
        'Closed with explanation'
    ],
    'Timely response?': ['No', 'Yes', 'Yes'],
    'Complaint ID': [13688046, 13049884, 13446430],
    'Category': [1, 0, 1],  # Use only for evaluation, not for prediction
    'text': [
        'Attempts to collect debt not owed Debt was result of identity theft',
        'Incorrect information on your report Information belongs to someone else',
        'Communication tactics You told them to stop contacting you, but they continue'
    ],
    'cleaned_text': [
        'attempts to collect debt not owed debt was result of identity theft',
        'incorrect information on your report information belongs to someone else',
        'communication tactics you told them to stop contacting you but they continue'
    ]
})


In [24]:
vectorizer = joblib.load('tfidf_vectorizer.pkl')
# Use your previously fitted vectorizer
X_new = vectorizer.transform(sample_data['cleaned_text'])

log_reg = joblib.load('logistic_regression_model.pkl')
mnb = joblib.load('multinomial_nb_model.pkl')
svc = joblib.load('linear_svc_model.pkl')

# Use your trained models
log_reg_preds = log_reg.predict(X_new)
mnb_preds = mnb.predict(X_new)
svc_preds = svc.predict(X_new)


In [25]:
print("Logistic Regression predictions:", log_reg_preds)
print("MultinomialNB predictions:", mnb_preds)
print("LinearSVC predictions:", svc_preds)


Logistic Regression predictions: [1 3 1]
MultinomialNB predictions: [1 3 1]
LinearSVC predictions: [1 3 1]
