In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import warnings
warnings.filterwarnings("ignore")


In [13]:
df = pd.read_csv(r"C:\Users\srive\Downloads\complaints.csv\complaints.csv")


# Display dataset structure
print(df.info())
print(df.head())

# Check for missing values
print(df.isnull().sum())

# Selecting only relevant columns (Product and Consumer Complaint)
df = df[['Product', 'Consumer complaint narrative']].dropna()

# Mapping product categories to predefined labels
category_mapping = {
    'Credit reporting, repair, or other': 0,
    'Debt collection': 1,
    'Consumer Loan': 2,
    'Mortgage': 3
}
df = df[df['Product'].isin(category_mapping.keys())]  # Filtering only required categories
df['Category'] = df['Product'].map(category_mapping)

print(df['Category'].value_counts())  # Check class distribution


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8258542 entries, 0 to 8258541
Data columns (total 18 columns):
 #   Column                        Dtype 
---  ------                        ----- 
 0   Date received                 object
 1   Product                       object
 2   Sub-product                   object
 3   Issue                         object
 4   Sub-issue                     object
 5   Consumer complaint narrative  object
 6   Company public response       object
 7   Company                       object
 8   State                         object
 9   ZIP code                      object
 10  Tags                          object
 11  Consumer consent provided?    object
 12  Submitted via                 object
 13  Date sent to company          object
 14  Company response to consumer  object
 15  Timely response?              object
 16  Consumer disputed?            object
 17  Complaint ID                  int64 
dtypes: int64(1), object(17)
memory usage: 1.1+

In [14]:
nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words("english"))

def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters and numbers
    text = word_tokenize(text)  # Tokenization
    text = [word for word in text if word not in stop_words]  # Remove stopwords
    return " ".join(text)

df['Processed_Text'] = df['Consumer complaint narrative'].apply(clean_text)

print(df[['Consumer complaint narrative', 'Processed_Text']].head())  # Check processed text


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\srive\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\srive\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                          Consumer complaint narrative  \
125  This debt was sold to a third party who has ca...   
219  I am requesting proof that I owe this debt, in...   
352  I do not owe any debt to this company, did not...   
413  I want to stress that I did not give written p...   
534  My dad called me on XX/XX/XXXX saying that he ...   

                                        Processed_Text  
125  debt sold third party called times days hours ...  
219  requesting proof owe debt including account in...  
352                           owe debt company approve  
413  want stress give written permission specific t...  
534  dad called xxxxxxxx saying received text sayin...  


In [16]:
# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['Processed_Text'], df['Category'], test_size=0.2, random_state=42)

# Convert text into numerical format using TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [None]:
# Initialize models
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

# Train and evaluate models
results = {}
for model_name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    results[model_name] = accuracy
    print(f"Model: {model_name}")
    print(classification_report(y_test, y_pred))
    print("\n")


Model: Naive Bayes
              precision    recall  f1-score   support

           1       0.97      0.97      0.97     62359
           2       0.70      0.29      0.41      1817
           3       0.91      0.96      0.93     25384

    accuracy                           0.95     89560
   macro avg       0.86      0.74      0.77     89560
weighted avg       0.95      0.95      0.95     89560



Model: Logistic Regression
              precision    recall  f1-score   support

           1       0.97      0.98      0.98     62359
           2       0.73      0.52      0.61      1817
           3       0.96      0.96      0.96     25384

    accuracy                           0.97     89560
   macro avg       0.89      0.82      0.85     89560
weighted avg       0.97      0.97      0.97     89560





In [None]:
# Plot model comparison
plt.figure(figsize=(8, 5))
sns.barplot(x=list(results.keys()), y=list(results.values()))
plt.title("Model Comparison")
plt.ylabel("Accuracy Score")
plt.show()


In [None]:
def predict_category(text):
    text_processed = clean_text(text)
    text_vectorized = vectorizer.transform([text_processed])
    prediction = models["Logistic Regression"].predict(text_vectorized)[0]  # Using best model
    return {0: "Credit Reporting", 1: "Debt Collection", 2: "Consumer Loan", 3: "Mortgage"}[prediction]

# Test prediction
sample_complaint = "I am having issues with my mortgage payment process."
print("Predicted Category:", predict_category(sample_complaint))
