# Complaint Category Classification using Text
This notebook trains a classifier to predict the type of complaint based on customer narrative text.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import joblib
import re
import string


In [None]:
# Upload the dataset if running in Colab
from google.colab import files
uploaded = files.upload()

# Load the CSV
df = pd.read_csv(next(iter(uploaded)))
df.head()



SyntaxError: invalid syntax. Perhaps you forgot a comma? (817927805.py, line 9)

In [12]:
# Drop empty rows and remove extra index column if present
df.dropna(subset=['narrative'], inplace=True)
if 'Unnamed: 0' in df.columns:
    df.drop(columns=['Unnamed: 0'], inplace=True)

df['product'].value_counts()


Unnamed: 0_level_0,count
product,Unnamed: 1_level_1
credit_reporting,91172
debt_collection,23148
mortgages_and_loans,18990
credit_card,15566
retail_banking,13535


In [13]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    tokens = text.split()
    tokens = [word for word in tokens if word not in STOPWORDS]
    return ' '.join(tokens)

df['clean_text'] = df['narrative'].apply(clean_text)
df[['clean_text', 'product']].head()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,clean_text,product
0,purchase order day shipping amount receive pro...,credit_card
1,forwarded message date tue subject please inve...,credit_card
2,forwarded message cc sent friday pdt subject f...,retail_banking
3,payment history missing credit report speciali...,credit_reporting
4,payment history missing credit report made mis...,credit_reporting


In [18]:
X = df['clean_text']
y = df['product']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [23]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
 # Import LogisticRegression here
from sklearn.feature_extraction.text import TfidfVectorizer

In [24]:
# Initialize TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the training data and transform both training and testing data
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)



In [25]:
# Dictionary to hold models
models = {
    'Logistic Regression': LogisticRegression(max_iter=200),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Multinomial Naive Bayes': MultinomialNB()
}

# Train and evaluate each model
results = {}
for name, model in models.items():
    print(f"Training {name}...")
    # Use the vectorized training data
    model.fit(X_train_vec, y_train)
    # Use the vectorized testing data for prediction
    y_pred = model.predict(X_test_vec)

    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    results[name] = {
        'accuracy': accuracy,
        'report': report,
        'model': model  # Store the trained model
    }

    print(f"{name} Accuracy: {accuracy}")
    print(f"{name} Classification Report:\n{report}")
    print("-" * 30)


Training Logistic Regression...
Logistic Regression Accuracy: 0.8765815965274143
Logistic Regression Classification Report:
                     precision    recall  f1-score   support

        credit_card       0.80      0.78      0.79      3113
   credit_reporting       0.91      0.94      0.92     18235
    debt_collection       0.82      0.74      0.77      4630
mortgages_and_loans       0.86      0.83      0.84      3798
     retail_banking       0.88      0.88      0.88      2707

           accuracy                           0.88     32483
          macro avg       0.85      0.83      0.84     32483
       weighted avg       0.87      0.88      0.88     32483

------------------------------
Training Decision Tree...
Decision Tree Accuracy: 0.8228611889295939
Decision Tree Classification Report:
                     precision    recall  f1-score   support

        credit_card       0.67      0.64      0.66      3113
   credit_reporting       0.89      0.92      0.90     18235
   

In [27]:
# Compare and find the best model
best_model_name = None
best_accuracy = 0

print("Model Comparison:")
for name, metrics in results.items():
    print(f"{name}: Accuracy = {metrics['accuracy']:.4f}")
    if metrics['accuracy'] > best_accuracy:
        best_accuracy = metrics['accuracy']
        best_model_name = name

print(f"\nBest Model: {best_model_name} with Accuracy = {best_accuracy:.4f}")

# Dump the best model
best_model = results[best_model_name]['model']
joblib.dump(best_model, f'{best_model_name.replace(" ", "_")}_best_model.pkl')
print()

Model Comparison:
Logistic Regression: Accuracy = 0.8766
Decision Tree: Accuracy = 0.8229
Random Forest: Accuracy = 0.8764
Multinomial Naive Bayes: Accuracy = 0.8150

Best Model: Logistic Regression with Accuracy = 0.8766

