In [None]:
# ==========================================================
# PROJECT: Consumer Complaint Text Classification
# COMPANY: Kaiburr
# AUTHOR: Sanathraj S
# ==========================================================

# -------------------------------
# Step 0: Import Libraries
# -------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk
import requests
import io
from nltk.corpus import stopwords
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline

# Make sure the NLTK English stop-word corpus is available locally; clean_text()
# reads it via stopwords.words('english').
nltk.download('stopwords')

# -------------------------------
# Step 1: Load Dataset (Automatic)
# -------------------------------
# Download the complete complaint archive (a zipped CSV) from the CFPB file
# server and load it into `df`. Any download failure stops the run right here,
# because every later step depends on `df` existing.
print("üì• Downloading latest Consumer Complaint dataset...")
url = "https://files.consumerfinance.gov/ccdb/complaints.csv.zip"

try:
    # (connect, read) timeouts in seconds, so a stalled connection cannot hang
    # the run indefinitely. The read timeout applies between received chunks,
    # not to the whole transfer.
    response = requests.get(url, timeout=(10, 120))
except requests.RequestException as err:
    print("‚ùå Download failed, please check internet connection.")
    raise RuntimeError(f"Could not reach {url}") from err

if response.status_code != 200:
    print("‚ùå Download failed, please check internet connection.")
    raise RuntimeError(
        f"Download of {url} failed with HTTP status {response.status_code}"
    )

# Decompress and parse straight from memory; nothing is written to disk.
df = pd.read_csv(io.BytesIO(response.content), compression='zip', low_memory=False)
print("‚úÖ Dataset successfully downloaded!")

print("Dataset shape:", df.shape)
print("Columns:", df.columns.tolist()[:10])

# -------------------------------
# Step 2: Select Relevant Columns
# -------------------------------
# Narrow the frame to the target label and the complaint text, discarding any
# row where either value is missing.
needed_columns = ['Product', 'Consumer complaint narrative']
df = df.loc[:, needed_columns].dropna()

# -------------------------------
# Step 3: Filter Required Categories
# -------------------------------
# Product label -> integer class id used as the training target.
category_map = {
    'Credit reporting, repair, or other personal consumer reports': 0,
    'Debt collection': 1,
    'Consumer Loan': 2,
    'Mortgage': 3
}

# NOTE(review): the CFPB export is believed to label the credit-reporting
# product "Credit reporting, credit repair services, or other personal consumer
# reports", which does not match the shortened key above and would leave
# class 0 empty. Map the published label onto the key used here; confirm the
# exact spelling(s) with df['Product'].unique().
product_aliases = {
    'Credit reporting, credit repair services, or other personal consumer reports':
        'Credit reporting, repair, or other personal consumer reports',
}

# Keep rows whose product is a target category or a known alias of one.
# .copy() yields an independent frame so the column assignments below never
# write into a slice of the larger frame.
wanted_products = set(category_map) | set(product_aliases)
df = df[df['Product'].isin(wanted_products)].copy()
df['Product'] = df['Product'].replace(product_aliases)
df['Category'] = df['Product'].map(category_map)
df.reset_index(drop=True, inplace=True)

# Surface empty classes early: a category with no rows silently shrinks the
# classification problem and is easy to miss further down.
missing_categories = [
    name for name in category_map if not (df['Product'] == name).any()
]
if missing_categories:
    print("Warning: no complaints found for:", missing_categories)

print("\nCategory Distribution:")
print(df['Product'].value_counts())

# -------------------------------
# Step 4: Exploratory Data Analysis
# -------------------------------
# Horizontal bar chart of complaint volume per product, ordered by the
# value_counts() ranking.
products_by_volume = df['Product'].value_counts().index
plt.figure(figsize=(8, 4))
sns.countplot(data=df, y='Product', order=products_by_volume, palette='viridis')
plt.title("Complaint Count by Category")
plt.xlabel("Count")
plt.ylabel("Category")
plt.show()

# Word cloud built from every narrative joined into one space-separated string.
all_narratives = df['Consumer complaint narrative'].astype(str)
text = " ".join(all_narratives)
wordcloud = WordCloud(background_color='white', width=800, height=400)
wordcloud = wordcloud.generate(text)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title("Most Frequent Words in Complaints")
plt.show()

# -------------------------------
# Step 5: Text Cleaning Function
# -------------------------------
_ENGLISH_STOP_WORDS = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    """Return the NLTK English stop-word list as a frozenset, loading it once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def clean_text(text, stop_words=None):
    """Normalise one complaint narrative for vectorisation.

    The text is lower-cased, every character other than a-z and whitespace is
    deleted, runs of whitespace are collapsed, and stop words are removed.

    Args:
        text: The raw narrative string.
        stop_words: Optional collection of words to drop. Defaults to the NLTK
            English stop-word list, which is read from the corpus only once and
            then reused, instead of being rebuilt for every row.

    Returns:
        The remaining tokens joined by single spaces ("" if none remain).
    """
    if stop_words is None:
        stop_words = _english_stop_words()

    text = text.lower()
    # Characters are deleted, not replaced by a space, so "can't" becomes
    # "cant" and "credit-report" becomes "creditreport".
    text = re.sub(r'[^a-z\s]', '', text)  # Keep only letters and spaces
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = [word for word in text.split() if word not in stop_words]
    return " ".join(tokens)

# Normalise every narrative and store the result alongside the raw text.
df['clean_text'] = df['Consumer complaint narrative'].map(clean_text)

# -------------------------------
# Step 6: Train-Test Split
# -------------------------------
# Hold out 20% of the rows for evaluation, stratified on the class id, with a
# fixed seed.
features = df['clean_text']
labels = df['Category']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

# -------------------------------
# Step 7: Model Selection and Training
# -------------------------------
# Candidate classifiers, keyed by the display name used in reports and plots.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "Linear SVM": LinearSVC(),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42)
}

# Display name -> accuracy on the held-out test split.
results = {}

# The TF-IDF features do not depend on the classifier, so they are computed
# once and shared by every model instead of being refit inside the loop.
# The vocabulary is learned from the training split only.
tfidf = TfidfVectorizer(max_features=7000, ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

for name, model in models.items():
    print(f"\nüîπ Training {name}...")
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    print(f"{name} Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))

# -------------------------------
# Step 8: Model Comparison
# -------------------------------
# Bar chart of test accuracy, one bar per model in training order.
model_names = list(results)
model_scores = [results[model_name] for model_name in model_names]
plt.figure(figsize=(7, 4))
sns.barplot(x=model_names, y=model_scores, palette='mako')
plt.title("Model Accuracy Comparison")
plt.ylabel("Accuracy")
plt.ylim(0, 1)
plt.show()

# Pick the highest-accuracy model; on a tie the one trained first wins.
best_model_name = max(results, key=lambda model_name: results[model_name])
print(f"\nüèÜ Best Model: {best_model_name} ({results[best_model_name]:.2%})")

# -------------------------------
# Step 9: Evaluate Best Model
# -------------------------------
# Wrap the winning classifier with its own TF-IDF step so the resulting
# pipeline accepts text directly, then fit it on the training split.
final_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=7000, ngram_range=(1,2))),
    ('clf', models[best_model_name])
])

final_pipeline.fit(X_train, y_train)
y_pred_final = final_pipeline.predict(X_test)

# Pin the class order explicitly. Without `labels=`, confusion_matrix only
# includes classes that occur in y_test / y_pred_final, so a category with no
# rows would make the matrix smaller than the list of tick labels.
id_to_name = {class_id: name for name, class_id in category_map.items()}
class_ids = sorted(id_to_name)
class_names = [id_to_name[class_id] for class_id in class_ids]

cm = confusion_matrix(y_test, y_pred_final, labels=class_ids)
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='coolwarm',
            xticklabels=class_names,
            yticklabels=class_names)
plt.title(f"{best_model_name} Confusion Matrix")
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.xticks(rotation=75)
plt.show()

# -------------------------------
# Step 10: Prediction on New Complaints
# -------------------------------
# Class id -> product label, for turning predictions back into readable names.
reverse_map = {v:k for k,v in category_map.items()}
sample_complaints = [
    "They keep calling me for a debt I already paid last year.",
    "There are incorrect accounts showing on my credit report.",
    "My mortgage application is delayed for no reason.",
    "The consumer loan interest rate they charged is wrong."
]

# final_pipeline was fitted on clean_text() output, so new text has to go
# through the same cleaning before prediction. Otherwise the model sees stop
# words and n-grams that never appeared in its training features.
cleaned_samples = [clean_text(complaint) for complaint in sample_complaints]
preds = final_pipeline.predict(cleaned_samples)

print("\nüìä Sample Predictions:")
for text, label in zip(sample_complaints, preds):
    print(f"Complaint: {text}\nPredicted Category: {reverse_map[label]}\n")

print("\nüéØ Final Model Ready for Deployment and Submission")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


üì• Downloading latest Consumer Complaint dataset...
