In [None]:
# Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load model/vectorizer if saved
from joblib import load
from sklearn.metrics import classification_report

# Keyword cleaner applied to complaint text before vectorizing (evaluate_on_cfpb calls it, so it is required, not optional)
def clean_keywords(text):
    """Lowercase ``text`` and strip out lender-related keywords.

    Each phrase is removed by plain substring replacement, in the order
    listed below, so matches inside longer words are removed too
    (e.g. "reunion" becomes "re"). Whitespace around a removed phrase is
    left as-is.

    Args:
        text: The string to clean.

    Returns:
        The lowercased string with every listed phrase deleted.
    """
    lender_phrases = (
        "credit union", "union", "payday", "loanmart", "loan mart", "lending",
        "cashnet", "advance america", "quick cash", "title loan", "speedy cash",
        "tribal loan", "easy finance", "short-term loan", "bad credit loan",
        "instant cash", "get money", "fast loan", "borrow instantly",
    )
    # Lowercasing once up front is enough: removing substrings never
    # reintroduces uppercase characters.
    cleaned = text.lower()
    for phrase in lender_phrases:
        cleaned = cleaned.replace(phrase, "")
    return cleaned


In [None]:
# Restore the artifacts saved by an earlier run; the paths are relative to the
# notebook's working directory. Judging by the file names these are the fitted
# classifier and its TF-IDF vectorizer -- TODO confirm against the training notebook.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code,
# so only load artifacts you produced yourself or otherwise trust.
model = load("../best_model.joblib")
vectorizer = load("../tfidf_vectorizer.joblib")


In [None]:
def evaluate_on_cfpb(cfpb_path, trained_model, vectorizer, text_cleaner=None):
    """Predict labels for loan-related CFPB complaint narratives.

    Reads the CSV at ``cfpb_path``, keeps rows that have both a complaint
    narrative and a product whose name contains "loan" (case-insensitive),
    cleans each narrative, vectorizes it and runs ``trained_model`` on the
    result. Progress messages and the predicted-label counts are printed.

    Args:
        cfpb_path: Location of the CFPB complaints CSV (anything
            ``pd.read_csv`` accepts).
        trained_model: Object exposing ``predict(X)``.
        vectorizer: Object exposing ``transform(texts)``.
        text_cleaner: Optional callable applied to every narrative before
            vectorizing. Defaults to the notebook's ``clean_keywords``.

    Returns:
        A DataFrame with the columns ``text`` (original narrative) and
        ``predicted_label``. It is empty when no narrative survives the
        filtering, in which case the model is never called.

    Raises:
        ValueError: If the CSV lacks the 'Consumer complaint narrative' or
            'Product' column.
    """
    narrative_col = 'Consumer complaint narrative'
    product_col = 'Product'
    result_cols = ['text', 'predicted_label']

    if text_cleaner is None:
        # Looked up at call time so the notebook-level cleaner stays the default.
        text_cleaner = clean_keywords

    print("Loading CFPB data...")
    cfpb_df = pd.read_csv(cfpb_path, low_memory=False)

    missing = [col for col in (narrative_col, product_col) if col not in cfpb_df.columns]
    if missing:
        raise ValueError(
            f"Expected columns '{narrative_col}' and '{product_col}' not found "
            f"(missing: {missing})."
        )

    # Filter for loans
    cfpb_df = cfpb_df[[narrative_col, product_col]].dropna()
    cfpb_df = cfpb_df.rename(columns={narrative_col: 'text'})
    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of cfpb_df (avoids SettingWithCopyWarning).
    loan_df = cfpb_df[cfpb_df[product_col].str.contains("loan", case=False)].copy()

    print(f"Total loan-related complaints: {len(loan_df)}")
    if loan_df.empty:
        # Nothing to score; estimators typically reject zero-sample input.
        return pd.DataFrame(columns=result_cols)

    # Clean and vectorize
    loan_df['text_clean'] = loan_df['text'].apply(text_cleaner)
    # Drop narratives that consisted only of removed keywords / whitespace.
    loan_df = loan_df[loan_df['text_clean'].str.strip() != ""].copy()
    if loan_df.empty:
        return pd.DataFrame(columns=result_cols)
    X_vec = vectorizer.transform(loan_df['text_clean'])

    # Predict
    loan_df['predicted_label'] = trained_model.predict(X_vec)

    print("\nPrediction Label Distribution:")
    print(loan_df['predicted_label'].value_counts())

    return loan_df[result_cols]


In [None]:
# Score the raw CFPB export with the model and vectorizer loaded earlier in the notebook.
cfpb_results = evaluate_on_cfpb("../data/raw/cfpb_complaints.csv", model, vectorizer)


In [None]:
# Spot-check the predictions: a random sample of everything, then a sample of
# only the rows labelled 'predatory'.
SAMPLE_SIZE = 5
SAMPLE_SEED = 42  # fixed so the same rows are shown on every run

predatory_results = cfpb_results[cfpb_results['predicted_label'] == 'predatory']

# Only a cell's last expression is rendered, so both samples are passed to
# `display` (the builtin the IPython/Jupyter kernel provides). The sample size
# is capped because DataFrame.sample raises when asked for more rows than exist.
display(cfpb_results.sample(n=min(SAMPLE_SIZE, len(cfpb_results)), random_state=SAMPLE_SEED))
display(predatory_results.sample(n=min(SAMPLE_SIZE, len(predatory_results)), random_state=SAMPLE_SEED))


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of how many complaints fall into each predicted label.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=cfpb_results, x='predicted_label', palette='Set2', ax=ax)
ax.set_title("CFPB Complaint Predictions by Model")
ax.set_xlabel("Predicted Label")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()
