# Loan Text Analysis 

## 1: Setup

In [87]:
# Standard Imports
import glob
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Modeling
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV


# HuggingFace transformers
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch

# Raise pandas' column display width to 300 characters so long text cells
# are less aggressively truncated when frames are displayed.
pd.set_option('display.max_colwidth', 300)

## 2: Load Data

In [88]:
# Reddit CSV
# Keep only the post body and its label, renamed to the ('text', 'label')
# column pair that the other sources in this notebook use.
reddit_df = (
    pd.read_csv("../data/processed/large_sample_reddit_predatory_loan_posts_cleaned.csv")
    .loc[:, ['post_text', 'label']]
    .rename(columns={'post_text': 'text'})
)

In [89]:
# Scraped Fair Loans (Older data)
# Every row of both files is labelled 'non_predatory'; only the text and
# label columns are carried forward.
webpages_df = (
    pd.read_csv("../data/processed/fair_loan_texts.csv")
    .assign(label='non_predatory')
    .loc[:, ['text', 'label']]
)
pdf_df = (
    pd.read_csv("../data/processed/fair_loan_pdfs.csv")
    .assign(label='non_predatory')
    .loc[:, ['text', 'label']]
)

In [90]:
def read_text_files(paths):
    """Read each UTF-8 text file in ``paths`` and return the contents as a list.

    Parameters
    ----------
    paths : iterable of str
        File paths to read, in the order the texts should be returned.

    Returns
    -------
    list of str
        One string per file, in the same order as ``paths``.

    Each file is opened in a ``with`` block so its handle is closed as soon
    as it has been read, instead of being left for the garbage collector.
    """
    texts = []
    for path in paths:
        with open(path, "r", encoding="utf-8") as handle:
            texts.append(handle.read())
    return texts


# glob makes no promise about result order, so the paths are sorted: the row
# order of these frames feeds the seeded sampling done later in the notebook.

# Scraped Fair Loans (New)
fair_webpaths = sorted(glob.glob("../data/raw/web_scrapes/fair_loans/webpages/*.txt"))
fair_texts = read_text_files(fair_webpaths)
new_fair_df = pd.DataFrame({'text': fair_texts, 'label': 'non_predatory'})

# Scraped Predatory Loans (New)
pred_webpaths = sorted(glob.glob("../data/raw/web_scrapes/predatory_loans/webpages/*.txt"))
pred_texts = read_text_files(pred_webpaths)
new_pred_df = pd.DataFrame({'text': pred_texts, 'label': 'predatory'})

In [91]:
## Combine All
# Stack every source frame into one dataset with a fresh 0..n-1 index.
# NOTE(review): `loan_docs_df` is not defined in any visible cell above, so
# this cell raises NameError on Restart & Run All. Restore the cell that
# loads it, or drop it from this list if that source is no longer used.
source_frames = [
    reddit_df,
    loan_docs_df,
    webpages_df,
    pdf_df,
    new_fair_df,
    new_pred_df,
]
full_df = pd.concat(source_frames, ignore_index=True)

In [92]:
# Drop rows with a missing text or label, then rows whose text is empty or
# whitespace-only, and renumber the index.
full_df = (
    full_df
    .dropna(subset=['text', 'label'])
    .loc[lambda frame: frame['text'].str.strip().ne("")]
    .reset_index(drop=True)
)

# Clean known keyword leakage
# Terms that are removed from every text before modeling. Order matters:
# "credit union" is listed before "union" so the whole phrase is removed
# rather than leaving a stray "credit ".
KEYWORDS = [
    "credit union", "union", "payday", "loanmart", "loan mart", "lending", 
    "cashnet", "advance america", "quick cash", "title loan", "speedy cash",
    "tribal loan", "easy finance", "short-term loan", "bad credit loan",
    "instant cash", "get money", "fast loan", "borrow instantly"
]

def clean_keywords(text):
    """Return ``text`` lowercased with every entry of ``KEYWORDS`` removed.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The lowercased text with each keyword deleted, in list order.

    Matching is plain substring replacement, so a keyword is also removed
    when it occurs inside a longer word (e.g. "union" inside "reunion"), and
    the whitespace around a removed keyword is left in place.
    """
    # Lowercase once up front: the result is then lowercase even when
    # KEYWORDS is empty, and the work is not repeated for every keyword.
    cleaned = text.lower()
    for keyword in KEYWORDS:
        cleaned = cleaned.replace(keyword, "")
    return cleaned

# Add the keyword-stripped text and keep one row per distinct cleaned text.
full_df = (
    full_df
    .assign(text_clean=full_df['text'].apply(clean_keywords))
    .drop_duplicates(subset=["text_clean"])
)

# Balance the dataset across label: every label is downsampled (seeded) to
# the row count of the rarest label.
min_size = full_df['label'].value_counts().min()
full_df = (
    full_df
    .groupby("label")
    .sample(n=min_size, random_state=42)
    .reset_index(drop=True)
)


In [94]:
# Save cleaned full_df
# Written twice: once with every column as-is, once with an explicit
# (text, text_clean, label) column order.
full_df.to_csv("../data/processed/full_dataset_final.csv", index=False)
export_columns = ['text', 'text_clean', 'label']
full_df.loc[:, export_columns].to_csv("../data/processed/full_dataset_with_cleaned.csv", index=False)


## 3: ML Modeling

In [None]:
# Train-test split
# 80/20 split of the keyword-cleaned text, stratified on the label and
# seeded so the same rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    full_df['text_clean'], full_df['label'],
    stratify=full_df['label'],
    test_size=0.2,
    random_state=42,
)


In [None]:
# Create a TF-IDF vectorizer
# Unigrams and bigrams, English stop words removed, vocabulary capped at
# 5000 terms.
tfidf = TfidfVectorizer(
    stop_words='english',
    lowercase=True,
    ngram_range=(1, 2),
    max_features=5000,
)

# The vocabulary and IDF weights are learned from the training split only;
# the test split is projected onto that same vocabulary.
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

### Naive Bayes Classifier

In [None]:
# Train Naive Bayes
# Multinomial Naive Bayes on the TF-IDF features (fit returns the estimator).
nb = MultinomialNB().fit(X_train_vec, y_train)
y_pred_nb = nb.predict(X_test_vec)

# Evaluate: per-class precision / recall / F1 on the held-out split.
print(
    "=== Naive Bayes Classification Report ===",
    classification_report(y_test, y_pred_nb),
    sep="\n",
)


In [None]:
### function to plot confusion matrices for models
def plot_confusion(y_true, y_pred, title="Confusion Matrix", labels=("predatory", "non_predatory")):
    """Draw a labelled confusion-matrix heatmap for one model.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels predicted by the model.
    title : str
        Figure title.
    labels : sequence
        Class labels, in the order they should appear on both axes. The
        default matches the string labels used by the classical models;
        pass e.g. ``(1, 0)`` for integer-encoded predictions.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # One list drives the matrix ordering and both tick axes, so the three
    # can never drift apart.
    class_labels = list(labels)
    cm = confusion_matrix(y_true, y_pred, labels=class_labels)

    # Explicit figure/axes rather than the implicit pyplot "current figure".
    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_labels,
        yticklabels=class_labels,
        ax=ax,
    )
    ax.set_title(title)
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    plt.show()

### Logistic Regression

In [None]:
## train log reg
# Logistic regression on the TF-IDF features; max_iter is raised to 1000
# (fit returns the estimator).
lr = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)
y_pred_lr = lr.predict(X_test_vec)

print(
    "Logistic Regression Classification Report:",
    classification_report(y_test, y_pred_lr),
    sep="\n",
)


In [None]:
# confusion matrix for Logistic Regression
plot_confusion(
    y_true=y_test,
    y_pred=y_pred_lr,
    title="Logistic Regression Confusion Matrix",
)

### Random Forest

In [None]:
##Train Random forests
# Baseline Random Forest with default hyperparameters. The seed is fixed so
# the forest, and therefore this report and the comparison table, come out
# the same on every Restart & Run All.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_vec, y_train)
y_pred_rf = rf.predict(X_test_vec)

print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))

In [None]:
# Confusion matrix For Random Forest
plot_confusion(
    y_true=y_test,
    y_pred=y_pred_rf,
    title="Random Forest Confusion Matrix",
)

In [None]:
##light Model tuning for RF

# Hand-picked, more constrained forest to compare against the default one.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,       # Try 10 instead of default (None)
    min_samples_split=5,  # Avoid too much overfitting
    random_state=42
)
rf_model.fit(X_train_vec, y_train)
y_pred_rf_tuned = rf_model.predict(X_test_vec)

# Report the predictions of the model trained in this cell (y_pred_rf_tuned),
# not the baseline forest's y_pred_rf.
print("Tuned Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf_tuned))



### Support Vector Machine (SVM)

### Random Forest Tuning with GridSearchCV

In [None]:

# Set up GridSearchCV for Random Forest
# (GridSearchCV is already imported in the setup cell at the top.)

# Candidate values; the search tries every combination.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

# 3-fold cross-validated search on the training split, using all CPU cores.
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=1)
grid_search.fit(X_train_vec, y_train)

# Winning estimator, and its predictions on the held-out split.
best_rf_model = grid_search.best_estimator_
y_pred_best_rf = best_rf_model.predict(X_test_vec)

# See best parameters
print("Best Random Forest params:", grid_search.best_params_)


In [None]:
###Best Random Forest params: {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}

# Refit a forest with the best parameters recorded above so the result can
# be reproduced without re-running the grid search.
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    min_samples_split=5,
    min_samples_leaf=1,
    random_state=42,
    max_features='log2'
)
rf_model.fit(X_train_vec, y_train)

# Stored under its own name: writing to `y_pred_rf` here would overwrite the
# baseline forest's predictions, and the "Random Forest" row of the model
# comparison table reads `y_pred_rf`.
y_pred_rf_refit = rf_model.predict(X_test_vec)
print("Tuned Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf_refit))

In [None]:
# Baseline linear SVM on the TF-IDF features; max_iter is raised to 5000
# (fit returns the estimator).
svm = LinearSVC(max_iter=5000).fit(X_train_vec, y_train)
y_pred_svm = svm.predict(X_test_vec)
print(
    "=== SVM classification report ===",
    classification_report(y_test, y_pred_svm),
    sep="\n",
)


### Grid Search for SVM

In [None]:
# Set up a grid to search
# Candidate values; the search tries every combination.
param_grid = dict(
    C=[0.1, 1, 10],
    kernel=['linear', 'rbf'],
    gamma=['scale', 'auto'],
)

# 3-fold cross-validated search on the training split, using all CPU cores.
svm = SVC(random_state=42)
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=3, n_jobs=-1, verbose=1)
grid_search.fit(X_train_vec, y_train)

# Winning estimator, and its predictions on the held-out split.
best_svm_model = grid_search.best_estimator_
y_pred_best_svm = best_svm_model.predict(X_test_vec)

# See best parameters
print("Best SVM params:", grid_search.best_params_)

In [None]:
##Best SVM params: {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}
##Grid Search results for SVM:

# Refit an SVC with the best parameters recorded above so the result can be
# reproduced without re-running the grid search.
svm_model = SVC(
    C=10,
    gamma='scale',
    kernel='linear',
    random_state=42
)
svm_model.fit(X_train_vec, y_train)

# Stored under its own name: writing to `y_pred_svm` here would overwrite the
# LinearSVC baseline's predictions, and the "SVM" row of the model comparison
# table reads `y_pred_svm`.
y_pred_svm_refit = svm_model.predict(X_test_vec)
print("Tuned SVM Report:")
print(classification_report(y_test, y_pred_svm_refit))




### Model Comparison

In [None]:
# Define a helper function for multiclass
def evaluate_model(name, y_true, y_pred, results=None):
    """Append one model's weighted precision / recall / F1 to a results list.

    Parameters
    ----------
    name : str
        Label for the model, stored under the 'Model' key.
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels predicted by the model.
    results : list of dict, optional
        List to append to. When omitted, a new list is created on every call.

    Returns
    -------
    list of dict
        ``results`` (the same object that was passed in, or the new list)
        with one more row; scores are rounded to two decimals.
    """
    # A `results=[]` default is created once and shared by every call that
    # omits the argument, so rows would silently pile up across calls and
    # cell re-runs. Build a fresh list per call instead.
    if results is None:
        results = []

    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    results.append({
        'Model': name,
        'Precision': round(precision, 2),
        'Recall': round(recall, 2),
        'F1-Score': round(f1, 2)
    })
    return results

# Initialize results list
results = []

# (table label, test-set predictions) for every model trained above, in the
# order the rows should appear.
model_predictions = [
    ("Logistic Regression", y_pred_lr),
    ("Naive Bayes", y_pred_nb),
    ("Random Forest", y_pred_rf),
    ("Random Forest (tuned)", y_pred_rf_tuned),
    ("Random Forest (grid search tuned)", y_pred_best_rf),
    ("SVM", y_pred_svm),
    ("SVM (grid search tuned)", y_pred_best_svm),
]

# Evaluate each model
for model_name, predictions in model_predictions:
    results = evaluate_model(model_name, y_test, predictions, results)

# Turn into DataFrame
results_df = pd.DataFrame(results)

# Display
results_df


In [None]:
# Persist the fitted logistic-regression model and the fitted TF-IDF
# vectorizer so they can be reloaded together outside this notebook.
# NOTE(review): `lr` is written to "best_model.joblib" unconditionally; nothing
# in this cell checks it against the comparison table above. Confirm it is the
# intended model.
from joblib import dump
dump(lr, "../best_model.joblib")
dump(tfidf, "../tfidf_vectorizer.joblib")


## 4: BERT Modeling

In [None]:
#Tokenization
# Integer ids for the two classes; rows carrying any other label are dropped.
label_mapping = {'predatory': 1, 'non_predatory': 0}
full_df = (
    full_df
    .loc[full_df['label'].isin(label_mapping)]
    .reset_index(drop=True)
)

# Same 80/20 stratified, seeded split as the classical models, but on the
# raw 'text' column rather than 'text_clean'.
X_train, X_test, y_train, y_test = train_test_split(
    full_df['text'], full_df['label'],
    stratify=full_df['label'],
    test_size=0.2,
    random_state=42,
)

# String labels -> integer ids.
y_train_int = y_train.map(label_mapping)
y_test_int = y_test.map(label_mapping)

In [None]:
# Tokenizer for the uncased BERT base checkpoint.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Wrap each split as a Hugging Face Dataset with 'text' and 'label' columns.
train_dataset = Dataset.from_dict(
    dict(text=X_train.tolist(), label=y_train_int.tolist())
)
test_dataset = Dataset.from_dict(
    dict(text=X_test.tolist(), label=y_test_int.tolist())
)

In [None]:
def tokenize(batch):
    """Tokenize one batch of examples for BERT.

    Parameters
    ----------
    batch : dict
        Batch passed by ``Dataset.map(..., batched=True)``; its 'text' entry
        is read.

    Returns
    -------
    The tokenizer's output for the batch.

    Every sequence is truncated and padded to the tokenizer's maximum length.
    With ``padding=True`` each ``map`` batch is padded only to its own
    longest sequence, so rows from different ``map`` batches can end up with
    different lengths. No tokenizer or data collator is passed to ``Trainer``
    in this notebook, so nothing re-pads them when a training batch is
    assembled.
    """
    return tokenizer(batch['text'], padding='max_length', truncation=True)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Expose only the model inputs and the label, as torch tensors.
train_dataset.set_format('torch', columns=['input_ids','attention_mask','label'])
test_dataset.set_format('torch', columns=['input_ids','attention_mask','label'])

# Pretrained BERT with a 2-class classification head.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.config.problem_type = "single_label_classification"

# 4 epochs, batch size 8 per device, learning rate 2e-5, weight decay 0.01.
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir='./logs',
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
)

# The held-out split is registered as the evaluation dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

In [None]:
# The previous cell already built `model` / `trainer` with these exact
# settings and ran `trainer.train()`. Re-creating the model here would throw
# that fine-tuned model away and repeat the whole training run, so this cell
# only runs inference with the trained `trainer`.
preds = trainer.predict(test_dataset)

# Predicted class id = index of the largest logit in each row.
y_pred = torch.argmax(torch.tensor(preds.predictions), dim=1)


In [None]:
# Predict and Evaluate
# `preds` already holds the test-set predictions computed in the previous
# cell, so the inference pass is not repeated here.
# Predicted class id = index of the largest logit in each row.
y_pred = np.argmax(preds.predictions, axis=1)

# Label the report rows with the class names behind the integer ids
# (label_mapping: non_predatory -> 0, predatory -> 1).
print("=== BERT Classification Report ===")
print(classification_report(
    y_test_int,
    y_pred,
    labels=[0, 1],
    target_names=['non_predatory', 'predatory'],
))

In [None]:
# TODO: fine-tune BERT with better hyperparameters