## DistilBERT Model

In [None]:

!pip install --upgrade transformers datasets
!pip install --upgrade accelerate

In [None]:
print("Downloading 1.5GB complaints.csv.zip from source...")
!wget https://files.consumerfinance.gov/ccdb/complaints.csv.zip

# This unzips the file in the Colab environment
print("\nUnzipping file...")
!unzip -o complaints.csv.zip

print("\nFile is ready!")

# Loading 100k sample into pandas
import pandas as pd
filename = 'complaints.csv'
df = pd.read_csv(filename, nrows=100000)

print(f"Successfully loaded {len(df)} rows from the local CSV.")
print("DataFrame is ready. Here are the columns:")
print(df.columns.tolist())

In [None]:
# Importing libraries and packages

In [None]:
import pandas as pd
import torch
from transformers import (
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments
)
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

## Loading and Preprocessing the data

Selecting the feature and label columns, cleaning the data, and building the train/validation datasets.

In [None]:
# Names of the CSV columns used downstream:
#   label_column   -> the class to predict
#   feature_column -> the free-text input
label_column = "Product"
feature_column = "Consumer complaint narrative"

In [None]:
# Dropping rows where the narrative is missing.
# Rows with a missing label are dropped as well: a NaN label cannot be sorted
# together with the string labels when the label dictionaries are built.
# Re-binding `df` (instead of inplace=True) keeps the cell safe to re-run.
df = df.dropna(subset=[feature_column, label_column])
print(f"Rows remaining after dropping missing narratives: {len(df)}")


In [None]:

# Collapse the variant names of the credit-reporting product into one label
print("Cleaning and merging duplicate categories...")
clean_name = 'Credit Reporting'
credit_categories = [
    'Credit reporting, credit repair services, or other personal consumer reports',
    'Credit reporting or other personal consumer reports',
    'Credit reporting'
]
# Every variant maps to the same clean name
category_map = dict.fromkeys(credit_categories, clean_name)
df[label_column] = df[label_column].replace(category_map)
print("Category merging complete.")


In [None]:

# Split the frame into model input (X) and target (y)
X, y = df[feature_column], df[label_column]

In [None]:
# Remove rare classes.
# The stratified train/validation split used later needs at least two samples
# of every class, so classes seen fewer than MIN_SAMPLES_PER_CLASS times are dropped.
MIN_SAMPLES_PER_CLASS = 2

print("Removing rare classes...")
class_counts = y.value_counts()
rare_classes = class_counts[class_counts < MIN_SAMPLES_PER_CLASS].index.tolist()

# When there are no rare classes the mask is all True and nothing is removed,
# so no separate branch is needed for that case.
keep_indices = ~y.isin(rare_classes)
X_filtered = X[keep_indices]
y_filtered = y[keep_indices]
print(f"Filtered data size: {len(y_filtered)}")

In [None]:
# Build the lookup tables between label names and integer ids.
# Ids follow the alphabetical order of the label names.
print("Creating label dictionaries...")
labels = sorted(y_filtered.unique())
num_labels = len(labels)
id_to_label = dict(enumerate(labels))
label_to_id = {name: idx for idx, name in id_to_label.items()}

In [None]:
# Assemble the modelling frame: raw text plus integer-encoded label
encoded_labels = y_filtered.map(label_to_id)  # string label -> int id
df_final = pd.DataFrame({'text': X_filtered, 'label': encoded_labels})
print(f"Data is ready. Number of labels: {num_labels}")

In [None]:
# Hold out 20% of the rows for validation, keeping class proportions equal
# in both parts (stratify) and fixing the seed so the split is repeatable
split_options = dict(
    test_size=0.2,
    stratify=df_final['label'],
    random_state=42,
)
train_df, val_df = train_test_split(df_final, **split_options)

In [None]:
# Wrap both pandas frames as Hugging Face Dataset objects
train_dataset, val_dataset = map(Dataset.from_pandas, (train_df, val_df))
print(f"Training set size: {len(train_dataset)}, Validation set size: {len(val_dataset)}")

In [None]:
# Tokenizing the data
# `model_name` is the checkpoint identifier; the same name is used again
# further down when the classification model is loaded.
print("Loading tokenizer...")
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize the ``text`` entry of ``examples`` with the module-level tokenizer.

    Every sequence is truncated and padded to a fixed length of 512 tokens.
    Returns whatever the tokenizer call returns.
    """
    tokenizer_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 512,
    }
    return tokenizer(examples['text'], **tokenizer_options)

In [None]:
# Run tokenize_function over both splits. With batched=True the function
# receives batches of rows per call instead of one row at a time.
print("Tokenizing training data...")
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
print("Tokenizing validation data...")
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)

In [None]:
# Cleaning up datasets for model
def _prepare_for_trainer(dataset):
    """Drop non-model columns, rename the target to ``labels`` and use torch format.

    Only columns that are actually present are removed or renamed, so this
    cell can be re-run on an already-prepared dataset and does not fail when
    the ``__index_level_0__`` column is absent.
    """
    unused = [
        name for name in ('text', '__index_level_0__')
        if name in dataset.column_names
    ]
    if unused:
        dataset = dataset.remove_columns(unused)
    if 'label' in dataset.column_names:
        dataset = dataset.rename_column("label", "labels")
    dataset.set_format('torch')
    return dataset


tokenized_train_dataset = _prepare_for_trainer(tokenized_train_dataset)
tokenized_val_dataset = _prepare_for_trainer(tokenized_val_dataset)

In [None]:
# Load the pre-trained checkpoint with a classification head sized for our labels
print("Loading pre-trained DistilBERT model...")
label_config = {
    'num_labels': num_labels,
    'id2label': id_to_label,
    'label2id': label_to_id,
}
model = DistilBertForSequenceClassification.from_pretrained(model_name, **label_config)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)
print(f"Model moved to: {device}")

In [None]:
# Defining Metrics Function
import numpy as np
from sklearn.metrics import f1_score, accuracy_score
def compute_metrics(eval_pred):
    """Return accuracy and macro-averaged F1 for one evaluation pass.

    ``eval_pred`` unpacks into (predictions, labels). The predicted class of
    each row is the argmax of ``predictions`` along axis 1.
    """
    scores, true_ids = eval_pred
    predicted_ids = np.argmax(scores, axis=1)

    return {
        'accuracy': accuracy_score(true_ids, predicted_ids),
        'macro_f1': f1_score(true_ids, predicted_ids, average='macro'),
    }


In [None]:
# Training configuration and Trainer construction
training_args = TrainingArguments(
    # where checkpoints and logs go, and how often to log
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=100,
    report_to="none",
    # optimisation schedule
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    # evaluate and checkpoint once per epoch; reload the best macro-F1 checkpoint at the end
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
)

trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
)

In [None]:
# START TRAINING ---
# Runs fine-tuning with the settings in `training_args`; evaluation and
# checkpoint saving are configured there to happen once per epoch.
print("\n*** STARTING MODEL FINE-TUNING ***")
trainer.train()
print("\n*** TRAINING COMPLETE ***")


In [None]:
# Final evaluation on the validation set
print("\nRunning final evaluation on the validation set:")
eval_results = trainer.evaluate()

print("\n--- FINAL MODEL RESULTS ---")
# (display name, key in eval_results) pairs, printed in this order
for metric_title, metric_key in (
    ("Accuracy", "eval_accuracy"),
    ("Macro F1-Score", "eval_macro_f1"),
):
    print(f"{metric_title}: {eval_results[metric_key]:.4f}")

Generating the confusion matrix for the fine-tuned DistilBERT model on the validation set.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Predict on the validation split and convert scores to predicted class ids
pred_output = trainer.predict(tokenized_val_dataset)
y_true = pred_output.label_ids
y_preds = np.argmax(pred_output.predictions, axis=1)

# Rows/columns of the matrix follow the order of `labels` (id 0, 1, 2, ...)
class_names = labels
cm = confusion_matrix(y_true, y_preds, labels=range(len(class_names)))

# Draw on an explicit Axes so the title/labels are attached to this figure
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    cm,
    ax=ax,
    cmap='Blues',
    annot=True,
    fmt='d',     # cell counts are integers
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_title('DistilBERT Model Confusion Matrix')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
fig.tight_layout()

fig.savefig('distilbert_confusion_matrix.png')
print("Saved 'distilbert_confusion_matrix.png'")
plt.show()