In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertTokenizer, AdamW, BertModel, get_linear_schedule_with_warmup
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score
import pandas as pd

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that files
# stored on Drive can be read by path from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Load the Dataset

In [None]:
# Load only the two columns this notebook uses. low_memory=False makes pandas
# infer each column's dtype from the whole file instead of chunk by chunk,
# which is the documented remedy for the mixed-type DtypeWarning.
DATA_PATH = "/content/drive/MyDrive/AI/consumer_complaints.csv"
df = pd.read_csv(
    DATA_PATH,
    usecols=["consumer_complaint_narrative", "product"],
    low_memory=False,
)

# Number of complaints that have NO narrative text (missing values).
df['consumer_complaint_narrative'].isna().sum()


  df = pd.read_csv("/content/drive/MyDrive/AI/consumer_complaints.csv")


489151

In [None]:
# Keep just the complaint text and its product label, and discard every row
# in which either of the two is missing.
df = df.loc[:, ["consumer_complaint_narrative", "product"]].dropna(how="any")

In [None]:
# How many rows carry a narrative (non-missing values) after the filtering above.
df['consumer_complaint_narrative'].notna().sum()

66806

In [None]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,consumer_complaint_narrative,product
190126,XXXX has claimed I owe them {$27.00} for XXXX ...,Debt collection
190135,Due to inconsistencies in the amount owed that...,Consumer Loan
190155,In XX/XX/XXXX my wages that I earned at my job...,Mortgage
190207,I have an open and current mortgage with Chase...,Mortgage
190208,XXXX was submitted XX/XX/XXXX. At the time I s...,Mortgage


In [None]:
# (rows, columns) of the frame at this point.
df.shape


(66806, 2)

In [None]:

# Work on a random subset of 5,000 complaints. random_state is fixed so the
# same rows are drawn on every run. The last line previews the subset.
df = df.sample(n=5000, random_state=42)
df.head()

Unnamed: 0,consumer_complaint_narrative,product
516673,I have been battling with portfolio recovery a...,Debt collection
516092,In the fall of XXXX I applied for a mortgage m...,Mortgage
236006,i tried to call this number and i can not call...,Credit card
316478,Trans union is showing a old debt against me o...,Credit reporting
241895,The letter was mailed out on XXXX XXXX. The Ex...,Credit reporting


In [None]:
# Number of complaints per product category in the current frame.
df['product'].value_counts()

Unnamed: 0_level_0,count
product,Unnamed: 1_level_1
Debt collection,1314
Mortgage,1080
Credit reporting,935
Credit card,604
Bank account or service,439
Consumer Loan,271
Student loan,168
Prepaid card,73
Payday loan,60
Money transfers,46


In [None]:
# Exclude complaints filed under the 'Other financial service' product.
df = df.loc[df['product'].ne('Other financial service')]

In [None]:
# Replace each product name with an integer class id. The fitted encoder is
# kept in `label_encoder` so the ids can be mapped back to names later.
label_encoder = LabelEncoder().fit(df['product'])
df['product'] = label_encoder.transform(df['product'])


In [None]:
import re

def preprocess_text(text):
    """Normalise a complaint narrative to lowercase letters and single spaces.

    Steps, in order: lowercase the text, delete every character that is
    neither an ASCII letter nor whitespace (digits and punctuation are removed,
    not replaced by a space), then squeeze each whitespace run to one space
    and trim both ends.

    Args:
        text: The raw narrative string.

    Returns:
        The normalised string.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    return re.sub(r'\s+', ' ', letters_only).strip()

# Run the cleaning function over every narrative, element by element.
df['consumer_complaint_narrative'] = df['consumer_complaint_narrative'].map(preprocess_text)


In [None]:
# Spot-check the narrative at row position 5 after the cleaning step.
df.iloc[5]['consumer_complaint_narrative']

'we have been a customer of capital one for many years my xxxx mother has been responsible for paying certain bills to allow her to remain financially independent however her memory has been an issue of late and i noticed that the capital one bill was not paid timely as soon as i saw this i paid the amount due of xxxx xxxxxxxx i reviewed to see why the payment was so high as we do not use the card much and the minimum payments are normally xxxx due to the missed payment for xxxx and xxxx capital one increased our interest rate from to effectively increasing the minimum payment from xxxx to xxxx more than doubled the minimum payment i called capital one and spoke to several people who told me that per the terms and conditions they were unable to lower the rate on the card for xxxx months however it will be impossible for us to pay the minimum payment of xxxx essentially we will go delinquent due to this drastic rate increase up until this small issue we have always paid on time the fact

In [None]:
def preprocess_text(text):
    """Normalise a narrative and collapse redaction markers to a single 'x'.

    This definition replaces the earlier ``preprocess_text`` under the same
    name, so it performs the complete pipeline rather than only the last step.
    Whichever cell applies it, and however often, the column ends up in the
    same state. On text that is already normalised the first three steps are
    no-ops, so the result is the same as collapsing the markers alone.

    Steps, in order:
      1. lowercase;
      2. delete every character that is neither an ASCII letter nor whitespace;
      3. squeeze whitespace runs to one space and trim the ends;
      4. replace every run of two or more 'x' characters with a single 'x'.

    Args:
        text: Raw or already-normalised narrative string.

    Returns:
        The normalised string with redaction markers collapsed.
    """
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove more than one x
    text = re.sub(r'x{2,}', 'x', text)
    return text


In [None]:
# Re-run the (redefined) cleaning function over every narrative.
df['consumer_complaint_narrative'] = df['consumer_complaint_narrative'].map(preprocess_text)


In [None]:
# Spot-check the same narrative (row position 5) after the second cleaning pass.
df.iloc[5]['consumer_complaint_narrative']

'we have been a customer of capital one for many years my x mother has been responsible for paying certain bills to allow her to remain financially independent however her memory has been an issue of late and i noticed that the capital one bill was not paid timely as soon as i saw this i paid the amount due of x x i reviewed to see why the payment was so high as we do not use the card much and the minimum payments are normally x due to the missed payment for x and x capital one increased our interest rate from to effectively increasing the minimum payment from x to x more than doubled the minimum payment i called capital one and spoke to several people who told me that per the terms and conditions they were unable to lower the rate on the card for x months however it will be impossible for us to pay the minimum payment of x essentially we will go delinquent due to this drastic rate increase up until this small issue we have always paid on time the fact that my mothers memory contribute

In [None]:

!pip install transformers



In [None]:
# Pull the cleaned narratives and their encoded labels out of the frame as arrays.
texts = df['consumer_complaint_narrative'].to_numpy()
labels = df['product'].to_numpy()

# Sanity check: the first narrative and the first ten label ids.
print(texts[0])
print(labels[:10])

i have been battling with portfolio recovery and foster garbus garbus for over a year regarding a debt that is not mine i continue receiving letters from foster garbus x garbus regarding same debt although i ve submitted documents to foster garbus garbus proving that i do not owe said debt these guys went as far as having my x x account frozen last year and i thought the issue was resolved once i submitted my documents
[4 6 2 3 3 2 3 4 2 3]


In [None]:
# Split the data 70:30 into train and test sets.

from sklearn.model_selection import train_test_split

# stratify=labels keeps each class's share the same in both splits. The classes
# are heavily imbalanced, so an unstratified split can leave a rare class
# under-represented in the test set and distort the evaluation metrics.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.3,
    random_state=42,
    stratify=labels,
)

# Initialize the BERT Tokenizer

In [None]:
# The 'uncased' checkpoint lower-cases text before splitting it into word pieces.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 256

# Tokenize both splits with identical settings:
#   truncation=True -> tokens beyond max_length are dropped
#   padding=True    -> shorter sequences are padded up to the longest sequence
#                      in the same call (at most max_length because of truncation)
train_encodings = tokenizer(
    train_texts.tolist(),
    max_length=max_length,
    padding=True,
    truncation=True,
)
test_encodings = tokenizer(
    test_texts.tolist(),
    max_length=max_length,
    padding=True,
    truncation=True,
)




In [None]:
# List the fields the tokenizer returned for the training split.
print(train_encodings.keys())


dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


In [None]:
# Inspect one example of each encoding field.
print(train_encodings['input_ids'][1])
# The attention mask is 1 for real tokens and 0 for padding positions, so it is
# all ones only when the example fills the whole padded length.
print(train_encodings['attention_mask'][20])
print(train_encodings['token_type_ids'][0])


[101, 2023, 2003, 2013, 18178, 6299, 2239, 9425, 1045, 2079, 23961, 12533, 2068, 2505, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [None]:
# Map a single vocabulary id back to its token text.
decoded_token = tokenizer.decode([1999])
print(decoded_token)

# 101 is [CLS] and 102 is [SEP]
# 100 is [UNK], used for tokens that are not present in the vocabulary
# 105 is an unused slot reserved for new vocabulary

in


In [None]:
# Gather every distinct token id that occurs anywhere in the training input_ids.
unique_tokens = {
    token_id
    for sequence in train_encodings['input_ids']
    for token_id in sequence
}

# Size of that set = number of different vocabulary entries seen in training.
num_unique_tokens = len(unique_tokens)
print(num_unique_tokens)

# The encoding holds three fields: input_ids, token_type_ids and attention_mask.
train_encodings_unique = set(train_encodings)
print(len(train_encodings))

8903
3


# Define a PyTorch Dataset wrapper

In [None]:
class MyDataset(Dataset):
    """Map-style dataset that pairs tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to a per-example indexable
            collection (anything exposing ``.items()``).
        labels: Indexable collection of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Build the example at ``idx`` as a dict of tensors.

        Every encoding field is converted to a tensor under its own key, and
        the label is added under the key ``'labels'``.
        """
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item


# Create a Custom Dataset and DataLoader

In [None]:
# Wrap each split's encodings and labels in a dataset object.

train_dataset = MyDataset(encodings=train_encodings, labels=train_labels)
test_dataset = MyDataset(encodings=test_encodings, labels=test_labels)



In [None]:
# Inspect the first training example as produced by the dataset's __getitem__.
train_dataset[0]


{'input_ids': tensor([  101, 12087, 25022,  3775, 21270,  9361,  4070,  2326, 16039,  2026,
          5356,  6781,  2006,  1060,  1060,  1045,  2330,  1037, 25022,  3775,
          9299,  1055, 25022,  3775, 21270,  9361,  2000,  7796,  5356,  6781,
          2007,  4712,  3642,  1060,  2083,  1060,  1996,  4712,  5942,  2330,
          1037,  2047,  7325,  9361,  4070,  1999,  1996, 25022,  3775, 21270,
          4070,  7427,  2011,  1060,  2191,  2019,  7792, 12816,  1997,  2030,
          2062,  1999, 25597, 10085, 25090,  9299,  1054,  5029,  2046,  2037,
          2047,  9361,  2030,  2047,  2030,  4493, 10995,  4606,  4070,  2306,
          2420,  2044,  4070,  3098,  5441,  1037,  6263,  1997,  2005,  1996,
          2279,  2420,  2000,  4374,  2037,  6781,  1045,  2363,  1996,  6160,
          7427,  2006,  1060,  1060,  1998,  5653,  2098,  1996,  2772,  8085,
          4003,  2067,  2000,  2068,  2006,  1060,  1060,  2006,  1060,  1060,
          1045,  2081,  1037, 12816,  1

In [None]:
# Batch both splits in groups of 8. Only the training loader shuffles;
# the test loader keeps the original order.
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=8)
test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=8)


# Training Preparation

In [None]:
epochs = 3
# Number of distinct encoded product labels, used as the classifier's output size.
num_classes = df['product'].nunique()
print(num_classes)

10


In [None]:
#create a bert model

class BERTClassifier(nn.Module):
    """Pretrained BERT encoder with a dropout + linear classification head.

    The hidden state at sequence position 0 (the [CLS] token) is used as the
    summary of the whole input and is projected to one logit per class.

    Args:
        num_classes: Number of target classes (size of the logits' last axis).
        pretrained_name: Checkpoint passed to ``BertModel.from_pretrained``.
        dropout: Dropout probability applied to the [CLS] embedding.
    """

    def __init__(self, num_classes, pretrained_name="bert-base-uncased", dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_name)
        self.dropout = nn.Dropout(dropout)
        # Size the head from the encoder's own config instead of hard-coding 768,
        # so a checkpoint with a different hidden width works without edits.
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return class logits of shape (batch_size, num_classes).

        Args:
            input_ids: Token ids, forwarded to the encoder unchanged.
            attention_mask: Attention mask, forwarded to the encoder unchanged.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # hidden state of the first ([CLS]) token
        x = self.dropout(cls_embedding)
        logits = self.fc(x)  # shape: (batch_size, num_classes)
        return logits


In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


# Training Process

In [None]:
# Multi-class objective: cross-entropy computed on the classifier's raw logits.
criterion = nn.CrossEntropyLoss()
model = BERTClassifier(num_classes).to(device)
# Use PyTorch's AdamW. The AdamW class shipped with transformers is deprecated
# in favour of torch.optim.AdamW. eps and weight_decay are set explicitly to the
# defaults of the transformers implementation so the update rule stays the same.
optimizer = optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
#optimizer: this is the PyTorch optimizer that will be used to update the model's parameters.



In [None]:
# Fine-tune the classifier for `epochs` passes over the training data.
model.train()  # training mode: dropout is active
for epoch in range(epochs):
    total_loss = 0
    for batch in train_dataloader:  # each batch is a dict with input_ids, token_type_ids, attention_mask and labels
        optimizer.zero_grad()  # clear gradients left over from the previous step

        # Move the batch to the device. The labels get their own name so the
        # notebook-level `labels` array (used to build the split) is not overwritten.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        batch_labels = batch['labels'].to(device)

        # Forward pass: the model returns logits only; the loss is computed separately.
        outputs = model(input_ids, attention_mask=attention_mask)

        loss = criterion(outputs, batch_labels)
        total_loss += loss.item()

        # Backward pass and optimization step.
        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{epochs} - Average Training Loss: {avg_train_loss:.4f}")

Epoch 1/3 - Average Training Loss: 1.1544
Epoch 2/3 - Average Training Loss: 0.5719
Epoch 3/3 - Average Training Loss: 0.3808


# Evaluation

In [None]:
# Evaluation: collect per-class probabilities on the test split, then score them.
model.eval()  # evaluation mode: dropout is disabled
predictions = []
true_labels = []

with torch.no_grad():  # inference only, no gradients needed
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        # Get probabilities instead of predicted class labels
        probs = torch.softmax(outputs, dim=1)

        predictions.extend(probs.cpu().numpy())
        # The labels are only needed on the host for scoring, so they are read
        # straight from the batch. This avoids a device round-trip and leaves
        # the notebook-level `labels` array untouched.
        true_labels.extend(batch['labels'].numpy())

# argmax turns each probability row into the predicted class id
predicted_classes = [p.argmax() for p in predictions]
accuracy = accuracy_score(true_labels, predicted_classes)
#'ovr' is for multi-class AUC calculation
auc = roc_auc_score(true_labels, predictions, multi_class='ovr')

print(f"Accuracy: {accuracy:.4f}")
print(f"AUC: {auc:.4f}")

Accuracy: 0.8136
AUC: 0.9621
