In [1]:
import torch

# Environment check, one value per line:
#   1. PyTorch build string
#   2. CUDA version (if installed)
#   3. cuDNN version (if available)
print(
    torch.__version__,
    torch.version.cuda,
    torch.backends.cudnn.version(),
    sep="\n",
)

2.6.0+cu118
11.8
90100


In [2]:
import pandas as pd 
import numpy as np 

In [3]:
# Read complaints_processed.csv (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="complaints_processed.csv")

In [4]:
# (number of rows, number of columns) of the loaded frame
df.shape

(162421, 3)

In [5]:
# Preview the first rows of the frame as loaded from disk.
df.head()

Unnamed: 0.1,Unnamed: 0,product,narrative
0,0,credit_card,purchase order day shipping amount receive pro...
1,1,credit_card,forwarded message date tue subject please inve...
2,2,retail_banking,forwarded message cc sent friday pdt subject f...
3,3,credit_reporting,payment history missing credit report speciali...
4,4,credit_reporting,payment history missing credit report made mis...


In [6]:
# Drop the leftover 'Unnamed: 0' column.
# errors='ignore' keeps this cell idempotent: running it again once the column
# is already gone is a no-op instead of a KeyError.
df.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

In [7]:
# Count missing values per column (isna is the canonical name for isnull).
df.isna().sum()

product       0
narrative    10
dtype: int64

In [8]:
# Remove every row that has a missing value in any column.
df.dropna(axis=0, how="any", inplace=True)

In [9]:
# Re-count missing values to confirm none remain.
df.isna().sum()

product      0
narrative    0
dtype: int64

In [10]:
# Preview the frame after the column drop and null-row removal.
df.head()

Unnamed: 0,product,narrative
0,credit_card,purchase order day shipping amount receive pro...
1,credit_card,forwarded message date tue subject please inve...
2,retail_banking,forwarded message cc sent friday pdt subject f...
3,credit_reporting,payment history missing credit report speciali...
4,credit_reporting,payment history missing credit report made mis...


In [11]:
# Distinct product categories; these become the classification targets.
df['product'].unique()

array(['credit_card', 'retail_banking', 'credit_reporting',
       'mortgages_and_loans', 'debt_collection'], dtype=object)

In [12]:
# label encoding: product name -> integer class id
dic={'credit_card':0, 'retail_banking':1, 'credit_reporting':2,
       'mortgages_and_loans':3, 'debt_collection':4}

# Series.map(dict) turns every value that is not a key into NaN. Running this
# cell a second time (the column then already holds 0..4), or meeting a
# category missing from `dic`, would therefore silently blank the labels.
# Instead: leave already-encoded values untouched and fail loudly on anything
# that is neither a known name nor a known id.
encoded = df['product'].map(lambda label: dic.get(label, label))
unknown = set(encoded.unique()) - set(dic.values())
if unknown:
    raise ValueError(f"Unmapped product categories: {sorted(map(str, unknown))}")
df['product'] = encoded.astype('int64')

In [13]:
# Preview with `product` now holding integer class ids.
df.head()

Unnamed: 0,product,narrative
0,0,purchase order day shipping amount receive pro...
1,0,forwarded message date tue subject please inve...
2,1,forwarded message cc sent friday pdt subject f...
3,2,payment history missing credit report speciali...
4,2,payment history missing credit report made mis...


#### Import the pre-trained BERT model and tokenizer

In [14]:
from transformers import BertTokenizer,BertModel

# Both objects come from the same checkpoint so the vocabulary matches the
# encoder weights. The two loads are independent of each other.
model = BertModel.from_pretrained("google-bert/bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")


  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# Pick the GPU when CUDA is usable, otherwise stay on the CPU, and move the
# model there.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)

In [16]:
# Renumber the rows 0..n-1 (the old index labels are discarded, not kept as a
# column) so integer lookups line up with row positions again.
df.reset_index(inplace=True, drop=True)

In [17]:
# define class for tokenizer 
from torch.utils.data import DataLoader,Dataset
# Dataset class 
class NarrativeTokenizer(Dataset):
    """Map-style dataset that tokenizes one narrative per item.

    Args:
        narrative: Indexable collection of texts (a pandas Series or a plain
            sequence such as a list).
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt",
            padding="max_length", truncation=True, max_length=max_length)``.
        max_length: Length passed to the tokenizer for padding/truncation.
    """

    def __init__(self,narrative,tokenizer,max_length=128):
        self.narrative=narrative
        self.tokenizer=tokenizer
        self.max_length=max_length

    def __len__(self):
        """Number of narratives in the dataset."""
        return len(self.narrative)

    def __getitem__(self,idx):
        """Return the tokenizer output for the narrative at 0-based position ``idx``."""
        # A DataLoader asks for positions 0..len-1. On a pandas Series,
        # ``series[idx]`` is a *label* lookup, so it raises KeyError (or
        # returns the wrong row) whenever the index is not exactly 0..n-1,
        # e.g. after dropna()/filtering without reset_index(). Use positional
        # access when the container provides it.
        positional = getattr(self.narrative, "iloc", None)
        text = positional[idx] if positional is not None else self.narrative[idx]
        return self.tokenizer(text, return_tensors="pt",
                              padding="max_length", truncation=True, max_length=self.max_length)
    

# Wrap the narrative column in the tokenizing dataset and serve it in
# batches of 16, in row order (no shuffling).
dataset = NarrativeTokenizer(narrative=df['narrative'], tokenizer=tokenizer)

dataloader = DataLoader(dataset=dataset, batch_size=16)

In [18]:
# Give empty narratives a placeholder so the tokenizer never receives ''.
# The result is assigned back rather than using replace(..., inplace=True) on
# df['narrative']: an in-place edit of a column selection is chained
# assignment, which recent pandas warns about and which does not modify df
# under Copy-on-Write.
df['narrative'] = df['narrative'].replace('', 'default text')
# or
# Drop any rows that are still empty; reset the index so positions and labels
# stay aligned for the steps that follow.
df = df[df['narrative'] != ''].reset_index(drop=True)

# `dataset` was created from the column as it was before this cleanup.
# Rebuild it and the loader so the embedding loop processes exactly the rows
# now in df (otherwise the embeddings could not be assigned back to df).
dataset = NarrativeTokenizer(df['narrative'], tokenizer)
dataloader = DataLoader(dataset, batch_size=16)

In [19]:
import numpy as np

# Put the encoder in evaluation mode before extracting features.
model.eval()

# One vector per narrative, collected in dataloader order.
embeddings = []

# Pure inference: gradient tracking is switched off for the whole loop.
with torch.no_grad():
    for batch in dataloader:
        # squeeze(1) removes the extra axis at position 1 from each tokenizer
        # field, then the tensor is moved to the model's device.
        batch = {key: val.squeeze(1).to(device) for key, val in batch.items()}

        outputs = model(**batch)

        # Token position 0 is [CLS]; keep its hidden state as the embedding.
        cls_vectors = outputs.last_hidden_state[:, 0, :]
        batch_embeddings = cls_vectors.cpu().numpy()  # back to CPU / NumPy

        # extend() iterates the 2-D array row by row -> one entry per sample
        embeddings.extend(batch_embeddings)

# Store each sample's vector next to its row in the DataFrame.
df['embeddings'] = embeddings


In [20]:
# Preview with the new `embeddings` column attached.
df.head()

Unnamed: 0,product,narrative,embeddings
0,0,purchase order day shipping amount receive pro...,"[-0.3200423, 0.08734484, 0.34736332, -0.168016..."
1,0,forwarded message date tue subject please inve...,"[-0.6380642, -0.28654295, 0.297329, -0.4522261..."
2,1,forwarded message cc sent friday pdt subject f...,"[-0.22500874, 0.012715315, 0.32256964, -0.0310..."
3,2,payment history missing credit report speciali...,"[-0.4295894, 0.0020560075, 0.29063782, 0.02889..."
4,2,payment history missing credit report made mis...,"[-0.40051457, -0.04720374, 0.3525641, -0.13370..."


In [21]:
# Length of the embedding vector stored in the row labelled 0.
len(df.loc[0, 'embeddings'])

768

In [22]:
# The raw text is no longer needed once the embeddings exist.
df.drop("narrative", axis=1, inplace=True)

In [23]:
# Preview after removing the `narrative` column.
df.head()

Unnamed: 0,product,embeddings
0,0,"[-0.3200423, 0.08734484, 0.34736332, -0.168016..."
1,0,"[-0.6380642, -0.28654295, 0.297329, -0.4522261..."
2,1,"[-0.22500874, 0.012715315, 0.32256964, -0.0310..."
3,2,"[-0.4295894, 0.0020560075, 0.29063782, 0.02889..."
4,2,"[-0.40051457, -0.04720374, 0.3525641, -0.13370..."


In [24]:
# Features: the per-row embedding vectors combined into one float32 tensor.
X = torch.tensor(np.asarray(df['embeddings'].to_list()), dtype=torch.float32)
# Targets: the integer class ids as int64 (torch.long).
y = torch.tensor(df['product'].to_numpy(), dtype=torch.long)

In [25]:
# Check the Python type of the feature container.
type(X)

torch.Tensor

#### Train/test split

In [26]:
from sklearn.model_selection import  train_test_split
# Hold out 20% of the samples for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

In [27]:
from torch.utils.data import TensorDataset

# Pair each feature row with its label.
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

# Batches of 32. Only the training data is shuffled; the test order is fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

In [28]:
from torch import nn 

class FeedForwordNN(nn.Module):
    """Three-layer fully connected classifier.

    Layout: input_dim -> 128 -> 64 -> output_dim, with ReLU after the first
    two linear layers. The last layer returns raw scores (no softmax).
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, output_dim)
        self.relu = nn.ReLU()

    def forward(self, X):
        """Map a batch of feature vectors to one score per class."""
        hidden = self.relu(self.fc1(X))
        hidden = self.relu(self.fc2(hidden))
        return self.fc3(hidden)
    
# Size the network from the data: one input per feature column of X_train and
# one output per distinct label value in y.
input_dim = X_train.size(1)
output_dim = torch.unique(y).numel()

# From here on `model` refers to the classifier.
model = FeedForwordNN(input_dim=input_dim, output_dim=output_dim)

In [29]:
from torchsummary import summary

# Use the notebook-wide `device` instead of a hard-coded "cuda", so this cell
# also runs on CPU-only machines and matches the other cells.
# torchsummary expects the device as a plain string, hence device.type.
model = model.to(device)
summary(model, input_size=(input_dim,), device=device.type)


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                  [-1, 128]          98,432
              ReLU-2                  [-1, 128]               0
            Linear-3                   [-1, 64]           8,256
              ReLU-4                   [-1, 64]               0
            Linear-5                    [-1, 5]             325
Total params: 107,013
Trainable params: 107,013
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.41
Estimated Total Size (MB): 0.41
----------------------------------------------------------------


In [30]:
import torch.optim as optim

# Loss: cross-entropy between the network's raw class scores and the labels.
criterion = nn.CrossEntropyLoss()

# Optimizer: Adam over every model parameter, learning rate 1e-3.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)


In [32]:
# Sanity check on the training labels: dtype, smallest id, largest id.
print(y_train.dtype, int(y_train.min()), int(y_train.max()))


torch.int64 0 4


In [33]:
# Train on the GPU when CUDA is usable, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Number of full passes over the training data
epochs = 50

for epoch in range(epochs):
    model.train()  # training mode
    running_loss = 0.0  # sum of per-batch losses for this epoch

    for batch in train_loader:
        inputs, labels = batch

        # Data must sit on the same device as the model.
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Standard step: clear old gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Report the mean batch loss for the epoch.
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {running_loss/len(train_loader)}")


Epoch [1/50], Loss: 0.6625241410501835
Epoch [2/50], Loss: 0.570153187491629
Epoch [3/50], Loss: 0.5420786424007242
Epoch [4/50], Loss: 0.5215787058432417
Epoch [5/50], Loss: 0.5094563636403164
Epoch [6/50], Loss: 0.49668914810046866
Epoch [7/50], Loss: 0.4855131231187835
Epoch [8/50], Loss: 0.47691531626682393
Epoch [9/50], Loss: 0.4680745278917871
Epoch [10/50], Loss: 0.46042731345228244
Epoch [11/50], Loss: 0.45386602885505595
Epoch [12/50], Loss: 0.4473682198103823
Epoch [13/50], Loss: 0.43935049860216485
Epoch [14/50], Loss: 0.4353185132699951
Epoch [15/50], Loss: 0.4286865282623059
Epoch [16/50], Loss: 0.4235330856813641
Epoch [17/50], Loss: 0.4185958916155818
Epoch [18/50], Loss: 0.413280573709093
Epoch [19/50], Loss: 0.40855085812885167
Epoch [20/50], Loss: 0.4032073310243617
Epoch [21/50], Loss: 0.4009279520258884
Epoch [22/50], Loss: 0.3956627521963872
Epoch [23/50], Loss: 0.39121203493474593
Epoch [24/50], Loss: 0.38817927445707223
Epoch [25/50], Loss: 0.38360921280837884
Ep

In [34]:
model.eval()  # evaluation mode

# Running counts of correct predictions and samples seen
correct, total = 0, 0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        outputs = model(inputs)
        # Predicted class = index of the highest score in each row.
        predicted = outputs.argmax(dim=1)

        correct += (predicted == labels).sum().item()
        total += labels.shape[0]

accuracy = correct / total * 100
print(f"Test Accuracy: {accuracy:.2f}%")


Test Accuracy: 82.01%


In [35]:
# Save only the learned parameters (the state_dict), not the whole module.
torch.save(obj=model.state_dict(), f="AIComplaintHub.pth")

In [36]:
# `model` now names the classifier, so the BERT encoder gets its own handle
# for inference. Tokenizer and encoder come from the same checkpoint.
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
bert_model = BertModel.from_pretrained("google-bert/bert-base-uncased")

In [38]:
# Load the trained classifier weights.
# map_location=device remaps tensors that were saved from a CUDA model onto
# the device used in this session, so the checkpoint also loads on a
# CPU-only machine.
model.load_state_dict(torch.load("AIComplaintHub.pth", map_location=device))
model.eval()

# Function to classify a new complaint
def classify_new_complaint(text):
    """Predict the product category of a single complaint narrative.

    The text is tokenized, encoded with ``bert_model``, and the [CLS] vector
    is passed to the trained classifier ``model``.

    Args:
        text: Complaint narrative as a string.

    Returns:
        The predicted category name: one of 'credit_card', 'retail_banking',
        'credit_reporting', 'mortgages_and_loans' or 'debt_collection'.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    # Place the inputs on the device bert_model lives on. Without this, the
    # call fails with a device mismatch as soon as bert_model is moved to GPU.
    inputs = inputs.to(bert_model.device)
    with torch.no_grad():
        bert_outputs = bert_model(**inputs)
        embedding = bert_outputs.last_hidden_state[:, 0, :]  # [CLS] embedding
        # The classifier may be on a different device than BERT.
        embedding = embedding.to(device)
        output = model(embedding)
        prediction = torch.argmax(output, dim=1).item()
    # Position i must be the name encoded as class id i during training.
    categories = ['credit_card', 'retail_banking', 'credit_reporting', 'mortgages_and_loans', 'debt_collection']
    return categories[prediction]

# Testing: run one sample narrative through the whole inference path.
complaint = "current loan provident funding applied refinance provident funding variable interest rate went loan applied refinance downpayment reduce depth also monthly payment locked interest rate day current payment going approximately provided income debt information provident funding provident funding play game interest rate arm went provided every information suspended loan application able reach mortgage broker always get voicemail responded email underwriter approving due ratio"
category = classify_new_complaint(text=complaint)
print(f"Predicted category: {category}")

Predicted category: mortgages_and_loans
