In [None]:
import pandas as pd
import numpy as np

## Bank Complaints Data

In [None]:
!wget https://github.com/venkatareddykonasani/Datasets/raw/master/Bank_Customer_Complaints/complaints_v2.zip
!unzip -o complaints_v2.zip
complaints_data = pd.read_csv("/content/complaints_v2.csv")
complaints_data.head()

--2025-04-02 15:20:43--  https://github.com/venkatareddykonasani/Datasets/raw/master/Bank_Customer_Complaints/complaints_v2.zip
Resolving github.com (github.com)... 140.82.116.4
Connecting to github.com (github.com)|140.82.116.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/venkatareddykonasani/Datasets/master/Bank_Customer_Complaints/complaints_v2.zip [following]
--2025-04-02 15:20:43--  https://raw.githubusercontent.com/venkatareddykonasani/Datasets/master/Bank_Customer_Complaints/complaints_v2.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 20228857 (19M) [application/zip]
Saving to: ‘complaints_v2.zip.1’


2025-04-02 15:20:44 (79.9 MB/s) - ‘complaints_v2.zip.1’ saved [20228857/2022885

Unnamed: 0,ID,product,text,label
0,0,credit_card,purchase order day shipping amount receive pro...,1
1,1,credit_card,forwarded message date tue subject please inve...,1
2,2,retail_banking,forwarded message cc sent friday pdt subject f...,1
3,3,credit_reporting,payment history missing credit report speciali...,0
4,4,credit_reporting,payment history missing credit report made mis...,0


## Use distilbert model without finetunung

In [None]:
# Distil bert model
from transformers import pipeline
distilbert_model = pipeline(task="text-classification",
                            model="distilbert-base-uncased",
                            )

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


In [None]:
sample_data = complaints_data.sample(100, random_state=42)

sample_data["text"] = sample_data["text"].apply(lambda x: " ".join(x.split()[:350]))

sample_data["bert_predicted"] = sample_data["text"].apply(lambda x: distilbert_model(x)[0]["label"])

#Default prediction is not a number LABEL_1, LABEL_0
sample_data["bert_predicted_num"]=sample_data["bert_predicted"].apply(lambda x: x[-1])

sample_data["bert_predicted_num"] = sample_data["bert_predicted_num"].astype(int)

sample_data.head()

Unnamed: 0,ID,product,text,label,bert_predicted,bert_predicted_num
156566,156566,mortgages_and_loans,penfed asking copy driver license finalizing l...,1,LABEL_1,1
1498,1498,credit_reporting,collection account removed credit report frank...,0,LABEL_0,0
134991,134991,credit_reporting,bureau falsely reporting alleged debt fdcpa se...,0,LABEL_0,0
56391,56391,mortgages_and_loans,va mortgage well fargo bank since meet conditi...,1,LABEL_1,1
9067,9067,credit_reporting,bank xxxxi credit card mine,0,LABEL_0,0


### Accuracy of the model without fine-tuning

In [None]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(sample_data["label"], sample_data["bert_predicted_num"])

print(cm)

accuracy=cm.diagonal().sum()/cm.sum()

print(accuracy)

[[26 21]
 [33 20]]
0.46


In [None]:
## the above it has lo accuracy, we finetune the model with our data and do it

# Project - Finetuning the model with our data

In [None]:
!pip -q install accelerate -U
!pip -q install transformers[torch]
!pip -q install datasets

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m354.7/354.7 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict, ClassLabel, Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import torch

In [None]:
#The target variable must be named as "label" - Verify it, before proceeding
print(sample_data.columns)

Index(['ID', 'product', 'text', 'label', 'bert_predicted',
       'bert_predicted_num'],
      dtype='object')


In [None]:
Sample_data = Dataset.from_pandas(sample_data) #Taken sample data

# Split the dataset into training and testing sets
train_test_split = Sample_data.train_test_split(test_size=0.2)  # 80% training, 20% testing
dataset = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})

dataset

DatasetDict({
    train: Dataset({
        features: ['ID', 'product', 'text', 'label', 'bert_predicted', 'bert_predicted_num', '__index_level_0__'],
        num_rows: 80
    })
    test: Dataset({
        features: ['ID', 'product', 'text', 'label', 'bert_predicted', 'bert_predicted_num', '__index_level_0__'],
        num_rows: 20
    })
})

##Load the tokenizer

In [13]:
# Load the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Padding--> 256 words, if comment is less than 256, it will pad, if it more it will cut down
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.add_special_tokens({'pad_token': '[PAD]'} )

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

## Load and Train the model

In [14]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',
                                                            num_labels=2,#binary classification we are doing now
                                                            pad_token_id=tokenizer.eos_token_id) # Adjust num_labels as needed
model

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (li

### Trainer
- Trainer is a complete training and evaluation loop for Transformers’ PyTorch models. Plug a model, preprocessor, dataset, and training arguments into Trainer and let it handle the rest to start training faster.

- Trainer is also powered by Accelerate, a library for handling large models for distributed training.

In [17]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [18]:
training_args = TrainingArguments(
    output_dir="./results_bert_custom",#where to save
    num_train_epochs=1,
    logging_dir="./logs_bert_custom",
    evaluation_strategy="epoch",
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
)

# Start training
trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss
1,No log,0.683712


TrainOutput(global_step=10, training_loss=0.6929358005523681, metrics={'train_runtime': 395.0314, 'train_samples_per_second': 0.203, 'train_steps_per_second': 0.025, 'total_flos': 10597391892480.0, 'train_loss': 0.6929358005523681, 'epoch': 1.0})

In [19]:
# Define the directory where you want to save your model and tokenizer
model_dir = "./distilbert_finetuned"

# Save the model
model.save_pretrained(model_dir)

# Save the tokenizer
tokenizer.save_pretrained(model_dir)

#Save the model with
trainer.save_model('Distilbert_CustomModel_10K')

In [20]:
# Define the directory where you want to save your model and tokenizer
model_dir = "./distilbert_finetuned"

# Save the model
model.save_pretrained(model_dir)

# Save the tokenizer
tokenizer.save_pretrained(model_dir)

#Save the model with
trainer.save_model('Distilbert_CustomModel_10K')