In [2]:
!pip install -q transformers datasets evaluate torch scikit-learn

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
import os

In [18]:
import pandas as pd

# ===============================
# 1. Load Dataset
# ===============================
OUTPUT_DIR = '/content/drive/MyDrive/model_results/consumer_complaints_sampled.csv'
df = pd.read_csv(OUTPUT_DIR)

# ===============================
# 2. Check Label Distribution
# ===============================
print("Original Label Distribution:")
print(df['Label'].value_counts())

# ===============================
# 3. Downsample to 10,000 per Label
# ===============================
# Set the desired sample size per label
SAMPLE_SIZE = 10000

# Group by label and sample
df_balanced = (
    df.groupby('Label', group_keys=False)
      .apply(lambda x: x.sample(n=SAMPLE_SIZE, random_state=42))
      .reset_index(drop=True)
)

# ===============================
# 4. Verify the New Distribution
# ===============================
print("\nAfter Downsampling:")
print(df_balanced['Label'].value_counts())

print("\n✅ New dataset shape:", df_balanced.shape)

# ===============================
# 5. (Optional) Save the Reduced Dataset
# ===============================
OUTPUT_BALANCED = '/content/drive/MyDrive/model_results/consumer_complaints_balanced_10k.csv'
df_balanced.to_csv(OUTPUT_BALANCED, index=False)
print(f"\n📁 Balanced dataset saved to: {OUTPUT_BALANCED}")


Original Label Distribution:
Label
2    50000
1    50000
3    50000
0    50000
Name: count, dtype: int64

After Downsampling:
Label
0    10000
1    10000
2    10000
3    10000
Name: count, dtype: int64

✅ New dataset shape: (40000, 3)


  .apply(lambda x: x.sample(n=SAMPLE_SIZE, random_state=42))



📁 Balanced dataset saved to: /content/drive/MyDrive/model_results/consumer_complaints_balanced_10k.csv


In [4]:
import os
import pandas as pd
OUTPUT_DIR = '/content/drive/MyDrive/model_results/consumer_complaints_balanced_10k.csv'
df=pd.read_csv(OUTPUT_DIR)
df.head()

Unnamed: 0,Consumer complaint narrative,Product,Label
0,"Dear Consumer Protection Bureau, I am writing ...",Credit reporting or other personal consumer re...,0
1,"When I reviewed my credit report, I discovered...","Credit reporting, credit repair services, or o...",0
2,I discovered that some of the information on m...,"Credit reporting, credit repair services, or o...",0
3,In accordance with the Fair Credit Reporting a...,Credit reporting or other personal consumer re...,0
4,Requesting the removal of many unlawful hard c...,Credit reporting or other personal consumer re...,0


In [5]:
df = df.rename(columns={'Consumer complaint narrative': 'text', 'Label': 'label'})
df['label'] = df['label'].astype(int)
print("Data shape:", df.shape)
df.head()

Data shape: (40000, 3)


Unnamed: 0,text,Product,label
0,"Dear Consumer Protection Bureau, I am writing ...",Credit reporting or other personal consumer re...,0
1,"When I reviewed my credit report, I discovered...","Credit reporting, credit repair services, or o...",0
2,I discovered that some of the information on m...,"Credit reporting, credit repair services, or o...",0
3,In accordance with the Fair Credit Reporting a...,Credit reporting or other personal consumer re...,0
4,Requesting the removal of many unlawful hard c...,Credit reporting or other personal consumer re...,0


In [6]:
# ===============================
# 3. Train-Test Split
# ===============================
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

# Convert to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [7]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(example):
    return tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

train_tokenized = train_dataset.map(tokenize_function, batched=True)
test_tokenized = test_dataset.map(tokenize_function, batched=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/32000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

In [8]:
num_labels = len(df['label'].unique())
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy.compute(predictions=predictions, references=labels)["accuracy"],
        "f1": f1.compute(predictions=predictions, references=labels, average="weighted")["f1"],
    }


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [10]:
training_args = TrainingArguments(
    output_dir="./bert_consumer_complaints",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
)

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [12]:
trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mshashankk[0m ([33mshashankk-amrita-vishwa-vidyapeetham[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3264,0.346649,0.8875,0.887521
2,0.2371,0.332458,0.9015,0.900717
3,0.1813,0.373059,0.903375,0.903142
4,0.1439,0.455306,0.902875,0.902555
5,0.0764,0.485815,0.905,0.904674


TrainOutput(global_step=10000, training_loss=0.21780054125785828, metrics={'train_runtime': 3821.9134, 'train_samples_per_second': 41.864, 'train_steps_per_second': 2.616, 'total_flos': 1.052463120384e+16, 'train_loss': 0.21780054125785828, 'epoch': 5.0})

In [13]:
eval_results = trainer.evaluate()
print("\n📊 Evaluation Results:")
print(eval_results)


📊 Evaluation Results:
{'eval_loss': 0.3324583172798157, 'eval_accuracy': 0.9015, 'eval_f1': 0.9007168131694209, 'eval_runtime': 57.3359, 'eval_samples_per_second': 139.529, 'eval_steps_per_second': 8.721, 'epoch': 5.0}


In [14]:
SAVE_PATH = "/content/drive/MyDrive/model_results/bert_consumer_complaints"
trainer.save_model(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)

('/content/drive/MyDrive/model_results/bert_consumer_complaints/tokenizer_config.json',
 '/content/drive/MyDrive/model_results/bert_consumer_complaints/special_tokens_map.json',
 '/content/drive/MyDrive/model_results/bert_consumer_complaints/vocab.txt',
 '/content/drive/MyDrive/model_results/bert_consumer_complaints/added_tokens.json',
 '/content/drive/MyDrive/model_results/bert_consumer_complaints/tokenizer.json')

In [16]:
# Load the trained model
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained(SAVE_PATH)
tokenizer = AutoTokenizer.from_pretrained(SAVE_PATH)

# Create a text classification pipeline
text_classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, return_all_scores=True)

# Example: Predict on new complaint text
sample_text = "I have been charged extra interest on my credit card even after paying on time."
predictions = text_classifier(sample_text)[0]

# Get the most likely label
pred_label = np.argmax([p['score'] for p in predictions])
print("\n🔮 Prediction Results:")
for p in predictions:
    print(f"Label {p['label']} → Score: {p['score']:.4f}")

print(f"\n✅ Predicted Label: {pred_label}")

Device set to use cuda:0



🔮 Prediction Results:
Label LABEL_0 → Score: 0.0019
Label LABEL_1 → Score: 0.0018
Label LABEL_2 → Score: 0.9940
Label LABEL_3 → Score: 0.0024

✅ Predicted Label: 2


